first commit
This commit is contained in:
189
app/Command/spider/VogueCommand.php
Executable file
189
app/Command/spider/VogueCommand.php
Executable file
@ -0,0 +1,189 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppBrand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
class VogueCommand extends BaseSpider
{
    /**
     * Root URL used when building request URLs for vogue.com.
     *
     * NOTE(review): presumably exposed through getBaseUrl(), which is defined
     * outside this class — confirm.
     */
    protected string $baseUrl = 'https://www.vogue.com';

    /**
     * Platform identifier stored on saved articles and used to filter brands
     * when the "onlyPlatform" option is enabled.
     */
    protected const PLATFORM = 'vogue';

    /**
     * Registers the command under the name "spider:vogue" and removes the PCRE
     * backtrack limit, because the lazy patterns used below are applied to
     * whole HTML pages.
     */
    public function __construct()
    {
        parent::__construct('spider:vogue');
        ini_set('pcre.backtrack_limit', '-1');
    }

    /**
     * Declares the command description and its options:
     *  - brandId (-b): restrict the run to a single brand id;
     *  - forceUpdate (-f): re-fetch and overwrite articles that are already stored;
     *  - onlyPlatform (-o): only process brands whose spider_origin equals PLATFORM.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集vogue.com');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id.', false);
        $this->addOption('forceUpdate', 'f', InputOption::VALUE_NEGATABLE, '是否对已经保存的数据进行强制更新.', false);
        $this->addOption('onlyPlatform', 'o', InputOption::VALUE_NEGATABLE, '是否只对当前平台品牌更新.', false);
    }

    /**
     * Entry point: stores the CLI options, then walks every selected brand
     * inside a coroutine scheduler, starting one coroutine per brand.
     *
     * @param InputInterface  $input  Parsed command-line input.
     * @param OutputInterface $output Console output (unused here).
     *
     * @return int Always 0.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());

        run(function () {
            // Upper bound on brands processed at the same time. Keeping it at
            // half of maxCo avoids the situation where every coroutine slot is
            // held by a brand that still needs to spawn child coroutines.
            $maxBrandExecuteCount = $this->maxCo / 2;
            $currentBrandExecute = 0;

            foreach ($this->_getTask() as $task) {
                $currentBrandExecute++;

                $this->createCoroutine(function () use ($task, &$currentBrandExecute) {
                    try {
                        $this->spiderStart($task);
                    } finally {
                        // Always release the slot, even when the brand fails;
                        // otherwise the throttle loop below could wait forever.
                        $currentBrandExecute--;
                    }
                });

                // Throttle: wait until the number of in-flight brands drops
                // back to the limit before starting the next one.
                while ($currentBrandExecute > $maxBrandExecuteCount) {
                    Coroutine::sleep(1);
                }
            }

            // NOTE(review): presumably a grace period for outstanding child
            // coroutines before the process is terminated — confirm.
            Coroutine::sleep(60);
            exit(0);
        });

        return 0;
    }

    /**
     * Lazily yields the brand rows to process.
     *
     * With a "brandId" option only that brand is selected. Otherwise all
     * brands with id > 1 are selected in id order, optionally limited to
     * brands whose spider_origin equals PLATFORM ("onlyPlatform" option).
     *
     * NOTE(review): the reason id 1 is excluded is not visible in this file.
     *
     * @return \Generator Brand rows read through a database cursor.
     */
    private function _getTask(): \Generator
    {
        $query = AppBrand::query();
        $brandId = $this->getCommandConfigure('brandId');
        $onlyPlatform = $this->getCommandConfigure('onlyPlatform');

        if ($brandId) {
            $query->where(['id' => $brandId]);
        } else {
            $query
                ->where('id', '>', 1)
                ->when($onlyPlatform, fn($q) => $q->where('spider_origin', static::PLATFORM))
                ->orderBy('id');
        }

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Converts a brand name into the slug used in designer URLs: "." and " "
     * become "-", "&" is removed, and the result is lower-cased.
     *
     * @param mixed $name Brand name.
     *
     * @return string The slug.
     */
    protected function getTaskName($name): string
    {
        $replacements = [
            '.' => '-',
            ' ' => '-',
            '&' => '',
        ];

        return strtolower(strtr($name, $replacements));
    }

    /**
     * Processes one brand: fetches its list of shows and starts one coroutine
     * per show to fetch and store the show detail.
     *
     * @param mixed $task Brand row; its "name" and "id" properties are read.
     */
    public function spiderStart($task): void
    {
        $brandName = $this->getTaskName($task->name);
        $url = $this->getBaseUrl() . '/fashion-shows/designer/' . $brandName;

        // Values are passed as sprintf arguments so that a "%" in the brand
        // name or URL is never interpreted as a format directive.
        $this->logger->info(sprintf('[Command] brandName: %s; spiderUrl: %s', $brandName, $url));

        // Fetch the list of shows for this designer.
        $showsList = $this->getShowsList($url);

        foreach ($showsList as $list) {
            if (!is_array($list)) {
                // getDetail() requires an array; ignore malformed entries.
                continue;
            }

            $this->createCoroutine(function () use ($task, $list) {
                $this->getDetail($task->id, $list);
            });
        }
    }

    /**
     * Fetches a designer page and returns the collections listed in its
     * embedded window.__PRELOADED_STATE__ JSON.
     *
     * @param mixed $url Designer page URL.
     *
     * @return array The "designerCollections" entries, or an empty array when
     *               the request fails or the state cannot be found/decoded.
     */
    protected function getShowsList($url)
    {
        [$body, $httpCode] = $this->request($url);

        if ((int) $httpCode !== 200) {
            $this->logger->info('未找到数据.');
            return [];
        }

        $state = $this->decodePreloadedState(
            $body,
            '/window\.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/'
        );
        $collections = $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];

        return is_array($collections) ? $collections : [];
    }

    /**
     * Fetches the slideshow page of one show and stores it as an article with
     * its image list and cover.
     *
     * Existing articles are left untouched unless "forceUpdate" is enabled.
     * Nothing is saved when the request fails or the gallery cannot be read.
     *
     * @param int   $brandId Id of the brand the show belongs to.
     * @param array $info    Collection entry; the "hed" (title) and "url"
     *                       (page path) keys are read.
     */
    protected function getDetail(int $brandId, array $info)
    {
        if (!isset($info['hed'], $info['url'])) {
            $this->logger->warning('Skipping collection entry without "hed" or "url".');
            return;
        }

        $title = $info['hed'];
        $model = $this->getArticleModel(['brand' => $brandId, 'title' => $title]);

        // Without force update, data that is already stored is not refreshed.
        if ($model->id && $this->getCommandConfigure('forceUpdate') === false) {
            return;
        }

        // Fetch the images.
        $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
        $this->logger->info("正在匹配发布会详情 {$requestUrl}");

        [$result, $httpCode] = $this->request($requestUrl);

        if ((int) $httpCode !== 200 || !$result) {
            $this->logger->warning($requestUrl . '请求失败.');
            return;
        }

        $state = $this->decodePreloadedState($result, '/window\.__PRELOADED_STATE__ = (.*?);</s');
        $images = $state['transformed']['runwayGalleries']['galleries'][0]['items'] ?? null;

        if (!is_array($images)) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        foreach ($images as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            if (is_string($src) && $src !== '') {
                // Only keep items that actually expose an "xxl" image URL.
                $saveUrl[] = ['src' => $src];
            }
        }

        $model->title = $title;
        $model->platform = static::PLATFORM;
        $model->brand = $brandId;
        $model->module = 0;
        $model->year = AppHelper::getYear($title);
        $model->source_url = $requestUrl;
        $model->images = json_encode($saveUrl);
        // The first image is the cover; null when the gallery has no usable image.
        $model->cover = $saveUrl[0]['src'] ?? null;

        $model->save();

        $this->logger->info("end: {$requestUrl}");
    }

    /**
     * Extracts and decodes the JSON captured by the first group of $pattern.
     *
     * @param mixed  $html    Page body to search.
     * @param string $pattern PCRE pattern whose first capture group is the JSON text.
     *
     * @return array|null Decoded state, or null when the body is not a string,
     *                    the pattern does not match, or the JSON does not
     *                    decode to an array.
     */
    private function decodePreloadedState($html, string $pattern): ?array
    {
        if (!is_string($html) || preg_match($pattern, $html, $found) !== 1) {
            return null;
        }

        $decoded = json_decode($found[1], true);

        return is_array($decoded) ? $decoded : null;
    }
}
|
Reference in New Issue
Block a user