first commit
This commit is contained in:
171
app/Command/spider/BaseSpider.php
Executable file
171
app/Command/spider/BaseSpider.php
Executable file
@ -0,0 +1,171 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Model\AppArticle;
|
||||
use App\Model\AppSpiderArticle;
|
||||
use Hyperf\Command\Command;
|
||||
use Hyperf\Contract\StdoutLoggerInterface;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\Di\Annotation\Inject;
|
||||
use Laminas\Stdlib\ArrayUtils;
|
||||
use Swoole\Coroutine\Channel;
|
||||
use Swoole\Timer;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Hyperf\Coroutine\co;
|
||||
|
||||
class BaseSpider extends Command
|
||||
{
|
||||
/**
|
||||
* 最大协程数量
|
||||
* @var int
|
||||
*/
|
||||
protected int $maxCo = 10;
|
||||
|
||||
protected ?\Swoole\Coroutine\Channel $channel = null;
|
||||
|
||||
/**
|
||||
* @var string
|
||||
*/
|
||||
protected string $baseUrl = '';
|
||||
|
||||
#[Inject]
|
||||
protected ?StdoutLoggerInterface $logger = null;
|
||||
|
||||
protected array $coroutineList = [];
|
||||
|
||||
protected const PLATFORM = '';
|
||||
|
||||
private bool $isInit = false;
|
||||
|
||||
protected int|bool $timer = false;
|
||||
|
||||
protected array $commandConfigure = [];
|
||||
|
||||
/**
 * Set up the coroutine pool and the periodic debug timer.
 *
 * Creates a channel with `maxCo + 1` slots and fills it with tokens; each
 * token is taken by getPool() and handed back by returnPool(). Also starts
 * a 30-second timer that dumps the number of tracked coroutines.
 */
private function init()
{
    // One extra slot because the outermost parent coroutine exists as well.
    $capacity = $this->maxCo + 1;
    $this->channel = new Channel($capacity);

    // The original closure captured an undefined $coList by reference; nothing used it.
    $this->timer = Timer::tick(1000 * 30, function () {
        var_dump(count($this->coroutineList));
    });

    // Pre-fill the channel so that `$capacity` tokens are available.
    for ($i = 0; $i < $capacity; $i++) {
        $this->channel->push(1);
    }
}
|
||||
|
||||
/**
 * Register the options shared by every spider command.
 */
public function configure()
{
    parent::configure();

    // Negatable boolean option, defaulting to false; debugPrint() reads it.
    $this->addOption(
        'prod',
        '',
        InputOption::VALUE_NEGATABLE,
        '是否关闭devMode.',
        false
    );
}
|
||||
|
||||
/**
 * Platform identifier of the concrete spider.
 *
 * Resolved with late static binding, so a subclass only has to override
 * the PLATFORM constant.
 */
public static function getPlatform(): string
{
    $platform = static::PLATFORM;

    return $platform;
}
|
||||
|
||||
/**
 * Base URL of the target site with any trailing slashes removed.
 */
public function getBaseUrl(): string
{
    $trimmed = rtrim($this->baseUrl, '/');

    return $trimmed;
}
|
||||
|
||||
/**
 * Find the first spider article matching $condition, or return a new,
 * unsaved AppSpiderArticle when nothing matches.
 *
 * @param array $condition where-conditions passed to the query builder
 */
protected function getArticleModel(array $condition)
{
    $existing = AppSpiderArticle::query()->where($condition)->first();

    if ($existing) {
        return $existing;
    }

    return new AppSpiderArticle();
}
|
||||
|
||||
/**
 * Perform a GET request with cURL.
 *
 * Follows up to 10 redirects, accepts any supported content encoding and
 * times out after 15 seconds.
 *
 * @param string $url absolute URL to fetch
 * @return array two-element list: [response body as returned by curl_exec()
 *               (false on failure), HTTP status code reported by cURL]
 */
protected function request(string $url): array
{
    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_ENCODING => '',
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 15,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
        CURLOPT_CUSTOMREQUEST => 'GET',
    ]);

    $result = curl_exec($ch);
    // Read the status code while the handle is still open; the original
    // queried the handle after curl_close().
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return [$result, $httpCode];
}
|
||||
|
||||
/**
 * Hand a token back to the pool channel.
 *
 * @return mixed whatever Channel::push() returns
 */
protected function returnPool()
{
    $pushed = $this->channel->push(1);

    return $pushed;
}
|
||||
|
||||
/**
 * Take a token from the pool channel.
 *
 * The popped value is returned through the declared bool return type.
 */
protected function getPool(): bool
{
    $token = $this->channel->pop();

    return $token;
}
|
||||
|
||||
/**
 * Run $func in a new coroutine, limited by the token pool.
 *
 * Initialises the pool on first use, takes a token via getPool() before
 * starting the coroutine, and returns the token when the coroutine ends.
 *
 * @param \Closure $func work to execute inside the coroutine
 */
protected function createCoroutine(\Closure $func): void
{
    if ($this->isInit === false) {
        $this->isInit = true;
        $this->init();
    }

    $this->getPool();

    co(function () use ($func) {
        // Register from inside the coroutine. The original recorded the id
        // after co() returned, so a closure that finished without yielding
        // was unset first and then re-added, leaving a stale entry.
        $cid = Coroutine::id();
        $this->coroutineList[$cid] = 1;

        \Co\defer(function () use ($cid) {
            unset($this->coroutineList[$cid]);
            $this->returnPool();
        });

        $func();
    });
}
|
||||
|
||||
/**
 * Echo a timestamped debug line, but only when the `prod` option is
 * exactly false.
 *
 * @param array|string $message value rendered with print_r()
 * @param int $level currently unused
 */
protected function debugPrint(array|string $message = '', $level = 0)
{
    if ($this->getCommandConfigure('prod') !== false) {
        return;
    }

    $stamp = date('H:i:s');
    echo '[spider-debug][' . $stamp . ']' . print_r($message, true) . PHP_EOL;
}
|
||||
|
||||
/**
 * Unit-testing hook: invoke any method of this object by name.
 *
 * @param string $methodName name of the method to call
 * @param array $args arguments spread into the call
 * @return mixed whatever the invoked method returns
 */
public function testMethod(string $methodName, $args = [])
{
    $outcome = $this->{$methodName}(...$args);

    return $outcome;
}
|
||||
|
||||
/**
 * Store the parsed command options for later lookup.
 *
 * @param array $options option name => value map
 */
public function setCommandConfigure($options): void
{
    // Replaces the whole map; nothing is merged with earlier values.
    $this->commandConfigure = $options;
}
|
||||
|
||||
/**
 * Read the stored command options.
 *
 * @param string|null $key option name; a falsy key returns the whole map
 * @param mixed $defaultValue returned when the key is absent or null
 * @return mixed
 */
public function getCommandConfigure($key = null, $defaultValue = null)
{
    if ($key) {
        return $this->commandConfigure[$key] ?? $defaultValue;
    }

    return $this->commandConfigure;
}
|
||||
|
||||
/**
 * Default execution: capture the parsed options and report success.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $options = $input->getOptions();
    $this->setCommandConfigure($options);

    return 0;
}
|
||||
}
|
187
app/Command/spider/ElleStreetCommand.php
Executable file
187
app/Command/spider/ElleStreetCommand.php
Executable file
@ -0,0 +1,187 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
class ElleStreetCommand extends BaseSpider
|
||||
{
|
||||
/**
|
||||
* @var string
|
||||
*/
|
||||
protected string $baseUrl = 'https://www.elle.com';
|
||||
|
||||
protected const PLATFORM = 'elle-street';
|
||||
|
||||
/**
 * Register the command under the name `spider:elle-street`.
 *
 * $loggerFactory is accepted but not used in the constructor body.
 */
public function __construct(
    protected ContainerInterface $container,
    LoggerFactory $loggerFactory
) {
    parent::__construct('spider:elle-street');
}
|
||||
|
||||
/**
 * Add the description and the optional `brandId` (-b) option on top of the
 * shared spider options.
 */
public function configure()
{
    parent::configure();

    $this->setDescription('elle.com/street elle街拍模块');
    $this->addOption(
        'brandId',
        'b',
        InputOption::VALUE_OPTIONAL,
        '指定的品牌id',
        false
    );
}
|
||||
|
||||
/**
 * Run the spider inside a Swoole coroutine scheduler and return exit code 0.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $job = function () {
        $this->spiderStart();
    };

    run($job);

    return 0;
}
|
||||
|
||||
/**
 * Lazily yield rows of `app_brands` with id > 1, ordered by id.
 *
 * @param mixed $brand when truthy, additionally restricts to this id
 */
private function _getTask($brand): \Generator
{
    $builder = Db::table('app_brands');

    if ($brand) {
        $builder->where(['id' => $brand]);
    }

    $builder->where('id', '>', 1)->orderBy('id');

    // cursor() streams rows instead of loading the whole table.
    foreach ($builder->cursor() as $record) {
        yield $record;
    }
}
|
||||
|
||||
/**
 * Turn a brand name into a lower-case slug: dots and spaces become hyphens.
 */
private function _getTaskName($name): string
{
    $hyphenated = str_replace(['.', ' '], '-', $name);

    return strtolower($hyphenated);
}
|
||||
|
||||
/**
 * Fetch the street-style listing page, follow every article found in its
 * JSON-LD block and collect the image URLs of each article.
 *
 * The collected URLs are only dumped with var_dump(); nothing is persisted.
 * The original had unreachable code after the final `return` that
 * referenced an undefined `$task`; it has been removed.
 */
public function spiderStart(): void
{
    $jsonLdPattern = '/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/';

    [$result] = $this->request($this->getBaseUrl() . '/fashion/street-style/');

    // The original guard (`!is_array($matches) && count($matches) < 1`) could
    // never be true; check that a JSON-LD block was actually captured.
    if (!is_string($result) || preg_match($jsonLdPattern, $result, $matches) !== 1) {
        $this->logger->info(self::getPlatform() . " 数据获取失败。");
        return;
    }

    $val = json_decode($matches[1], true);
    $articles = $val[0]['itemListElement'] ?? [];

    if (!$articles || !is_array($articles)) {
        $this->logger->info(self::getPlatform() . " 文章数据获取失败。");
        return;
    }

    $saveImages = [];
    foreach ($articles as $article) {
        // Skip entries without a URL instead of requesting null.
        if (empty($article['url'])) {
            continue;
        }

        [$result] = $this->request($article['url']);

        if (!is_string($result) || preg_match($jsonLdPattern, $result, $matches) !== 1) {
            continue;
        }

        $val = json_decode($matches[1], true);
        $images = $val['about']['itemListElement'] ?? [];

        foreach (is_array($images) ? $images : [] as $image) {
            if (isset($image['item']['image'])) {
                $saveImages[] = $image['item']['image'];
            }
        }
    }

    var_dump($saveImages);
}
|
||||
|
||||
/**
 * Fetch a designer page and return the collections listed in its embedded
 * `window.__PRELOADED_STATE__` JSON.
 *
 * @param string $url designer page URL
 * @return array list of collections, or [] when the request is not a 200
 *               or the expected key is missing
 */
private function _getShowsList($url)
{
    [$body, $status] = $this->request($url);

    if ($status != 200) {
        $this->logger->info('未找到数据.');
        return [];
    }

    preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $found);

    // The last group holds the captured JSON; take its first match.
    $state = json_decode(current(end($found)), true);

    return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
}
|
||||
|
||||
|
||||
/**
 * Fetch the slideshow page of one collection and store its image URLs.
 *
 * @param int $brandId id of the brand the collection belongs to
 * @param array $info collection entry; reads the `hed` (title) and `url` keys
 */
private function _getDetail(int $brandId, array $info)
{
    $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);

    $model->title = $info['hed'];
    // The lookup above filters by brand, so the brand must be stored too;
    // otherwise the saved row never matches and each run inserts a duplicate.
    $model->brand = $brandId;
    $model->images = json_encode([]);
    $model->platform = self::getPlatform();

    // Fetch the images.
    $pageUri = $info['url'];
    $requestUrl = $this->getBaseUrl() . $pageUri . '/slideshow/collection';
    $this->logger->info("正在匹配发布会详情 {$requestUrl}");

    $matches = [];
    [$result, $httpCode] = $this->request($requestUrl);

    if ($httpCode != 200 || !$result) {
        $this->logger->warning($requestUrl . '请求失败.');
        return;
    }

    preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);

    $saveUrl = [];
    if (count($matches) > 1) {
        // An empty string decodes to null, which takes the "no images" path below.
        $val = json_decode($matches[1][0] ?? '', true);
        $images = $val['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;

        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        foreach (is_array($images) ? $images : [] as $img) {
            $saveUrl[] = [
                'src' => $img['image']['sources']['xxl']['url'],
            ];
        }
        $model->images = json_encode($saveUrl);
    }

    $model->save();

    $this->logger->info("end: {$requestUrl}");
}
|
||||
}
|
103
app/Command/spider/FashionSnapCommand.php
Executable file
103
app/Command/spider/FashionSnapCommand.php
Executable file
@ -0,0 +1,103 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppBrand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Swoole\ExitException;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
class FashionSnapCommand extends BaseSpider
|
||||
{
|
||||
protected const PLATFORM = 'fashionsnap';
|
||||
|
||||
protected string $baseUrl = 'https://www.fashionsnap.com';
|
||||
|
||||
/**
 * Register the command under the name `spider:fashionsnap`.
 *
 * $loggerFactory is accepted but not used in the constructor body.
 */
public function __construct(
    protected ContainerInterface $container,
    LoggerFactory $loggerFactory
) {
    parent::__construct('spider:fashionsnap');
}
|
||||
|
||||
/**
 * Add the description and the optional `brandId` (-b) option on top of the
 * shared spider options.
 */
public function configure()
{
    parent::configure();

    $this->setDescription('自动采集fashionsnap.com');
    $this->addOption(
        'brandId',
        'b',
        InputOption::VALUE_OPTIONAL,
        '指定的品牌id',
        false
    );
}
|
||||
|
||||
/**
 * Lazily yield the brand rows to crawl.
 *
 * @param mixed $brand comma-separated brand ids; when falsy, every brand
 *                     whose spider_origin is 'fashionsnap' is used, by id
 * @throws ExitException when the cursor produces a falsy row
 */
private function _getTask($brand): \Generator
{
    $builder = Db::table('app_brands');

    if (!$brand) {
        $builder->where('spider_origin', '=', 'fashionsnap')->orderBy('id');
    } else {
        $builder->whereIn('id', explode(',', $brand));
    }

    foreach ($builder->cursor() as $record) {
        if (!$record) {
            throw new ExitException('END.');
        }

        yield $record;
    }
}
|
||||
/**
 * Crawl the fashionsnap article API for every selected brand and store the
 * articles that have at least one gallery image.
 *
 * For each brand the API is queried by brand name (limit 50). Brands whose
 * result has a totalCount of 0 or above 200 are skipped. When the last
 * attempted save for a brand succeeded, the brand's spider_origin is set
 * to this platform.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $brand = $input->getOption('brandId');

    run(function () use ($brand) {
        foreach ($this->_getTask($brand) as $task) {
            // Encode the brand name: names with spaces, '&' or non-ASCII
            // characters would otherwise corrupt the query string.
            $query = http_build_query(
                ['blogIds' => 4, 'brandName' => $task->name, 'limit' => 50],
                '',
                '&',
                PHP_QUERY_RFC3986
            );
            [$result, $httpCode] = $this->request($this->getBaseUrl() . '/api/algolia/article/?' . $query);
            echo $task->name . '--' . $httpCode . PHP_EOL;

            if ($httpCode != 200) {
                continue;
            }

            $isSuccess = false;
            $result = json_decode((string) $result, true);
            if (!is_array($result)) {
                // Body was not a JSON object/array; nothing to import.
                continue;
            }

            $totalCount = $result['totalCount'] ?? 0;
            if ($totalCount == 0 || $totalCount > 200) {
                continue;
            }

            foreach ($result['articles'] ?? [] as $item) {
                $model = $this->getArticleModel([
                    'title' => $item['mainCategory']['name'],
                    'platform' => static::PLATFORM,
                    'brand' => $task->id,
                ]);
                $model->title = $item['mainCategory']['name'];
                $model->year = AppHelper::getYear($model->title);
                $model->brand = $task->id;
                $model->module = 0;
                $model->platform = self::getPlatform();

                $saveImages = [];
                foreach ($item['mainGalleryImages'] ?? [] as $image) {
                    $saveImages[] = [
                        'src' => 'https://fashionsnap-assets.com/asset/width=4096' . $image,
                    ];
                }
                $model->images = json_encode($saveImages);
                $model->cover = $saveImages[0]['src'] ?? '';
                // permalink
                $model->source_url = 'https://fashionsnap.com' . $item['permalink'];

                // Articles without any image are not stored.
                if ($model->cover) {
                    $isSuccess = $model->save();
                }
            }

            if ($isSuccess) {
                $brandModel = AppBrand::find($task->id);
                // The row may be gone between the cursor read and this lookup.
                if ($brandModel !== null) {
                    $brandModel->spider_origin = self::getPlatform();
                    $brandModel->save();
                }
            }
        }
    });

    return 0;
}
|
||||
}
|
117
app/Command/spider/TheImpressionStreetCommand.php
Executable file
117
app/Command/spider/TheImpressionStreetCommand.php
Executable file
@ -0,0 +1,117 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Enums\ArticleModuleEnum;
|
||||
use App\Helpers\AppHelper;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
#[Command]
|
||||
class TheImpressionStreetCommand extends BaseSpider
|
||||
{
|
||||
protected const PLATFORM = 'theimpression-street';
|
||||
|
||||
/**
 * Register the command under the name `spider:theimpression-street`.
 */
public function __construct(
    protected ContainerInterface $container
) {
    parent::__construct('spider:theimpression-street');
}
|
||||
|
||||
/**
 * Set the command description on top of the shared spider options.
 */
public function configure()
{
    parent::configure();

    $description = '自动采集 https://theimpression.com/street-style';
    $this->setDescription($description);
}
|
||||
|
||||
/**
 * Crawl theimpression.com street-style: first the banner links on the
 * landing page, then the article list returned by the block API.
 *
 * Every discovered (link, title) pair is passed to getDetail(). Always
 * returns 0, including when a request fails.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    parent::execute($input, $output);

    $url = 'https://theimpression.com/street-style';
    [$res, $httpCode] = $this->request($url);

    if ($httpCode != 200) {
        $this->debugPrint("{$url} 请求失败.");
        return 0;
    }

    // Banner images.
    (new Crawler($res))
        ->filter('.parallax .mask-overlay')
        ->each(function (Crawler $node) {
            $href = $node->attr('href');
            $text = trim((string) $node->attr('aria-label'));

            // attr() returns null for a missing attribute, and getDetail()
            // requires a string URL; skip incomplete nodes like the list below.
            if (!$href || !$text) {
                $this->debugPrint("找不到标题或链接.");
                return;
            }

            $this->debugPrint("标题: {$text}");
            $this->debugPrint("链接: {$href}");
            $this->getDetail($href, $text);
        });

    // NOTE(review): the original comment said "first fifty pages", but the
    // loop bound only fetches page 1 — confirm the intended page count.
    for ($i = 1; $i < 2; $i++) {
        $url = "https://theimpression.com/wp-json/codetipi-zeen/v1/block?paged={$i}&type=1&data%5Bargs%5D%5Bcat%5D=1";
        [$res, $httpCode] = $this->request($url);
        if ($httpCode != 200) {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        $res = json_decode((string) $res, true);
        // The HTML fragment is read from index 1 of the decoded payload.
        $html = is_array($res) ? ($res[1] ?? null) : null;
        if (!is_string($html) || $html === '') {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        (new Crawler($html))
            ->filter('article')
            ->each(function (Crawler $node) {
                $href = $node->filter('.mask-img')->attr('href', '');
                $title = $node->filter('.title-wrap')->text('');
                if (!$href || !$title) {
                    $this->debugPrint("找不到标题或链接.");
                    return;
                }

                $this->getDetail($href, $title);
            });
    }

    return 0;
}
|
||||
|
||||
|
||||
/**
 * Fetch one article page and store its distinct `figure a img` sources.
 *
 * The record is saved only when at least one image was found; the first
 * image becomes the cover.
 *
 * @param string $url article URL
 * @param string $title article title, also used to look up an existing record
 * @return int|null 0 when the request fails, otherwise nothing
 */
public function getDetail(string $url, $title)
{
    $platform = static::getPlatform();

    $model = $this->getArticleModel(['title' => $title, 'platform' => $platform, 'brand' => 0]);
    $model->title = $title;
    $model->platform = static::getPlatform();
    $model->module = ArticleModuleEnum::STREET->value;
    $model->year = AppHelper::getYear($title);

    [$body, $status] = $this->request($url);
    $model->source_url = $url;

    if ($status != 200) {
        $this->debugPrint("{$url} 请求失败.");
        return 0;
    }

    // Keyed by src so that repeated images are collected only once.
    $bySrc = [];
    $crawler = new Crawler($body);
    $crawler->filter('figure a img')->each(function ($node) use (&$bySrc) {
        $src = $node->attr('src');
        if (!$src || isset($bySrc[$src])) {
            return;
        }

        $this->debugPrint("采集图片: {$src}");
        $bySrc[$src] = ['src' => $src];
    });

    if (!$bySrc) {
        return;
    }

    $model->cover = current($bySrc)['src'];
    $model->images = json_encode(array_values($bySrc));
    $model->save();
}
|
||||
}
|
189
app/Command/spider/VogueCommand.php
Executable file
189
app/Command/spider/VogueCommand.php
Executable file
@ -0,0 +1,189 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppBrand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
class VogueCommand extends BaseSpider
|
||||
{
|
||||
/**
|
||||
* @var string
|
||||
*/
|
||||
protected string $baseUrl = 'https://www.vogue.com';
|
||||
|
||||
protected const PLATFORM = 'vogue';
|
||||
|
||||
|
||||
/**
 * Register the command as `spider:vogue` and set the PCRE backtrack limit
 * ini value to -1 for this process.
 */
public function __construct()
{
    parent::__construct('spider:vogue');

    $limit = '-1';
    ini_set('pcre.backtrack_limit', $limit);
}
|
||||
|
||||
/**
 * Add the description and the vogue-specific options (`brandId`,
 * `forceUpdate`, `onlyPlatform`) on top of the shared spider options.
 */
public function configure()
{
    parent::configure();

    $this->setDescription('自动采集vogue.com');

    $this->addOption(
        'brandId',
        'b',
        InputOption::VALUE_OPTIONAL,
        '指定的品牌id.',
        false
    );
    $this->addOption(
        'forceUpdate',
        'f',
        InputOption::VALUE_NEGATABLE,
        '是否对已经保存的数据进行强制更新.',
        false
    );
    $this->addOption(
        'onlyPlatform',
        'o',
        InputOption::VALUE_NEGATABLE,
        '是否只对当前平台品牌更新.',
        false
    );
}
|
||||
|
||||
/**
 * Crawl vogue.com for every selected brand, one pooled coroutine per brand.
 *
 * At most roughly half of the pool is used for brand-level coroutines so
 * that detail coroutines spawned by spiderStart() can still get a slot.
 * After the last brand is scheduled the scheduler sleeps 60 seconds and
 * then exits the process.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $this->setCommandConfigure($input->getOptions());

    run(function () {
        // Cap on brands in flight: if every pool slot were held by a brand
        // that has child work, no coroutine could be created for the children.
        $maxBrandExecuteCount = $this->maxCo / 2;
        $currentBrandExecute = 0;

        foreach ($this->_getTask() as $task) {
            $currentBrandExecute++;

            $this->createCoroutine(function () use ($task, &$currentBrandExecute) {
                try {
                    $this->spiderStart($task);
                } finally {
                    // Always release the slot; without this an exception in
                    // spiderStart() leaves the counter high and the wait
                    // loop below spins forever.
                    $currentBrandExecute--;
                }
            });

            // Throttle: wait until enough brand coroutines have finished.
            while ($currentBrandExecute > $maxBrandExecuteCount) {
                Coroutine::sleep(1);
            }
        }

        // NOTE(review): presumably a grace period for outstanding detail
        // coroutines before the forced exit — confirm.
        Coroutine::sleep(60);
        exit(0);
    });

    return 0;
}
|
||||
|
||||
/**
 * Lazily yield the brands to crawl, based on the stored command options.
 *
 * With `brandId` set, only that brand is selected. Otherwise every brand
 * with id > 1 is selected in id order, limited to this platform's
 * spider_origin when `onlyPlatform` is truthy.
 */
private function _getTask(): \Generator
{
    $brandId = $this->getCommandConfigure('brandId');
    $onlyPlatform = $this->getCommandConfigure('onlyPlatform');

    $builder = AppBrand::query();

    if (!$brandId) {
        $builder
            ->where('id', '>', 1)
            ->when($onlyPlatform, fn($q) => $q->where('spider_origin', static::PLATFORM))
            ->orderBy('id');
    } else {
        $builder->where(['id' => $brandId]);
    }

    foreach ($builder->cursor() as $record) {
        yield $record;
    }
}
|
||||
|
||||
/**
 * Turn a brand name into a lower-case slug: dots and spaces become
 * hyphens, ampersands are removed.
 */
protected function getTaskName($name): string
{
    $slug = str_replace(['.', ' ', '&'], ['-', '-', ''], $name);

    return strtolower($slug);
}
|
||||
|
||||
/**
 * Crawl one brand: fetch its list of shows and schedule one pooled
 * coroutine per show to fetch the details.
 *
 * @param object $task brand row; reads its `name` and `id` properties
 */
public function spiderStart($task): void
{
    $brandName = $this->getTaskName($task->name);
    $url = $this->getBaseUrl() . '/fashion-shows/designer/' . $brandName;

    // The original passed this already-interpolated string to sprintf() as
    // the format, so a '%' in the name or URL raised an error.
    $this->logger->info("[Command] brandName: {$brandName}; spiderUrl: {$url}");

    // Fetch the list of shows.
    $showsList = $this->getShowsList($url);

    foreach ($showsList as $list) {
        $this->createCoroutine(function () use ($task, $list) {
            $this->getDetail($task->id, $list);
        });
    }
}
|
||||
|
||||
/**
 * Fetch a designer page and return the collections listed in its embedded
 * `window.__PRELOADED_STATE__` JSON.
 *
 * @param string $url designer page URL
 * @return array list of collections, or [] when the request is not a 200
 *               or the expected key is missing
 */
protected function getShowsList($url)
{
    [$body, $status] = $this->request($url);

    if ($status != 200) {
        $this->logger->info('未找到数据.');
        return [];
    }

    preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $found);

    // The last group holds the captured JSON; take its first match.
    $state = json_decode(current(end($found)), true);

    return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
}
|
||||
|
||||
|
||||
/**
 * Fetch the slideshow page of one collection and store its image URLs.
 *
 * An existing record is left untouched unless the `forceUpdate` option is
 * set. Nothing is saved when the request fails or the gallery items are
 * missing from the page state.
 *
 * @param int $brandId id of the brand the collection belongs to
 * @param array $info collection entry; reads the `hed` (title) and `url` keys
 */
protected function getDetail(int $brandId, array $info)
{
    $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);

    // Without force update, already stored data is not refreshed.
    if ($model->id && $this->getCommandConfigure('forceUpdate') === false) {
        return;
    }

    $model->title = $info['hed'];
    $model->images = json_encode([]);
    $model->platform = self::PLATFORM;
    $model->brand = $brandId;
    $model->module = 0;
    $model->year = AppHelper::getYear($info['hed']);

    // Fetch the images.
    $pageUri = $info['url'];
    $requestUrl = $this->getBaseUrl() . $pageUri . '/slideshow/collection';
    $this->logger->info("正在匹配发布会详情 {$requestUrl}");
    $model->source_url = $requestUrl;

    $matches = [];
    [$result, $httpCode] = $this->request($requestUrl);

    if ($httpCode != 200 || !$result) {
        $this->logger->warning($requestUrl . '请求失败.');
        return;
    }

    preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);

    $saveUrl = [];
    if (count($matches) > 1) {
        // An empty string decodes to null, which takes the "no images" path below.
        $val = json_decode($matches[1][0] ?? '', true);
        $images = $val['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;

        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        foreach (is_array($images) ? $images : [] as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            // Skip items without an xxl rendition instead of storing a null src.
            if ($src === null) {
                continue;
            }
            $saveUrl[] = ['src' => $src];
        }

        $model->images = json_encode($saveUrl);
        // The gallery may be empty; the original read index 0 unconditionally.
        $model->cover = $saveUrl[0]['src'] ?? '';
    }

    $model->save();

    $this->logger->info("end: {$requestUrl}");
}
|
||||
}
|
Reference in New Issue
Block a user