first commit

This commit is contained in:
root
2025-06-18 10:31:43 +08:00
commit d9f820b55d
981 changed files with 449311 additions and 0 deletions

171
app/Command/spider/BaseSpider.php Executable file
View File

@ -0,0 +1,171 @@
<?php
namespace App\Command\spider;
use App\Model\AppArticle;
use App\Model\AppSpiderArticle;
use Hyperf\Command\Command;
use Hyperf\Contract\StdoutLoggerInterface;
use Hyperf\Coroutine\Coroutine;
use Hyperf\Di\Annotation\Inject;
use Laminas\Stdlib\ArrayUtils;
use Swoole\Coroutine\Channel;
use Swoole\Timer;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use function Hyperf\Coroutine\co;
/**
 * Shared base class for the spider console commands.
 *
 * Provides a blocking HTTP GET helper, a token pool that caps how many
 * coroutines started through createCoroutine() run at once, storage for the
 * parsed command-line options, and a debug printer.
 */
class BaseSpider extends Command
{
    /**
     * Maximum number of coroutines.
     *
     * @var int
     */
    protected int $maxCo = 10;

    /**
     * Token pool: a token is popped before a coroutine is started and pushed
     * back when that coroutine ends. Created lazily by init().
     */
    protected ?\Swoole\Coroutine\Channel $channel = null;

    /**
     * Root URL of the target site; see getBaseUrl().
     *
     * @var string
     */
    protected string $baseUrl = '';

    #[Inject]
    protected ?StdoutLoggerInterface $logger = null;

    /**
     * Ids of the coroutines started by createCoroutine() that are still
     * running, stored as id => 1.
     */
    protected array $coroutineList = [];

    /**
     * Platform identifier; subclasses override it.
     */
    protected const PLATFORM = '';

    /**
     * Whether init() has already been run by createCoroutine().
     */
    private bool $isInit = false;

    /**
     * Id returned by Timer::tick() for the status timer, false until init().
     */
    protected int|bool $timer = false;

    /**
     * Command-line options as stored by setCommandConfigure().
     */
    protected array $commandConfigure = [];

    /**
     * Creates the token pool and starts the periodic status timer.
     */
    private function init(): void
    {
        // There is a parent coroutine at the outermost level, so add one.
        $poolSize = $this->maxCo + 1;
        $this->channel = new Channel($poolSize);

        // Diagnostic output: every 30 seconds dump how many coroutines are
        // still registered. The timer is never cleared by this class.
        $this->timer = Timer::tick(1000 * 30, function () {
            var_dump(count($this->coroutineList));
        });

        for ($i = 0; $i < $poolSize; $i++) {
            $this->channel->push(1);
        }
    }

    /**
     * Registers the negatable "prod" option shared by all spiders.
     */
    public function configure()
    {
        parent::configure();
        $this->addOption('prod', '', InputOption::VALUE_NEGATABLE, '是否关闭devMode.', false);
    }

    /**
     * Returns the PLATFORM constant of the class the call was made on.
     */
    public static function getPlatform(): string
    {
        return static::PLATFORM;
    }

    /**
     * Returns the base URL with any trailing slashes removed.
     */
    public function getBaseUrl(): string
    {
        return rtrim($this->baseUrl, '/');
    }

    /**
     * Finds the first stored article matching $condition, or returns a new,
     * unsaved model when there is none.
     *
     * @param array $condition where-conditions passed to the query builder
     */
    protected function getArticleModel(array $condition)
    {
        return AppSpiderArticle::query()->where($condition)->first() ?: new AppSpiderArticle();
    }

    /**
     * Performs a GET request with cURL.
     *
     * @param string $url absolute URL to fetch
     * @return array [body, HTTP status code]; body is false when the transfer failed
     */
    protected function request(string $url): array
    {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 15,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
        ]);
        $result = curl_exec($ch);
        // Read the status code while the handle is still open; querying a
        // handle after curl_close() is only tolerated by PHP 8's object handles.
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        return [$result, $httpCode];
    }

    /**
     * Puts one token back into the pool.
     */
    protected function returnPool()
    {
        return $this->channel->push(1);
    }

    /**
     * Takes one token from the pool, waiting until one is available.
     *
     * @return bool false when the channel pop failed
     */
    protected function getPool(): bool
    {
        return $this->channel->pop() !== false;
    }

    /**
     * Runs $func in a new coroutine once a pool token is available.
     *
     * The token is returned and the coroutine is unregistered when the
     * coroutine ends, whether or not $func threw.
     *
     * @param \Closure $func work to execute inside the coroutine
     */
    protected function createCoroutine(\Closure $func): void
    {
        if ($this->isInit === false) {
            $this->isInit = true;
            $this->init();
        }

        $this->getPool();
        co(function () use ($func) {
            // Register from inside the coroutine: if $func finishes without
            // yielding, registering after co() returns would run after the
            // deferred cleanup and leave a stale entry behind.
            $cid = Coroutine::id();
            $this->coroutineList[$cid] = 1;
            Coroutine::defer(function () use ($cid) {
                unset($this->coroutineList[$cid]);
                $this->returnPool();
            });
            $func();
        });
    }

    /**
     * Echoes a timestamped debug line, but only when the stored "prod"
     * option is exactly false.
     *
     * @param array|string $message value rendered with print_r()
     * @param int $level currently unused
     */
    protected function debugPrint(array|string $message = '', $level = 0)
    {
        if ($this->getCommandConfigure('prod') !== false) {
            return;
        }

        $printTime = date('H:i:s');
        echo "[spider-debug][$printTime]" . print_r($message, true) . PHP_EOL;
    }

    /**
     * Used for unit tests: calls the named method with the given arguments.
     *
     * @param string $methodName
     * @param $args
     * @return mixed
     */
    public function testMethod(string $methodName, $args = [])
    {
        return $this->{$methodName}(...$args);
    }

    /**
     * Stores the command-line options for later lookup.
     */
    public function setCommandConfigure($options): void
    {
        $this->commandConfigure = $options;
    }

    /**
     * Returns one stored option, or all of them when $key is falsy.
     *
     * @param mixed $key option name
     * @param mixed $defaultValue returned when the option is not set
     */
    public function getCommandConfigure($key = null, $defaultValue = null)
    {
        if (!$key) {
            return $this->commandConfigure;
        }

        return $this->commandConfigure[$key] ?? $defaultValue;
    }

    /**
     * Default entry point: only records the parsed options.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());

        return 0;
    }
}

View File

@ -0,0 +1,187 @@
<?php
namespace App\Command\spider;
use Hyperf\Command\Annotation\Command;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use function Swoole\Coroutine\run;
/**
 * Console command "spider:elle-street": collects image URLs from the
 * street-style section of elle.com.
 *
 * spiderStart() currently only dumps the collected URLs. The private helpers
 * _getTask(), _getTaskName(), _getShowsList() and _getDetail() are not called
 * by any reachable code in this class.
 */
#[Command]
class ElleStreetCommand extends BaseSpider
{
    /**
     * @var string
     */
    protected string $baseUrl = 'https://www.elle.com';

    protected const PLATFORM = 'elle-street';

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:elle-street');
    }

    /**
     * Sets the description and adds the optional "brandId" option.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('elle.com/street elle街拍模块');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id', false);
    }

    /**
     * Stores the options and runs the spider inside a coroutine container.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        // Keep the parsed options available to getCommandConfigure()/debugPrint().
        $this->setCommandConfigure($input->getOptions());
        run(function () {
            $this->spiderStart();
        });

        return 0;
    }

    /**
     * Yields rows of app_brands with id > 1 ordered by id, optionally
     * narrowed to a single brand id.
     *
     * @param mixed $brand brand id, or a falsy value for all brands
     */
    private function _getTask($brand): \Generator
    {
        $query = Db::table('app_brands');
        if ($brand) {
            $query->where(['id' => $brand]);
        }
        $query->where('id', '>', 1)->orderBy('id');

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Lower-cases a brand name and replaces dots and spaces with hyphens.
     */
    private function _getTaskName($name): string
    {
        return strtolower(strtr($name, [
            '.' => '-',
            ' ' => '-',
        ]));
    }

    /**
     * Returns the raw text of the first JSON-LD script block in $html, or
     * null when $html is not a string or contains no such block.
     *
     * @param mixed $html response body as returned by request()
     */
    private function _matchJsonLd($html): ?string
    {
        if (!is_string($html)) {
            return null;
        }
        preg_match_all('/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/', $html, $matches);

        return $matches[1][0] ?? null;
    }

    /**
     * Reads the street-style listing page, follows every listed article and
     * dumps the image URLs found in the articles' JSON-LD data.
     */
    public function spiderStart(): void
    {
        [$listHtml] = $this->request($this->getBaseUrl() . '/fashion/street-style/');
        $listJson = $this->_matchJsonLd($listHtml);
        if ($listJson === null) {
            $this->logger->info(static::getPlatform() . " 数据获取失败。");
            return;
        }

        $listing = json_decode($listJson, true);
        $articles = $listing[0]['itemListElement'] ?? [];
        if (!is_array($articles) || !$articles) {
            $this->logger->info(static::getPlatform() . " 文章数据获取失败。");
            return;
        }

        $saveImages = [];
        foreach ($articles as $article) {
            $articleUrl = $article['url'] ?? null;
            if (!is_string($articleUrl) || $articleUrl === '') {
                continue;
            }

            [$articleHtml] = $this->request($articleUrl);
            $articleJson = $this->_matchJsonLd($articleHtml);
            if ($articleJson === null) {
                continue;
            }

            $detail = json_decode($articleJson, true);
            $images = $detail['about']['itemListElement'] ?? [];
            foreach (is_array($images) ? $images : [] as $image) {
                if (isset($image['item']['image'])) {
                    $saveImages[] = $image['item']['image'];
                }
            }
        }

        // The collected URLs are only dumped; nothing is persisted here.
        var_dump($saveImages);
    }

    /**
     * Fetches a designer page and returns the "designerCollections" list
     * from its preloaded state, or an empty array when unavailable.
     */
    private function _getShowsList($url)
    {
        [$body, $httpCode] = $this->request($url);
        if ($httpCode != 200) {
            $this->logger->info('未找到数据.');
            return [];
        }

        preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', (string) $body, $matches);
        $state = json_decode($matches[1][0] ?? '', true);

        return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
    }

    /**
     * Fetches the slideshow page of one collection and saves its image URLs
     * on the matching article model.
     *
     * @param int $brandId brand the collection belongs to
     * @param array $info collection entry; the keys "hed" and "url" are read
     */
    private function _getDetail(int $brandId, array $info)
    {
        $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);
        $model->title = $info['hed'];
        $model->images = json_encode([]);
        $model->platform = static::getPlatform();
        // The lookup above filters on brand, so store it as well; otherwise a
        // newly created record could never be found again by this lookup.
        $model->brand = $brandId;

        // Fetch the images.
        $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
        $this->logger->info("正在匹配发布会详情 {$requestUrl}");

        [$result, $httpCode] = $this->request($requestUrl);
        if ($httpCode != 200 || !$result) {
            $this->logger->warning($requestUrl . '请求失败.');
            return;
        }

        preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);
        $state = json_decode($matches[1][0] ?? '', true);
        $images = $state['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        foreach (is_array($images) ? $images : [] as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            if ($src !== null) {
                $saveUrl[] = ['src' => $src];
            }
        }
        $model->images = json_encode($saveUrl);

        $model->save();
        $this->logger->info("end: {$requestUrl}");
    }
}

View File

@ -0,0 +1,103 @@
<?php
namespace App\Command\spider;
use App\Helpers\AppHelper;
use App\Model\AppBrand;
use Hyperf\Command\Annotation\Command;
use Hyperf\Coroutine\Coroutine;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Swoole\ExitException;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use function Swoole\Coroutine\run;
/**
 * Console command "spider:fashionsnap": for each selected brand, queries the
 * fashionsnap.com article API and stores the returned articles.
 */
#[Command]
class FashionSnapCommand extends BaseSpider
{
    protected const PLATFORM = 'fashionsnap';

    protected string $baseUrl = 'https://www.fashionsnap.com';

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:fashionsnap');
    }

    /**
     * Sets the description and adds the optional "brandId" option.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集fashionsnap.com');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id', false);
    }

    /**
     * Yields the brand rows to crawl.
     *
     * @param mixed $brand comma-separated brand ids; when falsy, every brand
     *                     whose spider_origin is "fashionsnap" is used
     */
    private function _getTask($brand): \Generator
    {
        $query = Db::table('app_brands');
        if ($brand) {
            $query->whereIn('id', explode(',', $brand));
        } else {
            $query->where('spider_origin', '=', 'fashionsnap')->orderBy('id');
        }

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Stores the options and crawls every selected brand in turn.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        // Keep the parsed options available to getCommandConfigure()/debugPrint().
        $this->setCommandConfigure($input->getOptions());
        $brand = $input->getOption('brandId');

        run(function () use ($brand) {
            foreach ($this->_getTask($brand) as $task) {
                $this->_crawlBrand($task);
            }
        });

        return 0;
    }

    /**
     * Queries the article API for one brand, stores its articles and, when
     * at least one article was saved, marks the brand with this platform.
     *
     * @param object $task brand row; the properties "id" and "name" are read
     */
    private function _crawlBrand($task): void
    {
        // Encode the name: brand names may contain spaces or "&", which
        // would otherwise corrupt the query string.
        $url = $this->getBaseUrl()
            . '/api/algolia/article/?blogIds=4&brandName=' . rawurlencode((string) $task->name)
            . '&limit=50';
        [$body, $httpCode] = $this->request($url);
        echo $task->name . '--' . $httpCode . PHP_EOL;
        if ($httpCode != 200) {
            return;
        }

        $payload = json_decode((string) $body, true);
        if (!is_array($payload)) {
            return;
        }

        // Brands with no results, or with more than 200, are skipped.
        $totalCount = $payload['totalCount'] ?? 0;
        if ($totalCount == 0 || $totalCount > 200) {
            return;
        }

        $savedAny = false;
        $articles = $payload['articles'] ?? [];
        foreach (is_array($articles) ? $articles : [] as $item) {
            if (is_array($item) && $this->_saveArticle($task, $item)) {
                $savedAny = true;
            }
        }
        if (!$savedAny) {
            return;
        }

        $brandModel = AppBrand::find($task->id);
        if ($brandModel !== null) {
            $brandModel->spider_origin = static::getPlatform();
            $brandModel->save();
        }
    }

    /**
     * Creates or updates the article model for one API item.
     *
     * @param object $task brand row the item belongs to
     * @param array $item one entry of the API's "articles" list
     * @return bool true when the article was saved; items without a title
     *              or without any gallery image are not saved
     */
    private function _saveArticle($task, array $item): bool
    {
        $title = $item['mainCategory']['name'] ?? null;
        if (!is_string($title) || $title === '') {
            return false;
        }

        $model = $this->getArticleModel([
            'title' => $title,
            'platform' => static::PLATFORM,
            'brand' => $task->id,
        ]);
        $model->title = $title;
        $model->year = AppHelper::getYear($model->title);
        $model->brand = $task->id;
        $model->module = 0;
        $model->platform = static::getPlatform();

        $saveImages = [];
        $gallery = $item['mainGalleryImages'] ?? [];
        foreach (is_array($gallery) ? $gallery : [] as $image) {
            $saveImages[] = [
                'src' => 'https://fashionsnap-assets.com/asset/width=4096' . $image,
            ];
        }
        $model->images = json_encode($saveImages);
        $model->cover = $saveImages[0]['src'] ?? '';
        // permalink
        $model->source_url = 'https://fashionsnap.com' . ($item['permalink'] ?? '');

        if (!$model->cover) {
            return false;
        }

        return (bool) $model->save();
    }
}

View File

@ -0,0 +1,117 @@
<?php
namespace App\Command\spider;
use App\Enums\ArticleModuleEnum;
use App\Helpers\AppHelper;
use Hyperf\Command\Annotation\Command;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Console command "spider:theimpression-street": collects street-style
 * articles and their images from theimpression.com.
 */
#[Command]
class TheImpressionStreetCommand extends BaseSpider
{
    protected const PLATFORM = 'theimpression-street';

    /**
     * Number of listing pages fetched from the block API.
     *
     * NOTE(review): the original inline comment said "first fifty pages",
     * but the loop only ever fetched page 1; the value is kept at 1 — confirm
     * the intended page count.
     */
    protected const MAX_PAGES = 1;

    public function __construct(protected ContainerInterface $container)
    {
        parent::__construct('spider:theimpression-street');
    }

    /**
     * Sets the command description.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集 https://theimpression.com/street-style');
    }

    /**
     * Crawls the banner links of the street-style page, then the article
     * listing pages, handing every article to getDetail().
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        parent::execute($input, $output);

        $url = 'https://theimpression.com/street-style';
        [$res, $httpCode] = $this->request($url);
        if ($httpCode != 200) {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        // Articles linked from the banner.
        (new Crawler($res))
            ->filter('.parallax .mask-overlay')
            ->each(function (Crawler $node) {
                // attr() returns null for a missing attribute; getDetail()
                // requires a string URL, so normalise and skip empty values.
                $href = (string) $node->attr('href');
                $text = trim((string) $node->attr('aria-label'));
                if ($href === '' || $text === '') {
                    $this->debugPrint("找不到标题或链接.");
                    return;
                }
                $this->debugPrint("标题: {$text}");
                $this->debugPrint("链接: {$href}");
                $this->getDetail($href, $text);
            });

        // Articles from the paged listing API.
        for ($page = 1; $page <= static::MAX_PAGES; $page++) {
            $url = "https://theimpression.com/wp-json/codetipi-zeen/v1/block?paged={$page}&type=1&data%5Bargs%5D%5Bcat%5D=1";
            [$res, $httpCode] = $this->request($url);
            if ($httpCode != 200) {
                $this->debugPrint("{$url} 请求失败.");
                return 0;
            }

            // The HTML fragment is read from index 1 of the decoded payload.
            $payload = json_decode((string) $res, true);
            $html = is_array($payload) ? ($payload[1] ?? null) : null;
            if (!is_string($html)) {
                $this->debugPrint("{$url} 请求失败.");
                continue;
            }

            (new Crawler($html))
                ->filter('article')
                ->each(function (Crawler $node) {
                    $href = $node->filter('.mask-img')->attr('href', '');
                    $title = $node->filter('.title-wrap')->text('');
                    if (!$href || !$title) {
                        $this->debugPrint("找不到标题或链接.");
                        return;
                    }
                    $this->getDetail($href, $title);
                });
        }

        return 0;
    }

    /**
     * Fetches one article page and stores the images found inside
     * "figure a img" elements, de-duplicated by their src attribute.
     *
     * The article is saved only when at least one image was found.
     *
     * @param string $url article URL
     * @param mixed $title article title
     * @return int|null 0 when the request failed, otherwise nothing
     */
    public function getDetail(string $url, $title)
    {
        $platform = static::getPlatform();
        $model = $this->getArticleModel(['title' => $title, 'platform' => $platform, 'brand' => 0]);
        $model->title = $title;
        $model->platform = $platform;
        $model->module = ArticleModuleEnum::STREET->value;
        $model->year = AppHelper::getYear($title);
        $model->source_url = $url;

        [$res, $httpCode] = $this->request($url);
        if ($httpCode != 200) {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        $images = [];
        (new Crawler($res))
            ->filter('figure a img')
            ->each(function (Crawler $node) use (&$images) {
                $src = $node->attr('src');
                if (!$src || isset($images[$src])) {
                    return;
                }
                $this->debugPrint("采集图片: {$src}");
                $images[$src] = ['src' => $src];
            });

        if ($images) {
            $model->cover = current($images)['src'];
            $model->images = json_encode(array_values($images));
            $model->save();
        }
    }
}

View File

@ -0,0 +1,189 @@
<?php
namespace App\Command\spider;
use App\Helpers\AppHelper;
use App\Model\AppBrand;
use Hyperf\Command\Annotation\Command;
use Hyperf\Coroutine\Coroutine;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use function Swoole\Coroutine\run;
/**
 * Console command "spider:vogue": for every selected brand, reads the
 * designer page on vogue.com and stores each listed collection with the
 * image URLs of its slideshow.
 */
#[Command]
class VogueCommand extends BaseSpider
{
    /**
     * @var string
     */
    protected string $baseUrl = 'https://www.vogue.com';

    protected const PLATFORM = 'vogue';

    public function __construct()
    {
        parent::__construct('spider:vogue');
        // Remove the PCRE backtrack limit for the preloaded-state regexes below.
        ini_set('pcre.backtrack_limit', '-1');
    }

    /**
     * Sets the description and adds the brandId, forceUpdate and
     * onlyPlatform options.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集vogue.com');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id.', false);
        $this->addOption('forceUpdate', 'f', InputOption::VALUE_NEGATABLE, '是否对已经保存的数据进行强制更新.', false);
        $this->addOption('onlyPlatform', 'o', InputOption::VALUE_NEGATABLE, '是否只对当前平台品牌更新.', false);
    }

    /**
     * Starts one coroutine per brand, throttled so that brand coroutines
     * never occupy the whole coroutine pool.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());

        run(function () {
            // Cap the number of brands processed at once, so that brand
            // coroutines cannot take every pool slot while each of them is
            // waiting to start its own child coroutines.
            $maxBrandExecuteCount = $this->maxCo / 2;
            $currentBrandExecute = 0;

            foreach ($this->_getTask() as $task) {
                $currentBrandExecute++;
                $this->createCoroutine(function () use ($task, &$currentBrandExecute) {
                    try {
                        $this->spiderStart($task);
                    } finally {
                        // Always release the slot; otherwise one failing
                        // brand would stall the wait loop below forever.
                        $currentBrandExecute--;
                    }
                });

                while ($currentBrandExecute > $maxBrandExecuteCount) {
                    Coroutine::sleep(1);
                }
            }

            // Wait 60 seconds, then terminate explicitly.
            // NOTE(review): presumably a grace period for detail coroutines
            // still running, with exit() needed because BaseSpider's status
            // timer is never cleared — confirm.
            Coroutine::sleep(60);
            exit(0);
        });

        return 0;
    }

    /**
     * Yields the brand models to crawl, based on the "brandId" and
     * "onlyPlatform" options.
     */
    private function _getTask(): \Generator
    {
        $query = AppBrand::query();
        $brandId = $this->getCommandConfigure('brandId');
        $onlyPlatform = $this->getCommandConfigure('onlyPlatform');

        if ($brandId) {
            $query->where(['id' => $brandId]);
        } else {
            $query->where('id', '>', 1)
                ->when($onlyPlatform, fn($q) => $q->where('spider_origin', static::PLATFORM))
                ->orderBy('id');
        }

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Lower-cases a brand name, replaces dots and spaces with hyphens and
     * removes ampersands.
     */
    protected function getTaskName($name): string
    {
        return strtolower(strtr($name, [
            '.' => '-',
            ' ' => '-',
            '&' => '',
        ]));
    }

    /**
     * Reads the collection list of one brand and starts a coroutine per
     * collection to fetch its details.
     *
     * @param object $task brand model; the properties "id" and "name" are read
     */
    public function spiderStart($task): void
    {
        $brandName = $this->getTaskName($task->name);
        $url = $this->getBaseUrl() . '/fashion-shows/designer/' . $brandName;
        // Pass the values as sprintf arguments so that a "%" in the name or
        // URL is not interpreted as a format specifier.
        $this->logger->info(sprintf('[Command] brandName: %s; spiderUrl: %s', $brandName, $url));

        // Fetch the list of shows.
        foreach ($this->getShowsList($url) as $list) {
            $this->createCoroutine(function () use ($task, $list) {
                $this->getDetail($task->id, $list);
            });
        }
    }

    /**
     * Fetches a designer page and returns the "designerCollections" list
     * from its preloaded state, or an empty array when unavailable.
     */
    protected function getShowsList($url)
    {
        [$body, $httpCode] = $this->request($url);
        if ($httpCode != 200) {
            $this->logger->info('未找到数据.');
            return [];
        }

        preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', (string) $body, $matches);
        $state = json_decode($matches[1][0] ?? '', true);

        return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
    }

    /**
     * Fetches the slideshow page of one collection and saves the article
     * with its image URLs and cover.
     *
     * An already stored article is left untouched unless the "forceUpdate"
     * option is set.
     *
     * @param int $brandId brand the collection belongs to
     * @param array $info collection entry; the keys "hed" and "url" are read
     */
    protected function getDetail(int $brandId, array $info)
    {
        // NOTE(review): the lookup does not filter on platform, so an article
        // of another platform with the same brand and title would match — confirm
        // whether that is intended.
        $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);

        // Without force update, existing data is not updated.
        if ($model->id && $this->getCommandConfigure('forceUpdate') === false) {
            return;
        }

        $model->title = $info['hed'];
        $model->images = json_encode([]);
        $model->platform = static::PLATFORM;
        $model->brand = $brandId;
        $model->module = 0;
        $model->year = AppHelper::getYear($info['hed']);

        // Fetch the images.
        $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
        $this->logger->info("正在匹配发布会详情 {$requestUrl}");
        $model->source_url = $requestUrl;

        [$result, $httpCode] = $this->request($requestUrl);
        if ($httpCode != 200 || !$result) {
            $this->logger->warning($requestUrl . '请求失败.');
            return;
        }

        preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);
        $state = json_decode($matches[1][0] ?? '', true);
        $images = $state['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        foreach (is_array($images) ? $images : [] as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            if ($src !== null) {
                $saveUrl[] = ['src' => $src];
            }
        }
        $model->images = json_encode($saveUrl);
        // An empty gallery yields an empty cover instead of an undefined offset.
        $model->cover = $saveUrl[0]['src'] ?? '';

        $model->save();
        $this->logger->info("end: {$requestUrl}");
    }
}