first commit
This commit is contained in:
68
app/Command/CoverCommand.php
Executable file
68
app/Command/CoverCommand.php
Executable file
@ -0,0 +1,68 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppArticle;
|
||||
use Hyperf\Collection\Collection;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Command\Command as HyperfCommand;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use function Hyperf\Coroutine\co;
|
||||
|
||||
#[Command]
|
||||
/**
 * One-off maintenance command (`demo:cover`) that back-fills the `cover` and
 * `year` columns of `app_articles` rows.
 *
 * Rows are read in chunks; each row that still lacks a cover or a year is
 * processed in its own coroutine, and the chunk waits for all of them before
 * the next chunk is fetched.
 */
class CoverCommand extends HyperfCommand
{
    /** Only rows with an id greater than this value are processed. */
    private const START_AFTER_ID = 11284;

    /** Number of rows fetched (and processed concurrently) per chunk. */
    private const CHUNK_SIZE = 20;

    protected $reids;

    public function __construct(protected ContainerInterface $container)
    {
        parent::__construct('demo:cover');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('Hyperf Demo Command');
    }

    /**
     * Walks `app_articles` in id order and updates every row that is missing
     * a cover or a year.
     */
    public function handle()
    {
        $this->line('Hello Hyperf!', 'info');

        Db::table('app_articles')
            ->where('id', '>', self::START_AFTER_ID)
            ->orderBy('id')
            ->chunk(self::CHUNK_SIZE, function (Collection $rows) {
                $waitGroup = new \Hyperf\Coroutine\WaitGroup();

                foreach ($rows as $row) {
                    // Skip empty rows and rows that already have both values.
                    if (! $row || ($row->cover && $row->year)) {
                        continue;
                    }

                    $waitGroup->add();
                    co(function () use ($waitGroup, $row) {
                        try {
                            $this->backfill($row);
                        } finally {
                            // Always release the wait group, otherwise one
                            // failing row would block wait() forever.
                            $waitGroup->done();
                        }
                    });
                }

                $waitGroup->wait();
            });
    }

    /**
     * Updates the cover and year of a single article.
     *
     * The cover is the `src` of the first entry of the row's JSON `images`
     * column; it is left untouched when that column does not decode to a list
     * with a `src`. The year is the first run of digits in the title, or 0.
     *
     * @param object $row row from `app_articles` (reads `id` and `images`)
     */
    private function backfill(object $row): void
    {
        $model = AppArticle::find($row->id);
        if (! $model) {
            return;
        }

        $images = json_decode((string) $row->images, true);
        $first = is_array($images) ? reset($images) : false;
        if (is_array($first) && isset($first['src'])) {
            $model->cover = $first['src'];
        }

        preg_match('/([0-9]+)/', (string) $model->title, $y);
        $model->year = $y[1] ?? 0;

        $affected = $model->update();
        echo "update {$row->id} year: {$model->year} effect: {$affected}" . PHP_EOL;
    }
}
|
184
app/Command/FooCommand.php
Executable file
184
app/Command/FooCommand.php
Executable file
@ -0,0 +1,184 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Command\spider\VogueCommand;
|
||||
use App\FormModel\spider\ReviewModel;
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Helpers\TitleHelper;
|
||||
use App\Model\AppArticle;
|
||||
use App\Model\AppBrand;
|
||||
use App\Model\AppSpiderArticle;
|
||||
use Co\WaitGroup;
|
||||
use GuzzleHttp\RequestOptions;
|
||||
use Hyperf\Collection\Collection;
|
||||
use Hyperf\Command\Command as HyperfCommand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Context\ApplicationContext;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Laminas\Stdlib\ArrayUtils;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Spatie\Crawler\Crawler;
|
||||
use Swoole\Timer;
|
||||
use function Hyperf\Coroutine\co;
|
||||
|
||||
#[Command]
|
||||
/**
 * Maintenance command (`demo:command`) that re-translates article titles.
 *
 * Every article with id > 1 is streamed through TitleHelper::translate().
 * Articles whose current title is only a bare season word, or contains a
 * `|`, are translated from the title of their originating spider article
 * instead of from their own title.
 */
class FooCommand extends HyperfCommand
{
    /**
     * Titles that carry no usable information on their own; for these the
     * original spider title is used as the translation source.
     */
    private const PLACEHOLDER_TITLES = ['春季', ' 春季', '秋季', ' 秋季', '早秋', ' 早秋', '时装', ' 时装'];

    protected LoggerInterface $logger;

    protected $reids;

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('demo:command');
        $this->logger = $loggerFactory->get('log', 'command');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('Hyperf Demo Command');
    }

    /**
     * Streams all articles and saves a new title whenever
     * TitleHelper::translate() returns a truthy value.
     *
     * An article whose spider source row no longer exists is logged and
     * skipped instead of aborting the whole run.
     */
    public function handle()
    {
        $this->line('Hello Hyperf!', 'info');

        foreach (AppArticle::where('id', '>', 1)->cursor() as $item) {
            echo '正在同步' . $item->id . PHP_EOL;

            $currentTitle = (string) $item->title;
            $useSpiderTitle = in_array($currentTitle, self::PLACEHOLDER_TITLES, true)
                || str_contains($currentTitle, '|');

            if ($useSpiderTitle) {
                $origin = AppSpiderArticle::find($item->spider_article_id);
                if ($origin === null) {
                    $this->logger->warning("article {$item->id}: spider article {$item->spider_article_id} not found, skipped");
                    continue;
                }
                $sourceTitle = $origin->title;
            } else {
                $sourceTitle = $item->title;
            }

            if ($title = TitleHelper::translate($sourceTitle)) {
                $item->title = $title;
                $item->save();
            }
        }
    }
}
|
42
app/Command/SpiderReviewCommand.php
Executable file
42
app/Command/SpiderReviewCommand.php
Executable file
@ -0,0 +1,42 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\FormModel\spider\ReviewModel;
|
||||
use App\Model\AppArticle;
|
||||
use Hyperf\Collection\Collection;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Command\Command as HyperfCommand;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Hyperf\Coroutine\co;
|
||||
|
||||
#[Command]
|
||||
/**
 * Command `spider:review`: passes one article id to ReviewModel::pass().
 *
 * Usage: `spider:review --id=<article id>` (short form `-i`).
 */
class SpiderReviewCommand extends HyperfCommand
{
    protected $reids;

    public function __construct(protected ContainerInterface $container)
    {
        parent::__construct('spider:review');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('review spider article.');
        // The default stays `false` so a missing option can be told apart
        // from any real value in execute().
        $this->addOption('id', 'i', InputOption::VALUE_REQUIRED, '文章id', false);
    }

    /**
     * Reviews the article given by `--id`.
     *
     * @return int 0 after ReviewModel::pass() was called, 1 when `--id` is
     *             missing or empty (nothing is reviewed in that case)
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $id = $input->getOption('id');

        if ($id === false || $id === null || $id === '') {
            $output->writeln('<error>The --id option is required.</error>');
            return 1;
        }

        // NOTE(review): the value is forwarded exactly as received (a string);
        // confirm the parameter type that ReviewModel::pass() declares.
        (new ReviewModel())->pass($id);

        return 0;
    }
}
|
26
app/Command/TestClass.php
Executable file
26
app/Command/TestClass.php
Executable file
@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Http\Message\UriInterface;
|
||||
use Spatie\Crawler\CrawlObservers\CrawlObserver;
|
||||
|
||||
/**
 * Crawl observer that prints the progress of each URL to standard output.
 * Used for manual experiments with the crawler.
 */
class TestClass extends CrawlObserver
{
    /**
     * Announces a URL right before it is requested.
     */
    public function willCrawl(UriInterface $url, ?string $linkText): void
    {
        $line = "即将爬取: {$url}\n";
        echo $line;
    }

    /**
     * Dumps the response object and prints the URL with its status code.
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null
    ): void {
        var_dump($response);

        $statusCode = $response->getStatusCode();
        echo "已成功爬取: {$url} 状态码: " . $statusCode . "\n";
    }

    /**
     * Prints the URL together with the message of the exception that made
     * the request fail.
     */
    public function crawlFailed(
        UriInterface $url,
        \Throwable $exception,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null
    ): void {
        $line = "爬取失败: {$url} 错误: {$exception->getMessage()}\n";
        echo $line;
    }
}
|
171
app/Command/spider/BaseSpider.php
Executable file
171
app/Command/spider/BaseSpider.php
Executable file
@ -0,0 +1,171 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Model\AppArticle;
|
||||
use App\Model\AppSpiderArticle;
|
||||
use Hyperf\Command\Command;
|
||||
use Hyperf\Contract\StdoutLoggerInterface;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\Di\Annotation\Inject;
|
||||
use Laminas\Stdlib\ArrayUtils;
|
||||
use Swoole\Coroutine\Channel;
|
||||
use Swoole\Timer;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Hyperf\Coroutine\co;
|
||||
|
||||
/**
 * Shared base for the spider commands.
 *
 * Provides a blocking HTTP GET helper, lookup of spider article models, a
 * channel-based limiter for the number of concurrently running coroutines,
 * and storage for the parsed command-line options.
 */
class BaseSpider extends Command
{
    /**
     * Maximum number of concurrent coroutines.
     * @var int
     */
    protected int $maxCo = 10;

    /** Token pool used by createCoroutine(); created lazily by init(). */
    protected ?\Swoole\Coroutine\Channel $channel = null;

    /**
     * Base URL of the target site; subclasses override it.
     * @var string
     */
    protected string $baseUrl = '';

    #[Inject]
    protected ?StdoutLoggerInterface $logger = null;

    /** Ids of the coroutines currently running, as `cid => 1`. */
    protected array $coroutineList = [];

    /** Platform identifier; subclasses override it. */
    protected const PLATFORM = '';

    private bool $isInit = false;

    /** Id of the reporting timer started by init(), or false before that. */
    protected int|bool $timer = false;

    /** Command-line options as stored by setCommandConfigure(). */
    protected array $commandConfigure = [];

    /**
     * Creates the token pool and starts a timer that dumps the number of
     * running coroutines every 30 seconds.
     */
    private function init()
    {
        // One extra slot because the outermost parent coroutine takes one too.
        $slots = $this->maxCo + 1;
        $this->channel = new Channel($slots);

        $this->timer = Timer::tick(1000 * 30, function () {
            var_dump(count($this->coroutineList));
        });

        for ($i = 0; $i < $slots; ++$i) {
            $this->channel->push(1);
        }
    }

    public function configure()
    {
        parent::configure();
        $this->addOption('prod', '', InputOption::VALUE_NEGATABLE, '是否关闭devMode.', false);
    }

    /**
     * Returns the PLATFORM constant of the concrete (late-bound) class.
     */
    public static function getPlatform(): string
    {
        return static::PLATFORM;
    }

    /**
     * Returns the base URL without trailing slashes.
     */
    public function getBaseUrl(): string
    {
        return rtrim($this->baseUrl, '/');
    }

    /**
     * Returns the first spider article matching $condition, or a new unsaved
     * model when none matches.
     *
     * @param array $condition where-conditions passed to the query builder
     */
    protected function getArticleModel(array $condition)
    {
        return AppSpiderArticle::query()->where($condition)->first() ?: new AppSpiderArticle();
    }

    /**
     * Performs an HTTP GET request with cURL.
     *
     * Redirects are followed (at most 10) and the transfer times out after
     * 15 seconds.
     *
     * @return array `[$body, $httpCode]`; `$body` is whatever curl_exec()
     *               returned (false on failure)
     */
    protected function request(string $url): array
    {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 15,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
        ]);

        $result = curl_exec($ch);
        // The status code must be read while the handle is still open.
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        return [$result, $httpCode];
    }

    /**
     * Puts one token back into the pool.
     */
    protected function returnPool()
    {
        return $this->channel->push(1);
    }

    /**
     * Takes one token from the pool, suspending the current coroutine until
     * one is available.
     */
    protected function getPool(): bool
    {
        return $this->channel->pop();
    }

    /**
     * Runs $func in a new coroutine once a pool token is available.
     *
     * The token is returned and the coroutine is removed from
     * $coroutineList when the coroutine ends, including when $func throws.
     */
    protected function createCoroutine(\Closure $func): void
    {
        if ($this->isInit === false) {
            $this->isInit = true;
            $this->init();
        }

        $this->getPool();

        co(function () use ($func) {
            // Register from inside the coroutine: a coroutine that finishes
            // without yielding would otherwise be registered only after its
            // cleanup had already run, leaving a stale entry behind.
            $cid = Coroutine::id();
            $this->coroutineList[$cid] = 1;

            // Hyperf's wrapper is used so this does not depend on Swoole's
            // short-name functions being enabled.
            Coroutine::defer(function () use ($cid) {
                unset($this->coroutineList[$cid]);
                $this->returnPool();
            });

            $func();
        });
    }

    /**
     * Prints a timestamped debug line, but only when the stored `prod`
     * option is exactly false.
     *
     * @param array|string $message printed via print_r()
     * @param int          $level   currently unused
     */
    protected function debugPrint(array|string $message = '', $level = 0)
    {
        if ($this->getCommandConfigure('prod') === false) {
            $printTime = date('H:i:s');
            echo "[spider-debug][$printTime]" . print_r($message, true) . PHP_EOL;
        }
    }

    /**
     * Used for unit tests: calls the named method with the given arguments.
     * @param string $methodName
     * @param $args
     * @return mixed
     */
    public function testMethod(string $methodName, $args = [])
    {
        return $this->{$methodName}(...$args);
    }

    /**
     * Stores the command-line options for later lookup.
     */
    public function setCommandConfigure($options): void
    {
        $this->commandConfigure = $options;
    }

    /**
     * Returns one stored option, or all of them when $key is falsy.
     *
     * @param string|null $key          option name
     * @param mixed       $defaultValue returned when the option is not set
     */
    public function getCommandConfigure($key = null, $defaultValue = null)
    {
        if (!$key) {
            return $this->commandConfigure;
        }

        return $this->commandConfigure[$key] ?? $defaultValue;
    }

    /**
     * Stores the parsed options; subclasses call this before doing their work.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());
        return 0;
    }
}
|
187
app/Command/spider/ElleStreetCommand.php
Executable file
187
app/Command/spider/ElleStreetCommand.php
Executable file
@ -0,0 +1,187 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
/**
 * Command `spider:elle-street`: collects image URLs from the street-style
 * section of elle.com.
 *
 * The collected URLs are currently only dumped, not stored.
 */
class ElleStreetCommand extends BaseSpider
{
    /**
     * @var string
     */
    protected string $baseUrl = 'https://www.elle.com';

    protected const PLATFORM = 'elle-street';

    /** Matches the JSON-LD script block embedded in a page. */
    private const JSON_LD_PATTERN = '/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/';

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:elle-street');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('elle.com/street elle街拍模块');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id', false);
    }

    /**
     * Stores the options and runs the spider inside a coroutine container.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());

        run(function () {
            $this->spiderStart();
        });

        return 0;
    }

    /**
     * Yields rows of `app_brands` with id > 1 in id order, optionally
     * restricted to one brand id.
     */
    private function _getTask($brand): \Generator
    {
        $query = Db::table('app_brands');
        if ($brand) {
            $query->where(['id' => $brand]);
        }

        $query->where('id', '>', 1)->orderBy('id');

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Lower-cases a brand name and replaces dots and spaces with dashes.
     */
    private function _getTaskName($name): string
    {
        return strtolower(strtr($name, [
            '.' => '-',
            ' ' => '-'
        ]));
    }

    /**
     * Reads the street-style listing, then every listed article, and dumps
     * the image URLs found in the articles' JSON-LD data.
     */
    public function spiderStart(): void
    {
        $listing = $this->fetchJsonLd($this->getBaseUrl() . '/fashion/street-style/');

        if ($listing === null) {
            $this->logger->info(self::getPlatform() . " 数据获取失败。");
            return;
        }

        $articles = $listing[0]['itemListElement'] ?? [];

        if (!$articles) {
            $this->logger->info(self::getPlatform() . " 文章数据获取失败。");
            return;
        }

        $saveImages = [];
        foreach ($articles as $article) {
            if (empty($article['url'])) {
                continue;
            }

            $detail = $this->fetchJsonLd($article['url']);

            foreach ($detail['about']['itemListElement'] ?? [] as $image) {
                if (isset($image['item']['image'])) {
                    $saveImages[] = $image['item']['image'];
                }
            }
        }

        var_dump($saveImages);
    }

    /**
     * Downloads a page and decodes its first JSON-LD script block.
     *
     * @return array|null decoded data, or null when the request failed, the
     *                    page has no JSON-LD block, or the block is not a
     *                    JSON array/object
     */
    private function fetchJsonLd(string $url): ?array
    {
        [$body] = $this->request($url);

        if (!is_string($body) || preg_match(self::JSON_LD_PATTERN, $body, $match) !== 1) {
            return null;
        }

        $decoded = json_decode($match[1], true);

        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Returns the `designerCollections` list from the preloaded state of a
     * designer page, or an empty array when it cannot be read.
     */
    private function _getShowsList($url)
    {
        [$body, $httpCode] = $this->request($url);

        if ($httpCode != 200 || !is_string($body)) {
            $this->logger->info('未找到数据.');
            return [];
        }

        if (!preg_match('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $match)) {
            return [];
        }

        $val = json_decode($match[1], true);

        return $val['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
    }

    /**
     * Fetches the slideshow page of one show and stores its image URLs on
     * the spider article identified by brand and title.
     *
     * @param int   $brandId brand the show belongs to
     * @param array $info    show entry; reads the keys `hed` and `url`
     */
    private function _getDetail(int $brandId, array $info)
    {
        $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);

        $model->title = $info['hed'];
        $model->images = json_encode([]);
        $model->platform = self::getPlatform();
        // The lookup above filters on `brand`, so it has to be stored as
        // well or the same show would be inserted again on every run.
        $model->brand = $brandId;

        // Fetch the images.
        $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
        $this->logger->info("正在匹配发布会详情 {$requestUrl}");

        [$result, $httpCode] = $this->request($requestUrl);

        if ($httpCode != 200 || !$result) {
            $this->logger->warning($requestUrl . '请求失败.');
            return;
        }

        $images = false;
        if (preg_match('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $match) === 1) {
            $val = json_decode($match[1], true);
            $images = $val['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        }

        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        foreach (is_array($images) ? $images : [] as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            if ($src !== null) {
                $saveUrl[] = ['src' => $src];
            }
        }
        $model->images = json_encode($saveUrl);

        $model->save();

        $this->logger->info("end: {$requestUrl}");
    }
}
|
103
app/Command/spider/FashionSnapCommand.php
Executable file
103
app/Command/spider/FashionSnapCommand.php
Executable file
@ -0,0 +1,103 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppBrand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Swoole\ExitException;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
/**
 * Command `spider:fashionsnap`: imports articles from the fashionsnap.com
 * article API, one request per brand.
 */
class FashionSnapCommand extends BaseSpider
{
    protected const PLATFORM = 'fashionsnap';

    protected string $baseUrl = 'https://www.fashionsnap.com';

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:fashionsnap');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集fashionsnap.com');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id', false);
    }

    /**
     * Yields the brand rows to process.
     *
     * @param mixed $brand comma-separated brand ids; when falsy, all brands
     *                     whose `spider_origin` is `fashionsnap` are used
     */
    private function _getTask($brand): \Generator
    {
        $query = Db::table('app_brands');

        if ($brand) {
            $query->whereIn('id', explode(',', $brand));
        } else {
            $query->where('spider_origin', '=', 'fashionsnap')->orderBy('id');
        }

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Stores the options and imports the articles of every selected brand.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());
        $brand = $input->getOption('brandId');

        run(function () use ($brand) {
            foreach ($this->_getTask($brand) as $task) {
                $this->syncBrand($task);
            }
        });

        return 0;
    }

    /**
     * Requests the articles of one brand and saves every article that has
     * at least one gallery image.
     *
     * Brands with no results, or with more than 200, are skipped. When at
     * least one article was saved, the brand's `spider_origin` is set to
     * this platform.
     *
     * @param object $task row from `app_brands` (reads `id` and `name`)
     */
    private function syncBrand(object $task): void
    {
        // Brand names can contain spaces, '&' and similar characters, so the
        // query string has to be encoded.
        $query = http_build_query(
            ['blogIds' => 4, 'brandName' => $task->name, 'limit' => 50],
            '',
            '&',
            PHP_QUERY_RFC3986
        );
        [$body, $httpCode] = $this->request($this->getBaseUrl() . '/api/algolia/article/?' . $query);
        echo $task->name . '--' . $httpCode . PHP_EOL;

        if ($httpCode != 200) {
            return;
        }

        $payload = json_decode((string) $body, true);
        $total = is_array($payload) ? (int) ($payload['totalCount'] ?? 0) : 0;
        if ($total === 0 || $total > 200) {
            return;
        }

        $savedAny = false;
        foreach ($payload['articles'] ?? [] as $item) {
            $title = $item['mainCategory']['name'] ?? null;
            if ($title === null) {
                continue;
            }

            $model = $this->getArticleModel(['title' => $title, 'platform' => static::PLATFORM, 'brand' => $task->id]);
            $model->title = $title;
            $model->year = AppHelper::getYear($model->title);
            $model->brand = $task->id;
            $model->module = 0;
            $model->platform = self::getPlatform();

            $saveImages = [];
            foreach ($item['mainGalleryImages'] ?? [] as $image) {
                $saveImages[] = [
                    'src' => 'https://fashionsnap-assets.com/asset/width=4096' . $image
                ];
            }
            $model->images = json_encode($saveImages);
            $model->cover = $saveImages[0]['src'] ?? '';
            $model->source_url = 'https://fashionsnap.com' . ($item['permalink'] ?? '');

            // Articles without any image are not stored.
            if ($model->cover && $model->save()) {
                $savedAny = true;
            }
        }

        if ($savedAny) {
            $brandModel = AppBrand::find($task->id);
            if ($brandModel) {
                $brandModel->spider_origin = self::getPlatform();
                $brandModel->save();
            }
        }
    }
}
|
117
app/Command/spider/TheImpressionStreetCommand.php
Executable file
117
app/Command/spider/TheImpressionStreetCommand.php
Executable file
@ -0,0 +1,117 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Enums\ArticleModuleEnum;
|
||||
use App\Helpers\AppHelper;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
#[Command]
|
||||
/**
 * Command `spider:theimpression-street`: collects the street-style articles
 * of theimpression.com, both the banner entries of the landing page and the
 * entries of the paged block endpoint.
 */
class TheImpressionStreetCommand extends BaseSpider
{
    protected const PLATFORM = 'theimpression-street';

    /** Number of pages requested from the block endpoint per run. */
    private const MAX_PAGES = 1;

    public function __construct(protected ContainerInterface $container)
    {
        parent::__construct('spider:theimpression-street');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集 https://theimpression.com/street-style');
    }

    /**
     * Collects the banner articles, then the paged article list.
     *
     * @return int always 0; failed requests are only reported via debugPrint()
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        parent::execute($input, $output);

        $url = 'https://theimpression.com/street-style';
        [$res, $httpCode] = $this->request($url);

        if ($httpCode != 200) {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        // Articles linked from the banner.
        (new Crawler($res))
            ->filter('.parallax .mask-overlay')
            ->each(function (Crawler $node) {
                $href = (string) $node->attr('href');
                $text = trim((string) $node->attr('aria-label'));

                // A banner node may lack either attribute; getDetail()
                // requires a string URL, so such nodes are skipped.
                if ($href === '' || $text === '') {
                    $this->debugPrint("找不到标题或链接.");
                    return;
                }

                $this->debugPrint("标题: {$text}");
                $this->debugPrint("链接: {$href}");
                $this->getDetail($href, $text);
            });

        // Articles from the paged block endpoint (first MAX_PAGES pages).
        for ($page = 1; $page <= self::MAX_PAGES; ++$page) {
            $url = "https://theimpression.com/wp-json/codetipi-zeen/v1/block?paged={$page}&type=1&data%5Bargs%5D%5Bcat%5D=1";
            [$res, $httpCode] = $this->request($url);
            if ($httpCode != 200) {
                $this->debugPrint("{$url} 请求失败.");
                return 0;
            }

            // The HTML fragment is read from index 1 of the decoded response.
            $payload = json_decode((string) $res, true);
            $html = is_array($payload) ? ($payload[1] ?? null) : null;
            if (!is_string($html)) {
                $this->debugPrint("{$url} 响应解析失败.");
                continue;
            }

            (new Crawler($html))
                ->filter('article')
                ->each(function (Crawler $node) {
                    $href = $node->filter('.mask-img')->attr('href', '');
                    $title = $node->filter('.title-wrap')->text('');
                    if (!$href || !$title) {
                        $this->debugPrint("找不到标题或链接.");
                        return;
                    }

                    $this->getDetail($href, $title);
                });
        }

        return 0;
    }

    /**
     * Downloads one article page and stores its images on the spider article
     * identified by title and platform.
     *
     * Nothing is saved when the request fails or the page has no image
     * matching `figure a img`.
     *
     * @param string $url   article URL
     * @param mixed  $title article title
     * @return int|null 0 when the request failed, otherwise nothing
     */
    public function getDetail(string $url, $title)
    {
        $model = $this->getArticleModel(['title' => $title, 'platform' => static::getPlatform(), 'brand' => 0]);
        $model->title = $title;
        $model->platform = static::getPlatform();
        $model->module = ArticleModuleEnum::STREET->value;
        $model->year = AppHelper::getYear($title);

        [$res, $httpCode] = $this->request($url);
        $model->source_url = $url;
        if ($httpCode != 200) {
            $this->debugPrint("{$url} 请求失败.");
            return 0;
        }

        // Keyed by URL so that an image appearing twice is stored once.
        $images = [];
        (new Crawler($res))
            ->filter('figure a img')
            ->each(function ($node) use (&$images) {
                $src = $node->attr('src');
                if ($src && !isset($images[$src])) {
                    $this->debugPrint("采集图片: {$src}");
                    $images[$src] = [
                        'src' => $src
                    ];
                }
            });

        if ($images) {
            $model->cover = current($images)['src'];
            $model->images = json_encode(array_values($images));
            $model->save();
        }
    }
}
|
189
app/Command/spider/VogueCommand.php
Executable file
189
app/Command/spider/VogueCommand.php
Executable file
@ -0,0 +1,189 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command\spider;
|
||||
|
||||
use App\Helpers\AppHelper;
|
||||
use App\Model\AppBrand;
|
||||
use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\Coroutine\Coroutine;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use function Swoole\Coroutine\run;
|
||||
|
||||
#[Command]
|
||||
/**
 * Command `spider:vogue`: imports the runway shows of every brand from
 * vogue.com.
 *
 * Each brand is handled in its own coroutine, and each show of a brand in a
 * further coroutine; all of them share the coroutine pool of the base class.
 */
class VogueCommand extends BaseSpider
{
    /**
     * @var string
     */
    protected string $baseUrl = 'https://www.vogue.com';

    protected const PLATFORM = 'vogue';

    /** Number of jobs started through spawn() that have not finished yet. */
    private int $pending = 0;

    public function __construct()
    {
        parent::__construct('spider:vogue');
        // Lifts the PCRE backtrack limit for the regular expressions below.
        ini_set('pcre.backtrack_limit', '-1');
    }

    public function configure()
    {
        parent::configure();
        $this->setDescription('自动采集vogue.com');
        $this->addOption('brandId', 'b', InputOption::VALUE_OPTIONAL, '指定的品牌id.', false);
        $this->addOption('forceUpdate', 'f', InputOption::VALUE_NEGATABLE, '是否对已经保存的数据进行强制更新.', false);
        $this->addOption('onlyPlatform', 'o', InputOption::VALUE_NEGATABLE, '是否只对当前平台品牌更新.', false);
    }

    /**
     * Starts one job per brand, waits until every job has finished and then
     * terminates the process.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->setCommandConfigure($input->getOptions());

        run(function () {
            // Cap the number of brands in progress at half the pool size, so
            // that brand jobs cannot occupy every slot while each of them is
            // waiting for a slot for its own show jobs.
            $maxBrandExecuteCount = intdiv($this->maxCo, 2);
            $currentBrandExecute = 0;

            foreach ($this->_getTask() as $task) {
                ++$currentBrandExecute;

                $this->spawn(function () use ($task, &$currentBrandExecute) {
                    try {
                        $this->spiderStart($task);
                    } finally {
                        // Also release the slot when the brand fails,
                        // otherwise the throttle loop below never ends.
                        --$currentBrandExecute;
                    }
                });

                while ($currentBrandExecute > $maxBrandExecuteCount) {
                    Coroutine::sleep(1);
                }
            }

            // Wait for the work that is actually outstanding instead of a
            // fixed delay, which could cut off jobs that were still running.
            while ($this->pending > 0) {
                Coroutine::sleep(1);
            }

            exit(0);
        });

        return 0;
    }

    /**
     * Runs $job through the shared coroutine pool and counts it as pending
     * until it has finished, successfully or not.
     */
    private function spawn(\Closure $job): void
    {
        ++$this->pending;

        $this->createCoroutine(function () use ($job) {
            try {
                $job();
            } finally {
                --$this->pending;
            }
        });
    }

    /**
     * Yields the brands to process: the single brand given by `brandId`, or
     * all brands with id > 1 in id order, optionally limited to those whose
     * `spider_origin` is this platform (`onlyPlatform`).
     */
    private function _getTask(): \Generator
    {
        $query = AppBrand::query();
        $brandId = $this->getCommandConfigure('brandId');
        $onlyPlatform = $this->getCommandConfigure('onlyPlatform');

        if ($brandId) {
            $query->where(['id' => $brandId]);
        } else {
            $query->where('id', '>', 1)
                ->when($onlyPlatform, fn($q) => $q->where('spider_origin', static::PLATFORM))
                ->orderBy('id');
        }

        foreach ($query->cursor() as $row) {
            yield $row;
        }
    }

    /**
     * Turns a brand name into the slug used in designer URLs: lower case,
     * dots and spaces replaced by dashes, ampersands removed.
     */
    protected function getTaskName($name): string
    {
        return strtolower(strtr($name, [
            '.' => '-',
            ' ' => '-',
            '&' => ''
        ]));
    }

    /**
     * Loads the list of shows of one brand and starts a job for each show.
     *
     * @param object $task brand row (reads `id` and `name`)
     */
    public function spiderStart($task): void
    {
        $brandName = $this->getTaskName($task->name);
        $url = $this->getBaseUrl() . '/fashion-shows/designer/' . $brandName;
        // Logged as a plain string: the message contains arbitrary brand
        // text and must not be interpreted as a printf format.
        $this->logger->info("[Command] brandName: {$brandName}; spiderUrl: {$url}");

        // Fetch the list of shows.
        $showsList = $this->getShowsList($url);

        foreach ($showsList as $list) {
            $this->spawn(function () use ($task, $list) {
                $this->getDetail($task->id, $list);
            });
        }
    }

    /**
     * Returns the `designerCollections` list from the preloaded state of a
     * designer page, or an empty array when it cannot be read.
     */
    protected function getShowsList($url)
    {
        [$body, $httpCode] = $this->request($url);

        if ($httpCode != 200 || !is_string($body)) {
            $this->logger->info('未找到数据.');
            return [];
        }

        if (!preg_match('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $match)) {
            return [];
        }

        $val = json_decode($match[1], true);

        return $val['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
    }

    /**
     * Fetches the slideshow page of one show and stores it as a spider
     * article with its image list and cover.
     *
     * An article that already exists is left alone when the stored
     * `forceUpdate` option is exactly false. Nothing is saved when the page
     * cannot be fetched or its gallery cannot be read.
     *
     * @param int   $brandId brand the show belongs to
     * @param array $info    show entry; reads the keys `hed` and `url`
     */
    protected function getDetail(int $brandId, array $info)
    {
        $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);

        // Without force update, existing data is not touched.
        if ($model->id && $this->getCommandConfigure('forceUpdate') === false) {
            return;
        }

        $model->title = $info['hed'];
        $model->images = json_encode([]);
        $model->platform = self::PLATFORM;
        $model->brand = $brandId;
        $model->module = 0;
        $model->year = AppHelper::getYear($info['hed']);

        // Fetch the images.
        $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
        $this->logger->info("正在匹配发布会详情 {$requestUrl}");
        $model->source_url = $requestUrl;

        [$result, $httpCode] = $this->request($requestUrl);

        if ($httpCode != 200 || !$result) {
            $this->logger->warning($requestUrl . '请求失败.');
            return;
        }

        $images = false;
        if (preg_match('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $match) === 1) {
            $val = json_decode($match[1], true);
            $images = $val['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        }

        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        foreach (is_array($images) ? $images : [] as $img) {
            $src = $img['image']['sources']['xxl']['url'] ?? null;
            if ($src !== null) {
                $saveUrl[] = ['src' => $src];
            }
        }

        $model->images = json_encode($saveUrl);
        // An empty gallery has no first image to use as cover.
        if ($saveUrl) {
            $model->cover = $saveUrl[0]['src'];
        }

        $model->save();

        $this->logger->info("end: {$requestUrl}");
    }
}
|
Reference in New Issue
Block a user