This commit is contained in:
toom1996
2025-07-29 19:24:44 +08:00
parent 9f8710f069
commit 1b36a808a8

View File

@ -0,0 +1,300 @@
<?php
namespace App\Command\spider;
use App\Helpers\AppHelper;
use App\Model\AppNews;
use App\Model\AppNewsColumn;
use App\Model\AppNewsSecondColumn;
use App\Model\AppWebsiteConfig;
use Hyperf\Cache\Helper\StringHelper;
use Hyperf\Command\Annotation\Command;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\DomCrawler\Crawler;
use function Hyperf\Support\env;
use function Swoole\Coroutine\run;
/**
 * Baidu indexing ("收录") record script.
 */
#[Command]
class BaiduRecordCommand extends BaseSpider
{
/**
 * Base URL of Baidu's mobile site; prepended to relative paths when this
 * class builds request URLs.
 *
 * @var string
 */
protected string $baseUrl = 'https://m.baidu.com';
// NOTE(review): 'wangyi' looks like a leftover from a NetEase spider — confirm
// the intended platform tag for this Baidu command before relying on it.
protected const PLATFORM = 'wangyi';
/**
 * Registers the command under the name "spider:baidu-record".
 *
 * @param ContainerInterface $container     Stored on the instance through constructor promotion.
 * @param LoggerFactory      $loggerFactory Accepted for injection; not used inside this constructor.
 */
public function __construct(
    protected ContainerInterface $container,
    LoggerFactory $loggerFactory
) {
    parent::__construct('spider:baidu-record');
}
/**
 * Applies the parent configuration and sets this command's description.
 */
public function configure()
{
    parent::configure();
    // The former description ('36kr.com') was copied from another spider and
    // mislabeled this command in the console's help output.
    $this->setDescription('Baidu index record script');
}
/**
 * Command entry point: runs the news URL pass and reports success.
 *
 * @param InputInterface  $input  Console input (not read here).
 * @param OutputInterface $output Console output (not written here).
 *
 * @return int Always 0.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $this->_start();

    // A coroutine-wrapped call to spiderStart() used to follow an unconditional
    // `return 0;` and could never run. It has been removed; wrap spiderStart()
    // in Swoole\Coroutine\run() here if that pass is needed again.
    return 0;
}
/**
 * Walks every non-deleted news record and dumps its public URL.
 *
 * The URL is assembled as
 * "https://{app_domain}{column url}{second column url}/{news id}".
 * Website and column rows are looked up once per id and then served from an
 * in-memory cache. A record whose platform, website config or columns cannot
 * be resolved is skipped and the reason is dumped instead.
 */
private function _start()
{
    // Lookup caches keyed by primary key. `null` is stored for ids that do not
    // exist, so a miss is not queried again for every record.
    $cache = ['website' => [], 'column' => [], 'column_2' => []];

    // $cache has to be captured by reference: a closure without `use (&$cache)`
    // only ever sees its own empty local variable, which defeats the cache.
    $lookup = static function (string $bucket, $id, callable $loader) use (&$cache): ?array {
        $key = (string) $id;
        if (!array_key_exists($key, $cache[$bucket])) {
            $cache[$bucket][$key] = $loader($id);
        }

        return $cache[$bucket][$key];
    };

    // Records are read in chunks of 100 instead of a chunk size of 1.
    AppNews::query()->where('is_delete', 0)->each(function ($item) use ($lookup) {
        try {
            if (!$item->platform) {
                throw new \RuntimeException("news #{$item->id}: platform is empty");
            }

            $website = $lookup(
                'website',
                $item->platform,
                static fn ($id) => AppWebsiteConfig::find($id)?->toArray()
            );
            if ($website === null) {
                throw new \RuntimeException("news #{$item->id}: website config #{$item->platform} not found");
            }
            // Primary domain, without leading/trailing slashes.
            $domain = trim((string) ($website['app_domain'] ?? ''), '/');

            $column = $lookup(
                'column',
                $item->column_tag,
                static fn ($id) => AppNewsColumn::find($id)?->toArray()
            );
            if ($column === null) {
                throw new \RuntimeException("news #{$item->id}: column #{$item->column_tag} not found");
            }

            $secondColumn = $lookup(
                'column_2',
                $item->second_column,
                static fn ($id) => AppNewsSecondColumn::find($id)?->toArray()
            );
            if ($secondColumn === null) {
                throw new \RuntimeException("news #{$item->id}: second column #{$item->second_column} not found");
            }

            $columnUrl = $column['url'] ?? '';
            $secondColumnUrl = $secondColumn['url'] ?? '';
            var_dump("https://{$domain}{$columnUrl}{$secondColumnUrl}/{$item->id}");
        } catch (\Throwable $exception) {
            // Best effort: report the problem and continue with the next record.
            var_dump($exception->getMessage());
        }
    }, 100);
}

/**
 * Debug prototype (not called anywhere yet): requests one hard-coded Baidu
 * mobile search page, then dumps the title, URL and article HTML of every
 * item listed in the page's `window.initialState` JSON.
 *
 * This code previously sat behind an unconditional `return;` inside _start().
 */
private function _dumpBaiduSearchSample(): void
{
    $searchPath = '/pu=sz%401321_480/from=0/ssid=0/s?word=https%3A%2F%2Fhegsfc.com%2Fnews%2F155';
    [$res] = $this->request($this->baseUrl . $searchPath);
    preg_match_all('/window\.initialState=(\{.*\})/', (string) $res, $matches);
    $state = json_decode($matches[1][0] ?? '', true);

    foreach ($state['information']['informationList']['itemList'] ?? [] as $entry) {
        var_dump($entry['templateMaterial']['widgetTitle']);
        $url = $this->baseUrl . '/p/' . $entry['itemId'];
        var_dump($url);
        [$html] = $this->request($url);
        // Pause between article requests.
        sleep(2);
        (new Crawler($html))->filter('.articleDetailContent')->each(function ($node) {
            var_dump($node->html());
        });
    }
}
/**
 * Lazily yields rows of the `app_brands` table with id > 1, ordered by id.
 *
 * @param mixed $brand When truthy, only the row with this id is selected
 *                     (still subject to the id > 1 condition).
 *
 * @return \Generator One table row per iteration.
 */
private function _getTask($brand): \Generator
{
    $builder = Db::table('app_brands');

    if ($brand) {
        $builder->where(['id' => $brand]);
    }

    $rows = $builder->where('id', '>', 1)->orderBy('id')->cursor();
    foreach ($rows as $record) {
        yield $record;
    }
}
/**
 * Turns a name into a lower-case slug by replacing every dot and space with
 * a hyphen.
 *
 * @param string $name Raw name.
 *
 * @return string Lower-cased name with "." and " " replaced by "-".
 */
private function _getTaskName($name): string
{
    $hyphenated = str_replace(['.', ' '], '-', $name);

    return strtolower($hyphenated);
}
/**
 * Collects image URLs from the "/fashion/street-style/" listing and dumps them.
 *
 * The listing page and every article page are expected to embed a
 * `<script id="json-ld">` block; the article URLs are read from the listing's
 * JSON-LD and the image URLs from each article's JSON-LD. Pages without such a
 * block, and entries lacking the expected keys, are skipped.
 *
 * A per-brand coroutine crawl used to follow an unconditional `return;` here.
 * It could never run and referenced an undefined `$task`, so it was removed.
 */
public function spiderStart(): void
{
    $jsonLdPattern = '/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/';

    [$result] = $this->request($this->getBaseUrl() . '/fashion/street-style/');
    preg_match_all($jsonLdPattern, (string) $result, $matches);
    // preg_match_all() always leaves arrays in $matches, so the earlier
    // `!is_array($matches) && count($matches) < 1` guard could never fire.
    // Test for the first captured group instead.
    if (!isset($matches[1][0])) {
        $this->logger->info(self::getPlatform() . " 数据获取失败。");
        return;
    }

    $val = json_decode($matches[1][0], true);
    $articles = $val[0]['itemListElement'] ?? [];
    if (!is_array($articles) || !$articles) {
        $this->logger->info(self::getPlatform() . " 文章数据获取失败。");
        return;
    }

    $saveImages = [];
    foreach ($articles as $article) {
        if (empty($article['url'])) {
            continue;
        }

        [$result] = $this->request($article['url']);
        preg_match_all($jsonLdPattern, (string) $result, $matches);
        if (!isset($matches[1][0])) {
            continue;
        }

        $val = json_decode($matches[1][0], true);
        // Tolerate articles whose JSON-LD has no image list or incomplete entries.
        $images = $val['about']['itemListElement'] ?? [];
        foreach (is_array($images) ? $images : [] as $image) {
            if (isset($image['item']['image'])) {
                $saveImages[] = $image['item']['image'];
            }
        }
    }

    var_dump($saveImages);
}
/**
 * Fetches a page and extracts its designer collections from the embedded
 * `window.__PRELOADED_STATE__` JSON.
 *
 * @param string $url Page to request.
 *
 * @return array The `transformed.runwayDesignerContent.designerCollections`
 *               value, or an empty array when the response status is not 200
 *               or the value is absent.
 */
private function _getShowsList($url)
{
    [$body, $status] = $this->request($url);

    if ($status != 200) {
        $this->logger->info('未找到数据.');
        return [];
    }

    preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $found);
    $lastGroup = end($found);
    $state = json_decode(current($lastGroup), true);

    return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
}
/**
 * Builds and saves the article for one collection, including the image list
 * read from the collection's slideshow page.
 *
 * Nothing is saved when the slideshow request fails (non-200 status or empty
 * body) or when the gallery items are missing from the page state.
 *
 * @param int   $brandId Brand id passed to getArticleModel().
 * @param array $info    Collection entry; the keys `hed` (title) and `url`
 *                       (page path) are read.
 */
private function _getDetail(int $brandId, array $info)
{
    $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);
    $model->title = $info['hed'];
    $model->images = json_encode([]);
    $model->platform = self::getPlatform();

    // Slideshow page that carries the image gallery.
    $pageUri = $info['url'];
    $requestUrl = $this->getBaseUrl() . $pageUri . '/slideshow/collection';
    $this->logger->info("正在匹配发布会详情 {$requestUrl}");

    [$result, $httpCode] = $this->request($requestUrl);
    $requestFailed = $httpCode != 200 || !$result;
    if ($requestFailed) {
        $this->logger->warning($requestUrl . '请求失败.');
        return;
    }

    $matches = [];
    preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);

    if (count($matches) > 1) {
        $state = json_decode(current($matches[1]), true);
        $images = $state['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        if ($images === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        $saveUrl = [];
        $items = is_array($images) ? $images : [];
        foreach ($items as $img) {
            $saveUrl[] = ['src' => $img['image']['sources']['xxl']['url']];
        }
        $model->images = json_encode($saveUrl);
    }

    $model->save();
    $this->logger->info("end: {$requestUrl}");
}
}