Files
backend/app/Command/spider/BaiduRecordCommand.php
toom1996 1b36a808a8 update
2025-07-29 19:24:44 +08:00

300 lines
9.8 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Command\spider;
use App\Helpers\AppHelper;
use App\Model\AppNews;
use App\Model\AppNewsColumn;
use App\Model\AppNewsSecondColumn;
use App\Model\AppWebsiteConfig;
use Hyperf\Cache\Helper\StringHelper;
use Hyperf\Command\Annotation\Command;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\DomCrawler\Crawler;
use function Hyperf\Support\env;
use function Swoole\Coroutine\run;
/**
* Baidu indexing ("收录") script.
*/
#[Command]
class BaiduRecordCommand extends BaseSpider
{
/**
* Base URL used for Baidu requests (host m.baidu.com).
*
* @var string
*/
protected string $baseUrl = 'https://m.baidu.com';
// NOTE(review): 'wangyi' looks carried over from a different spider, while this class targets Baidu.
// No executable code in this class reads PLATFORM directly; the parent class may read it
// (e.g. through getPlatform()) — confirm before changing the value.
protected const PLATFORM = 'wangyi';
/**
 * Registers this command under the name `spider:baidu-record`.
 *
 * @param ContainerInterface $container     Container, promoted to a protected property.
 * @param LoggerFactory      $loggerFactory Accepted for injection; not used inside this constructor.
 */
public function __construct(
    protected ContainerInterface $container,
    LoggerFactory $loggerFactory
) {
    parent::__construct('spider:baidu-record');
}
/**
 * Applies the parent configuration and sets the command description.
 *
 * The description previously read '36kr.com', a leftover from another spider;
 * it now names what this command (`spider:baidu-record`) is for.
 */
public function configure()
{
    parent::configure();
    $this->setDescription('Baidu indexing (record) script');
}
/**
 * Console entry point: runs the synchronous `_start()` pass.
 *
 * `spiderStart()` is not invoked from here. The former coroutine `run()` wrapper
 * around it sat after an unconditional `return` and therefore never executed,
 * so it has been dropped.
 *
 * @param InputInterface  $input  Console input (unused).
 * @param OutputInterface $output Console output (unused).
 * @return int Always 0.
 */
public function execute(InputInterface $input, OutputInterface $output): int
{
    $this->_start();

    return 0;
}
/**
 * Dumps the public URL of every non-deleted news item.
 *
 * For each AppNews row with is_delete = 0 the URL is assembled as
 * `https://{app_domain}{column url}{second column url}/{news id}` and printed
 * with var_dump(). A failure on one row is dumped and does not stop the iteration.
 *
 * NOTE(review): the statements that used to follow an unconditional `return`
 * (a Baidu search prototype and a commented-out import routine) were unreachable
 * and have been removed; recover them from version control if still needed.
 *
 * @return void
 */
private function _start(): void
{
    // Lookup caches shared by all rows. They must be captured by reference:
    // a closure without `use (&...)` gets fresh locals on every call, so
    // nothing would ever be reused between rows.
    $websites = [];
    $columns = [];
    $secondColumns = [];

    // Rows fetched per query by each(); a size of 1 costs one query per news row.
    $chunkSize = 100;

    AppNews::query()->where('is_delete', 0)->each(
        function ($item) use (&$websites, &$columns, &$secondColumns) {
            try {
                if (!$item->platform) {
                    throw new \Exception("news #{$item->id}: platform is empty");
                }

                $websites[$item->platform] ??= AppWebsiteConfig::find($item->platform)?->toArray();
                $website = $websites[$item->platform];
                if ($website === null) {
                    throw new \RuntimeException("news #{$item->id}: website config {$item->platform} not found");
                }

                // Main domain, without surrounding slashes.
                $domain = trim((string) ($website['app_domain'] ?? ''), '/');

                // A missing column is cached as [] and contributes an empty path segment.
                $columns[$item->column_tag] ??= AppNewsColumn::find($item->column_tag)?->toArray() ?? [];
                $column = $columns[$item->column_tag]['url'] ?? '';

                $secondColumns[$item->second_column] ??= AppNewsSecondColumn::find($item->second_column)?->toArray() ?? [];
                $secondColumn = $secondColumns[$item->second_column]['url'] ?? '';

                var_dump("https://{$domain}{$column}{$secondColumn}/{$item->id}");
            } catch (\Throwable $exception) {
                // Best effort: report the row's problem and carry on with the next one.
                var_dump($exception->getMessage());
            }
        },
        $chunkSize
    );
}
/**
 * Lazily yields rows of the `app_brands` table ordered by id.
 *
 * Only rows with id > 1 are selected; when $brand is truthy the query is
 * additionally restricted to that id.
 *
 * @param mixed $brand Brand id to filter on; a falsy value disables the filter.
 * @return \Generator Yields one row per step from the query cursor.
 */
private function _getTask($brand): \Generator
{
    $brands = Db::table('app_brands');

    if ($brand) {
        $brands->where(['id' => $brand]);
    }

    $rows = $brands->where('id', '>', 1)->orderBy('id')->cursor();

    foreach ($rows as $record) {
        yield $record;
    }
}
/**
 * Converts a name into a lower-case, dash-separated form.
 *
 * Every '.' and every space is replaced by '-', then the result is lower-cased.
 *
 * @param mixed $name Name to convert.
 * @return string The converted name.
 */
private function _getTaskName($name): string
{
    $dashed = str_replace(['.', ' '], '-', $name);

    return strtolower($dashed);
}
/**
 * Collects image URLs from the street-style listing and dumps them.
 *
 * Flow: request `{base url}/fashion/street-style/`, read the first
 * `<script id="json-ld">` payload, take `[0]['itemListElement']` as the article
 * list, then request each article URL and gather
 * `about.itemListElement[*].item.image` from its own json-ld payload.
 * The collected list is printed with var_dump().
 *
 * Pages without a json-ld payload and entries without the expected keys are
 * skipped instead of raising warnings.
 *
 * NOTE(review): the coroutine block that followed an unconditional `return`
 * was unreachable and referenced an undefined `$task`; it has been removed.
 *
 * @return void
 */
public function spiderStart(): void
{
    $jsonLdPattern = '/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/';

    [$listHtml] = $this->request($this->getBaseUrl() . '/fashion/street-style/');
    // preg_match_all() always fills $matches with one (possibly empty) array per
    // group, so counting $matches cannot detect "no match"; test the match itself.
    if (!preg_match($jsonLdPattern, (string) $listHtml, $listMatch)) {
        $this->logger->info(self::getPlatform() . " 数据获取失败。");
        return;
    }

    $listing = json_decode($listMatch[1], true);
    $articles = $listing[0]['itemListElement'] ?? [];
    if (!is_array($articles) || !$articles) {
        $this->logger->info(self::getPlatform() . " 文章数据获取失败。");
        return;
    }

    $saveImages = [];
    foreach ($articles as $article) {
        $articleUrl = $article['url'] ?? null;
        if (!$articleUrl) {
            continue;
        }

        [$articleHtml] = $this->request($articleUrl);
        if (!preg_match($jsonLdPattern, (string) $articleHtml, $articleMatch)) {
            continue;
        }

        $detail = json_decode($articleMatch[1], true);
        $images = $detail['about']['itemListElement'] ?? [];
        if (!is_array($images)) {
            continue;
        }

        foreach ($images as $image) {
            if (isset($image['item']['image'])) {
                $saveImages[] = $image['item']['image'];
            }
        }
    }

    var_dump($saveImages);
}
/**
 * Fetches a page and returns the designer collections from its preloaded state.
 *
 * The page is expected to embed `window.__PRELOADED_STATE__ = {...};</script>`;
 * the value at `transformed.runwayDesignerContent.designerCollections` of the
 * decoded JSON is returned.
 *
 * @param mixed $url Page URL to request.
 * @return mixed The collections value, or [] when the response status is not 200
 *               or the key path is absent.
 */
private function _getShowsList($url)
{
    [$body, $status] = $this->request($url);

    if ($status != 200) {
        $this->logger->info('未找到数据.');
        return [];
    }

    preg_match_all('/window.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', $body, $found);

    // Last capture group -> its first captured string (false when nothing matched).
    $captures = end($found);
    $state = json_decode(current($captures), true);

    return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
}
/**
 * Loads the slideshow page of one show and stores its image URLs on the article model.
 *
 * The model comes from getArticleModel() keyed by brand and title. Image URLs are
 * read from `transformed.runwayGalleries.galleries[0].items[*].image.sources.xxl.url`
 * of the page's `window.__PRELOADED_STATE__` JSON and saved as a JSON list of
 * `{"src": ...}` entries.
 *
 * Nothing is saved when the request fails or the items key path is absent.
 *
 * @param int   $brandId Brand id passed to getArticleModel().
 * @param array $info    Show data; the keys 'hed' (title) and 'url' (page path) are read.
 * @return void
 */
private function _getDetail(int $brandId, array $info)
{
    $model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);
    $model->title = $info['hed'];
    $model->images = json_encode([]);
    $model->platform = self::getPlatform();

    // Fetch the slideshow page that carries the image list.
    $requestUrl = $this->getBaseUrl() . $info['url'] . '/slideshow/collection';
    $this->logger->info("正在匹配发布会详情 {$requestUrl}");

    [$html, $status] = $this->request($requestUrl);
    $requestFailed = $status != 200 || !$html;
    if ($requestFailed) {
        $this->logger->warning($requestUrl . '请求失败.');
        return;
    }

    $found = [];
    preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $html, $found);

    if (count($found) > 1) {
        // First captured payload, or false when the pattern did not match.
        $state = json_decode($found[1][0] ?? false, true);
        $items = $state['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
        if ($items === false) {
            $this->logger->warning($requestUrl . '获取图片失败.');
            return;
        }

        // array_values() keeps the result a plain list so it encodes as a JSON array.
        $sources = array_map(
            static fn ($item) => ['src' => $item['image']['sources']['xxl']['url']],
            array_values(is_array($items) ? $items : [])
        );
        $model->images = json_encode($sources);
    }

    $model->save();
    $this->logger->info("end: {$requestUrl}");
}
}