Files
backend/app/Command/spider/BaiduRecordCommand.php
toom1996 68e874a2b3 update
2025-07-30 12:24:16 +08:00

139 lines
4.6 KiB
PHP

<?php
namespace App\Command\spider;
use App\Helpers\AppHelper;
use App\Model\AppNews;
use App\Model\AppNewsColumn;
use App\Model\AppNewsSecondColumn;
use App\Model\AppWebsiteConfig;
use Hyperf\Cache\Helper\StringHelper;
use Hyperf\Command\Annotation\Command;
use Hyperf\DbConnection\Db;
use Hyperf\Logger\LoggerFactory;
use Psr\Container\ContainerInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\DomCrawler\Crawler;
use function Hyperf\Support\env;
use function Swoole\Coroutine\run;
/**
 * Baidu indexing check command.
 *
 * Walks every news row that is not deleted and not yet flagged as indexed
 * (`is_record = 0`), searches Baidu mobile for the article URL and sets
 * `is_record = 1` when a result snippet (`.abs` element) contains the
 * site's domain.
 */
#[Command]
class BaiduRecordCommand extends BaseSpider
{
    /**
     * @var string Scheme and host of the Baidu mobile search site.
     */
    protected string $baseUrl = 'https://m.baidu.com';

    // NOTE(review): not referenced anywhere in this class; 'wangyi' looks like
    // a leftover from another spider - confirm before removing.
    protected const PLATFORM = 'wangyi';

    protected LoggerInterface $log;

    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:baidu-record');
        $this->log = $loggerFactory->get('log', 'command-baidu-record');
    }

    /**
     * Registers the command description and the optional `--id` / `-i` option.
     */
    public function configure(): void
    {
        parent::configure();
        $this->setDescription('查百度收录');
        $this->addOption('id', 'i', InputOption::VALUE_OPTIONAL, '指定的新闻id.', false);
    }

    /**
     * Validates the `--id` option and runs the check.
     *
     * @return int 0 on completion, 1 when `--id` is not a non-negative integer.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        // getOption() yields false (option absent), null (`--id` given without
        // a value) or the raw string, so it must be validated before it reaches
        // the int|bool parameter of _start().
        $raw = $input->getOption('id');
        if ($raw === false) {
            $id = false;
        } elseif (is_int($raw) || (is_string($raw) && ctype_digit($raw))) {
            $id = (int) $raw;
        } else {
            $output->writeln('<error>--id 必须是数字新闻id</error>');
            return 1;
        }

        // 0 keeps its previous meaning of "no filter".
        $this->_start($id === 0 ? false : $id);
        return 0;
    }

    /**
     * Checks the pending news rows (or only the row with the given id) against Baidu.
     *
     * A failure on one row is logged and does not stop the remaining rows.
     *
     * @param int|bool $id News id to restrict the run to; a falsy value means all pending rows.
     */
    private function _start(int|bool $id = false): void
    {
        // Site domain per platform id, so each website config is loaded once per run.
        $domains = [];
        // Cookie header value carried over from the previous response.
        $requestCookie = '';

        $query = AppNews::query()->where('is_delete', 0)->where('is_record', 0);
        if ($id) {
            $query = $query->where('id', $id);
        }

        // Page by primary key: the callback flips is_record to 1, which removes
        // rows from the filtered set, and offset paging (each()/chunk()) would
        // then skip the rows that slide into the freed positions.
        $query->chunkById(100, function ($items) use (&$requestCookie, &$domains) {
            foreach ($items as $item) {
                try {
                    if (!$item->platform) {
                        throw new \Exception('没找到平台!!');
                    }
                    if (!isset($domains[$item->platform])) {
                        $config = AppWebsiteConfig::find($item->platform)?->toArray();
                        $resolved = trim((string) ($config['app_domain'] ?? ''), '/');
                        if ($resolved === '') {
                            throw new \RuntimeException('没找到平台域名: ' . $item->platform);
                        }
                        $domains[$item->platform] = $resolved;
                    }
                    // Main domain of the site the article belongs to.
                    $domain = $domains[$item->platform];

                    $url = $this->baseUrl . '/pu=sz%401321_480/from=0/ssid=0/s?word='
                        . urlencode("https://{$domain}/news/{$item->id}");
                    $this->log->info('正在处理' . $url);

                    [$res, $code, $cookie] = $this->request(
                        $url,
                        headers: $requestCookie === '' ? [] : ['Cookie' => $requestCookie],
                    );
                    if ($cookie) {
                        $requestCookie = $cookie;
                    }
                    if (!is_string($res) || $res === '') {
                        throw new \RuntimeException("请求失败, HTTP状态码: {$code}");
                    }
                    if (str_contains($res, '验证')) {
                        $this->log->warning('有验证码!!');
                    }

                    // One boolean per result snippet: does it mention our domain?
                    $hits = (new Crawler($res))->filter('.abs')->each(
                        static fn (Crawler $node): bool => stripos($node->html(), $domain) !== false
                    );
                    if (in_array(true, $hits, true)) {
                        $this->log->info('已收录');
                        $item->update(['is_record' => 1]);
                    }
                    $this->log->info('处理结束..');
                } catch (\Throwable $exception) {
                    $this->log->error($exception->getMessage());
                    $this->log->error($exception->getTraceAsString());
                }
            }
        }, 'id');
    }

    /**
     * Performs a GET request with curl, following up to 10 redirects.
     *
     * @param string $url        Absolute URL to fetch.
     * @param array  $headers    Request headers, either as a `name => value` map or as a
     *                           list of ready-made "Name: value" lines.
     * @param string $cookieFile Currently unused; kept so the signature stays compatible.
     *
     * @return array{0: string|false, 1: int, 2: string} Raw response (header blocks followed
     *               by the body, or false when the transfer failed), the HTTP status code, and
     *               the cookies set by the response joined as a Cookie header value
     *               ('' when none were set).
     */
    protected function request(string $url, array $headers = [], string $cookieFile = 'cookie.txt'): array
    {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_HEADER => true,
            CURLOPT_TIMEOUT => 15,
            // NOTE(review): TLS peer verification is disabled, which allows
            // man-in-the-middle tampering; enable it once a CA bundle is available.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => self::formatCurlHeaders($headers),
        ]);

        $result = curl_exec($ch);
        // Read everything needed from the handle before releasing it.
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $error = $result === false ? curl_error($ch) : '';
        curl_close($ch);

        if ($result === false) {
            $this->log->error('curl error: ' . $error);
            return [false, $httpCode, ''];
        }

        // Only scan the header section so body text can never be mistaken for a cookie.
        $cookies = self::collectResponseCookies(substr($result, 0, $headerSize));

        return [$result, $httpCode, $cookies];
    }

    /**
     * Converts a header map into the list of "Name: value" lines that
     * CURLOPT_HTTPHEADER expects; entries with integer keys are passed through as-is.
     *
     * @return list<string>
     */
    private static function formatCurlHeaders(array $headers): array
    {
        $lines = [];
        foreach ($headers as $name => $value) {
            $lines[] = is_int($name) ? (string) $value : "{$name}: {$value}";
        }

        return $lines;
    }

    /**
     * Extracts the `name=value` part of every Set-Cookie line in a raw header
     * block and joins them with "; ".
     */
    private static function collectResponseCookies(string $rawHeaders): string
    {
        $jar = [];
        if (preg_match_all('/^Set-Cookie:\s*([^;\r\n]+)/mi', $rawHeaders, $matches)) {
            foreach ($matches[1] as $pair) {
                $pair = trim($pair);
                $name = strstr($pair, '=', true);
                // Keyed by cookie name so a later redirect hop overrides an earlier value.
                $jar[$name === false ? $pair : $name] = $pair;
            }
        }

        return implode('; ', $jar);
    }
}