150 lines
4.8 KiB
PHP
150 lines
4.8 KiB
PHP
<?php
|
|
|
|
namespace App\Command\spider;
|
|
|
|
use App\Helpers\AppHelper;
|
|
use App\Model\AppNews;
|
|
use App\Model\AppNewsColumn;
|
|
use App\Model\AppNewsSecondColumn;
|
|
use App\Model\AppWebsiteConfig;
|
|
use Hyperf\Cache\Helper\StringHelper;
|
|
use Hyperf\Command\Annotation\Command;
|
|
use Hyperf\DbConnection\Db;
|
|
use Hyperf\Logger\LoggerFactory;
|
|
use Psr\Container\ContainerInterface;
|
|
use Psr\Log\LoggerInterface;
|
|
use Symfony\Component\Console\Input\InputInterface;
|
|
use Symfony\Component\Console\Input\InputOption;
|
|
use Symfony\Component\Console\Output\OutputInterface;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use function Hyperf\Support\env;
|
|
use function Swoole\Coroutine\run;
|
|
|
|
/**
 * Console command that checks whether news pages have been indexed by Baidu.
 *
 * For a random batch of news rows that are not yet flagged as recorded, it
 * queries Baidu's mobile search for the page URL and sets `is_record = 1`
 * when a result snippet mentions the site's domain.
 */
#[Command]
class BaiduRecordCommand extends BaseSpider
{
    /**
     * @var string Base URL of the Baidu mobile site.
     */
    protected string $baseUrl = 'https://m.baidu.com';

    protected const PLATFORM = 'wangyi';

    protected LoggerInterface $log;

    /**
     * Registers the command under the name `spider:baidu-record` and obtains
     * the logger used for all progress output.
     */
    public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory)
    {
        parent::__construct('spider:baidu-record');
        $this->log = $loggerFactory->get('log', 'command-baidu-record');
    }

    /**
     * Declares the command description and the optional `--id` / `-i` option.
     */
    public function configure()
    {
        parent::configure();
        $this->setDescription('查百度收录');
        $this->addOption('id', 'i', InputOption::VALUE_OPTIONAL, '指定的新闻id.', false);
    }

    /**
     * Command entry point.
     *
     * @return int Always 0; per-item failures are logged instead of aborting the run.
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        // The option arrives as a string, as null (`--id` without a value) or as the
        // default `false`. Only a valid integer selects a single row; everything
        // else falls back to batch mode instead of raising a TypeError.
        $id = filter_var($input->getOption('id'), FILTER_VALIDATE_INT);

        $this->_start($id);

        return 0;
    }

    /**
     * Checks up to 10 random, non-deleted, not-yet-recorded news rows against Baidu.
     *
     * Stops the whole run as soon as Baidu answers with a captcha / verification page.
     *
     * @param int|bool $id When truthy, additionally restricts the query to this news id.
     */
    private function _start(int|bool $id = false): void
    {
        // Website configs already loaded during this run, keyed by platform id.
        $websites = [];
        // Cookie handed back by Baidu, replayed on the following requests.
        $requestCookie = '';

        $query = AppNews::query()
            ->where('is_delete', 0)
            ->where('is_record', 0)
            ->orderBy(Db::raw('RAND()'))
            ->limit(10);
        if ($id) {
            $query = $query->where('id', $id);
        }

        foreach ($query->get() as $item) {
            // Throttle the requests sent to Baidu.
            sleep(3);

            try {
                if (!$item->platform) {
                    throw new \Exception('没找到平台!!');
                }

                if (!isset($websites[$item->platform])) {
                    $website = AppWebsiteConfig::find($item->platform);
                    if ($website === null) {
                        throw new \RuntimeException("没找到平台配置: {$item->platform}");
                    }
                    $websites[$item->platform] = $website->toArray();
                }

                // Main domain of the site the news item belongs to.
                $domain = trim((string) ($websites[$item->platform]['app_domain'] ?? ''), '/');
                if ($domain === '') {
                    // An empty needle matches every snippet and would flag the row unconditionally.
                    throw new \RuntimeException("平台域名为空: {$item->platform}");
                }

                $url = 'https://m.baidu.com/pu=sz%401321_480/from=0/ssid=0/s?word='
                    . urlencode("https://{$domain}/news/{$item->id}");

                $this->log->info('正在处理' . $url);
                [$res, $code, $cookie] = $this->request(
                    $url,
                    headers: $requestCookie === '' ? [] : ['Cookie' => $requestCookie],
                );
                if ($cookie) {
                    $requestCookie = $cookie;
                }

                if (!is_string($res) || $res === '') {
                    $this->log->info('请求失败, HTTP状态码: ' . $code);
                    continue;
                }

                if (stripos($res, '验证') !== false || stripos($res, 'wappass') !== false) {
                    $this->log->info('有验证码!!');
                    // A captcha means further requests are pointless; abort the run.
                    return;
                }

                // One boolean per `.abs` node: does its HTML mention the domain?
                $hits = (new Crawler($res))->filter('.abs')->each(
                    static fn (Crawler $node): bool => stripos($node->html(), $domain) !== false,
                );
                if (in_array(true, $hits, true)) {
                    $this->log->info('已收录');
                    $item->is_record = 1;
                    $item->save();
                }

                $this->log->info('处理结束..');
            } catch (\Throwable $exception) {
                // Best effort: a failure on one item must not stop the remaining ones.
                $this->log->info($exception->getMessage());
                $this->log->info($exception->getTraceAsString());
            }
        }
    }

    /**
     * Performs a GET request with cURL.
     *
     * @param string $url        Absolute URL to fetch.
     * @param array  $headers    Request headers, either as a list of `"Name: value"` lines
     *                           or as a `name => value` map; both forms are accepted.
     * @param string $cookieFile Currently unused; kept for signature compatibility.
     *
     * @return array{0: string|false, 1: int, 2: string} Raw response (response headers
     *               included, `false` on transport failure), HTTP status code, and the
     *               first `Set-Cookie` name=value pair found (empty string if none).
     */
    protected function request(string $url, array $headers = [], string $cookieFile = 'cookie.txt'): array
    {
        // CURLOPT_HTTPHEADER expects "Name: value" lines; an associative array would
        // silently lose its keys, so normalise both accepted shapes here.
        $headerLines = [];
        foreach ($headers as $name => $value) {
            $headerLines[] = is_string($name) ? "{$name}: {$value}" : (string) $value;
        }

        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // Response headers are kept in the output so Set-Cookie can be extracted below.
            CURLOPT_HEADER => true,
            CURLOPT_TIMEOUT => 15,
            // NOTE(review): certificate verification is disabled; confirm this is intentional.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_HTTPHEADER => $headerLines,
        ]);

        $result = curl_exec($ch);
        // Read everything needed from the handle before it is closed.
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($result === false) {
            $this->log->info('请求失败: ' . curl_error($ch));
        }
        curl_close($ch);

        $cookie = '';
        if (is_string($result) && preg_match('/Set-Cookie: (.*?);/i', $result, $match) === 1) {
            $cookie = $match[1];
        }

        return [$result, $httpCode, $cookie];
    }
}