From 6f56e186619979edac37bf6fdbcfa1aef971ea10 Mon Sep 17 00:00:00 2001 From: toom1996 Date: Wed, 30 Jul 2025 11:52:33 +0800 Subject: [PATCH] update --- app/Command/spider/BaiduRecordCommand.php | 264 ++++-------------- .../admin/api/WebsiteController.php | 6 + config/autoload/crontab.php | 5 + config/autoload/logger.php | 17 ++ config/autoload/processes.php | 1 + config/crontabs.php | 7 + storage/view/website/index.blade.php | 7 +- 7 files changed, 95 insertions(+), 212 deletions(-) create mode 100644 config/autoload/crontab.php create mode 100644 config/crontabs.php diff --git a/app/Command/spider/BaiduRecordCommand.php b/app/Command/spider/BaiduRecordCommand.php index 40e9f2b..bcec8ce 100644 --- a/app/Command/spider/BaiduRecordCommand.php +++ b/app/Command/spider/BaiduRecordCommand.php @@ -12,6 +12,7 @@ use Hyperf\Command\Annotation\Command; use Hyperf\DbConnection\Db; use Hyperf\Logger\LoggerFactory; use Psr\Container\ContainerInterface; +use Psr\Log\LoggerInterface; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Input\InputOption; use Symfony\Component\Console\Output\OutputInterface; @@ -32,9 +33,12 @@ class BaiduRecordCommand extends BaseSpider protected const PLATFORM = 'wangyi'; + protected LoggerInterface $log; + public function __construct(protected ContainerInterface $container, LoggerFactory $loggerFactory) { parent::__construct('spider:baidu-record'); + $this->log = $loggerFactory->get('log', 'command-baidu-record'); } public function configure() @@ -50,22 +54,17 @@ class BaiduRecordCommand extends BaseSpider $this->_start(); return 0; - run(function () { - $this->spiderStart(); - }); - - return 0; } // 采集新闻板块 private function _start() { $cache = []; - AppNews::query()->where('is_delete', 0)->each(function($item) { + $requestCookie = ''; + AppNews::query()->where('is_delete', 0)->each(function($item) use (&$requestCookie, &$cache) { try { - if (!$item->platform) { - throw new \Exception('!'); + throw new \Exception('没找到平台!!'); } if (!isset($cache['website'][$item->platform])) { @@ -79,222 +78,67 @@ class BaiduRecordCommand extends BaseSpider $cache['column'][$item->column_tag] = AppNewsColumn::find($item->column_tag)?->toArray() ?: ''; } - $column = $cache['column'][$item->column_tag]['url']; + $column = $cache['column'][$item->column_tag]['url'] ?? ''; if (!isset($cache['column_2'][$item->second_column])) { $cache['column_2'][$item->second_column] = AppNewsSecondColumn::find($item->second_column)?->toArray() ?: ''; } - $secondColumn = $cache['column_2'][$item->second_column]['url']; - var_dump("https://{$domain}{$column}{$secondColumn}/{$item->id}"); + $secondColumn = $cache['column_2'][$item->second_column]['url'] ?? ''; + $url = 'https://m.baidu.com/pu=sz%401321_480/from=0/ssid=0/s?word=' . urlencode("https://{$domain}{$column}{$secondColumn}/{$item->id}"); + + $this->log->info('正在处理' . $url); + list($res, $code, $cookie) = $this->request($url, headers: [ + 'Cookie' => $requestCookie + ]); + + if ($cookie) { + $requestCookie = $cookie; + } + if (stripos($res, '验证') !== false) { + $this->log->info('有验证码!!'); + } + (new Crawler($res))->filter('.abs')->each(function ($node) use ($domain, $item) { + if (stripos($node->html(), $domain) !== false) { + $this->log->info('已收录'); + AppNews::find($item->id)->update(['is_record' => 1]); + } + }); + $this->log->info('处理结束..'); }catch (\Throwable $exception) { - var_dump($exception->getMessage()); + $this->log->info($exception->getMessage()); } }, 1); return; - $columnUrl = '/pu=sz%401321_480/from=0/ssid=0/s?word=https%3A%2F%2Fhegsfc.com%2Fnews%2F155'; - - list($res, $code) = $this->request($this->baseUrl . $columnUrl); - - preg_match_all('/window\.initialState=(\{.*\})/', $res, $matches); - $articleList = json_decode(current(end($matches)), true); - - foreach ($articleList['information']['informationList']['itemList'] ?? [] as $item) { - $itemId = $item['itemId']; - $title = $item['templateMaterial']['widgetTitle']; - var_dump($title); -// var_dump($item);return; - $url = $this->baseUrl . '/p/' . $itemId; - var_dump($url); - list($res, $code) = $this->request($url); - sleep(2); - (new Crawler($res))->filter('.articleDetailContent')->each(function ($node) { - var_dump($node->html()); - }); -// var_dump($res); -// var_dump($url); -// return; - } - - return; - - - return; -// foreach ($articleList as $item) { -// foreach ($item as $article) { -// try { -// if ($is = AppNews::query()->where('source_url', $article['docid'])->where('source_platform', self::PLATFORM)->exists()) { -// continue; -// } -// $model = new AppNews(); -// $model->title = $article['title']; -// $model->description = $article['digest'] ?: $article['title']; -// // video 类型 和 无图片不采集 -// if (!$article['imgsrc'] || (isset($article['skipType']) && $article['skipType'] == 'video')) { -// continue; -// } -// $coverContent = file_get_contents($article['imgsrc']); -// $savePath = 'uploads/news/' . date('Y-m-d') . '/'; -// if (!file_exists($savePath)) { -// mkdir($savePath, 0777, true); // 0777 是文件夹的权限,true 表示递归创建子目录 -// } -// $saveFile = $savePath . AppHelper::generateAid() . '.png'; -// file_put_contents('/www/wwwroot/' . $saveFile, $coverContent); -// $model->cover = env('APP_DOMAIN', '') . '/' . $saveFile; -// $model->platform = 3; -// $model->source_url = $article['docid']; -// $model->source_platform = self::PLATFORM; -// $model->save(); -// }catch (\Throwable $exception) { -// var_dump($exception->getMessage()); -// continue; -// } -// } -// } } - private function _getTask($brand): \Generator + protected function request(string $url, array $headers = [], string $cookieFile = 'cookie.txt'): array { - $query = Db::table('app_brands'); - if ($brand) { - $query->where(['id' => $brand]); - } + $ch = curl_init(); + curl_setopt_array($ch, array( + CURLOPT_URL => $url, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_ENCODING => '', + CURLOPT_MAXREDIRS => 10, + CURLOPT_HEADER => true, + CURLOPT_TIMEOUT => 15, + CURLOPT_SSL_VERIFYPEER => false, + CURLOPT_FOLLOWLOCATION => true, + CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1, + CURLOPT_CUSTOMREQUEST => 'GET', + CURLOPT_HTTPHEADER => $headers, + )); - $query->where('id', '>', 1)->orderBy('id'); - foreach ($query->cursor() as $row) { - yield $row; - } - } - - private function _getTaskName($name): string - { - return strtolower(strtr($name, [ - '.' => '-', - ' ' => '-' - ])); - } - - public function spiderStart(): void - { - list($result, $httpCode) = $this->request($this->getBaseUrl() . '/fashion/street-style/'); - - preg_match_all('/