From 1b36a808a8819ee0bd2c261daa0c421f94796a88 Mon Sep 17 00:00:00 2001
From: toom1996
Date: Tue, 29 Jul 2025 19:24:44 +0800
Subject: [PATCH] update

---
 app/Command/spider/BaiduRecordCommand.php | 300 ++++++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 app/Command/spider/BaiduRecordCommand.php

diff --git a/app/Command/spider/BaiduRecordCommand.php b/app/Command/spider/BaiduRecordCommand.php
new file mode 100644
index 0000000..40e9f2b
--- /dev/null
+++ b/app/Command/spider/BaiduRecordCommand.php
@@ -0,0 +1,300 @@
+setDescription('36kr.com');
+    }
+
+    /**
+     * Command entry point: prints the public URL of every news record via
+     * _start() and returns 0.
+     *
+     * spiderStart() is not invoked from here.
+     *
+     * @param InputInterface  $input  Not read by this method.
+     * @param OutputInterface $output Not written to by this method; results are var_dump()'d.
+     *
+     * @return int Always 0.
+     */
+    public function execute(InputInterface $input, OutputInterface $output): int
+    {
+        // News: /touch/reconstruct/article/list/BBM54PGAwangning/0-10.html
+        $this->_start();
+
+        return 0;
+    }
+
+    /**
+     * Collect the news section: walk every AppNews row with is_delete = 0 and
+     * var_dump() its URL, built as
+     * "https://{app_domain}{column url}{second column url}/{news id}".
+     *
+     * Rows are handled best-effort: any error raised while building one URL is
+     * caught, its message is var_dump()'d, and iteration continues.
+     *
+     * Website and column lookups are memoised in $cache, which the callback
+     * captures by reference so that a lookup done for one row is reused by
+     * later rows. A missing column or second column contributes an empty URL
+     * segment; a missing website config is reported as an error for that row.
+     */
+    private function _start()
+    {
+        $cache = [
+            'website'  => [],
+            'column'   => [],
+            'column_2' => [],
+        ];
+
+        // The trailing 1 is passed to each() unchanged (presumably the chunk
+        // size of the query builder; confirm before raising it).
+        AppNews::query()->where('is_delete', 0)->each(function ($item) use (&$cache) {
+            try {
+                if (!$item->platform) {
+                    throw new \Exception('!');
+                }
+
+                if (!isset($cache['website'][$item->platform])) {
+                    $website = AppWebsiteConfig::find($item->platform);
+                    if ($website === null) {
+                        throw new \RuntimeException("website config {$item->platform} not found");
+                    }
+                    $cache['website'][$item->platform] = $website->toArray();
+                }
+
+                // Primary domain, without leading or trailing slashes.
+                $domain = trim((string) ($cache['website'][$item->platform]['app_domain'] ?? ''), '/');
+
+                // An empty array is cached for a missing column so the lookup
+                // is not repeated for every row that references it.
+                if (!isset($cache['column'][$item->column_tag])) {
+                    $cache['column'][$item->column_tag] = AppNewsColumn::find($item->column_tag)?->toArray() ?? [];
+                }
+                $column = $cache['column'][$item->column_tag]['url'] ?? '';
+
+                if (!isset($cache['column_2'][$item->second_column])) {
+                    $cache['column_2'][$item->second_column] = AppNewsSecondColumn::find($item->second_column)?->toArray() ?? [];
+                }
+                $secondColumn = $cache['column_2'][$item->second_column]['url'] ?? '';
+
+                var_dump("https://{$domain}{$column}{$secondColumn}/{$item->id}");
+            } catch (\Throwable $exception) {
+                // Report the failure for this row and move on to the next one.
+                var_dump($exception->getMessage());
+            }
+        }, 1);
+    }
+
+    /**
+     * Lazily yield rows of the app_brands table with id > 1, ordered by id.
+     *
+     * @param mixed $brand When truthy, only the row whose id equals this value
+     *                     is selected (still subject to id > 1).
+     *
+     * @return \Generator Rows as produced by the query cursor.
+     */
+    private function _getTask($brand): \Generator
+    {
+        $query = Db::table('app_brands');
+        if ($brand) {
+            $query->where(['id' => $brand]);
+        }
+
+        $query->where('id', '>', 1)->orderBy('id');
+        foreach ($query->cursor() as $row) {
+            yield $row;
+        }
+    }
+
+    /**
+     * Turn a name into a lower-case task name, replacing every "." and every
+     * space with "-".
+     *
+     * @param mixed $name Name to convert; used as a string.
+     *
+     * @return string The converted name.
+     */
+    private function _getTaskName($name): string
+    {
+        return strtolower(strtr($name, [
+            '.' => '-',
+            ' ' => '-',
+        ]));
+    }
+
+    public function spiderStart(): void
+    {
+        list($result, $httpCode) = $this->request($this->getBaseUrl() . '/fashion/street-style/');
+
+        preg_match_all('/