update
This commit is contained in:
@ -12,6 +12,7 @@ use Hyperf\Command\Annotation\Command;
|
||||
use Hyperf\DbConnection\Db;
|
||||
use Hyperf\Logger\LoggerFactory;
|
||||
use Psr\Container\ContainerInterface;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
@ -32,9 +33,12 @@ class BaiduRecordCommand extends BaseSpider
|
||||
|
||||
protected const PLATFORM = 'wangyi';
|
||||
|
||||
protected LoggerInterface $log;
|
||||
|
||||
/**
 * Registers the console command name and prepares the logger used by this command.
 *
 * @param ContainerInterface $container     Stored on the instance through constructor promotion.
 * @param LoggerFactory      $loggerFactory Source of the logger kept in $this->log; it is asked for
 *                                          ('log', 'command-baidu-record').
 */
public function __construct(
    protected ContainerInterface $container,
    LoggerFactory $loggerFactory
) {
    // The parent constructor receives the name this command is invoked by.
    parent::__construct('spider:baidu-record');

    $commandLogger = $loggerFactory->get('log', 'command-baidu-record');
    $this->log = $commandLogger;
}
|
||||
|
||||
public function configure()
|
||||
@ -50,22 +54,17 @@ class BaiduRecordCommand extends BaseSpider
|
||||
|
||||
$this->_start();
|
||||
return 0;
|
||||
run(function () {
|
||||
$this->spiderStart();
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 采集新闻板块
|
||||
private function _start()
|
||||
{
|
||||
$cache = [];
|
||||
AppNews::query()->where('is_delete', 0)->each(function($item) {
|
||||
$requestCookie = '';
|
||||
AppNews::query()->where('is_delete', 0)->each(function($item) use (&$requestCookie, &$cache) {
|
||||
try {
|
||||
|
||||
if (!$item->platform) {
|
||||
throw new \Exception('!');
|
||||
throw new \Exception('没找到平台!!');
|
||||
}
|
||||
|
||||
if (!isset($cache['website'][$item->platform])) {
|
||||
@ -79,222 +78,67 @@ class BaiduRecordCommand extends BaseSpider
|
||||
$cache['column'][$item->column_tag] = AppNewsColumn::find($item->column_tag)?->toArray() ?: '';
|
||||
}
|
||||
|
||||
$column = $cache['column'][$item->column_tag]['url'];
|
||||
$column = $cache['column'][$item->column_tag]['url'] ?? '';
|
||||
|
||||
if (!isset($cache['column_2'][$item->second_column])) {
|
||||
$cache['column_2'][$item->second_column] = AppNewsSecondColumn::find($item->second_column)?->toArray() ?: '';
|
||||
}
|
||||
|
||||
$secondColumn = $cache['column_2'][$item->second_column]['url'];
|
||||
var_dump("https://{$domain}{$column}{$secondColumn}/{$item->id}");
|
||||
$secondColumn = $cache['column_2'][$item->second_column]['url'] ?? '';
|
||||
$url = 'https://m.baidu.com/pu=sz%401321_480/from=0/ssid=0/s?word=' . urlencode("https://{$domain}{$column}{$secondColumn}/{$item->id}");
|
||||
|
||||
$this->log->info('正在处理' . $url);
|
||||
list($res, $code, $cookie) = $this->request($url, headers: [
|
||||
'Cookie' => $requestCookie
|
||||
]);
|
||||
|
||||
if ($cookie) {
|
||||
$requestCookie = $cookie;
|
||||
}
|
||||
if (stripos($res, '验证') !== false) {
|
||||
$this->log->info('有验证码!!');
|
||||
}
|
||||
(new Crawler($res))->filter('.abs')->each(function ($node) use ($domain, $item) {
|
||||
if (stripos($node->html(), $domain) !== false) {
|
||||
$this->log->info('已收录');
|
||||
AppNews::find($item->id)->update(['is_record' => 1]);
|
||||
}
|
||||
});
|
||||
$this->log->info('处理结束..');
|
||||
}catch (\Throwable $exception) {
|
||||
var_dump($exception->getMessage());
|
||||
$this->log->info($exception->getMessage());
|
||||
}
|
||||
|
||||
}, 1);
|
||||
|
||||
return;
|
||||
$columnUrl = '/pu=sz%401321_480/from=0/ssid=0/s?word=https%3A%2F%2Fhegsfc.com%2Fnews%2F155';
|
||||
|
||||
list($res, $code) = $this->request($this->baseUrl . $columnUrl);
|
||||
|
||||
preg_match_all('/window\.initialState=(\{.*\})/', $res, $matches);
|
||||
$articleList = json_decode(current(end($matches)), true);
|
||||
|
||||
foreach ($articleList['information']['informationList']['itemList'] ?? [] as $item) {
|
||||
$itemId = $item['itemId'];
|
||||
$title = $item['templateMaterial']['widgetTitle'];
|
||||
var_dump($title);
|
||||
// var_dump($item);return;
|
||||
$url = $this->baseUrl . '/p/' . $itemId;
|
||||
var_dump($url);
|
||||
list($res, $code) = $this->request($url);
|
||||
sleep(2);
|
||||
(new Crawler($res))->filter('.articleDetailContent')->each(function ($node) {
|
||||
var_dump($node->html());
|
||||
});
|
||||
// var_dump($res);
|
||||
// var_dump($url);
|
||||
// return;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
|
||||
return;
|
||||
// foreach ($articleList as $item) {
|
||||
// foreach ($item as $article) {
|
||||
// try {
|
||||
// if ($is = AppNews::query()->where('source_url', $article['docid'])->where('source_platform', self::PLATFORM)->exists()) {
|
||||
// continue;
|
||||
// }
|
||||
// $model = new AppNews();
|
||||
// $model->title = $article['title'];
|
||||
// $model->description = $article['digest'] ?: $article['title'];
|
||||
// // video 类型 和 无图片不采集
|
||||
// if (!$article['imgsrc'] || (isset($article['skipType']) && $article['skipType'] == 'video')) {
|
||||
// continue;
|
||||
// }
|
||||
// $coverContent = file_get_contents($article['imgsrc']);
|
||||
// $savePath = 'uploads/news/' . date('Y-m-d') . '/';
|
||||
// if (!file_exists($savePath)) {
|
||||
// mkdir($savePath, 0777, true); // 0777 是文件夹的权限,true 表示递归创建子目录
|
||||
// }
|
||||
// $saveFile = $savePath . AppHelper::generateAid() . '.png';
|
||||
// file_put_contents('/www/wwwroot/' . $saveFile, $coverContent);
|
||||
// $model->cover = env('APP_DOMAIN', '') . '/' . $saveFile;
|
||||
// $model->platform = 3;
|
||||
// $model->source_url = $article['docid'];
|
||||
// $model->source_platform = self::PLATFORM;
|
||||
// $model->save();
|
||||
// }catch (\Throwable $exception) {
|
||||
// var_dump($exception->getMessage());
|
||||
// continue;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
private function _getTask($brand): \Generator
|
||||
protected function request(string $url, array $headers = [], string $cookieFile = 'cookie.txt'): array
|
||||
{
|
||||
$query = Db::table('app_brands');
|
||||
if ($brand) {
|
||||
$query->where(['id' => $brand]);
|
||||
}
|
||||
$ch = curl_init();
|
||||
curl_setopt_array($ch, array(
|
||||
CURLOPT_URL => $url,
|
||||
CURLOPT_RETURNTRANSFER => true,
|
||||
CURLOPT_ENCODING => '',
|
||||
CURLOPT_MAXREDIRS => 10,
|
||||
CURLOPT_HEADER => true,
|
||||
CURLOPT_TIMEOUT => 15,
|
||||
CURLOPT_SSL_VERIFYPEER => false,
|
||||
CURLOPT_FOLLOWLOCATION => true,
|
||||
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
|
||||
CURLOPT_CUSTOMREQUEST => 'GET',
|
||||
CURLOPT_HTTPHEADER => $headers,
|
||||
));
|
||||
|
||||
$query->where('id', '>', 1)->orderBy('id');
|
||||
foreach ($query->cursor() as $row) {
|
||||
yield $row;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Turns a name into a lower-case, hyphen-separated slug.
 *
 * Every "." and every space is replaced by "-", then the whole string is lower-cased.
 * Consecutive separators are not collapsed: "A.P.C. Paris" becomes "a-p-c--paris".
 *
 * @param string $name Name to convert.
 *
 * @return string The slug.
 */
private function _getTaskName($name): string
{
    // Neither replacement can produce a "." or a space, so replacing both in one call
    // gives the same result as a single-pass character map.
    $hyphenated = str_replace(['.', ' '], '-', $name);

    return strtolower($hyphenated);
}
|
||||
|
||||
/**
 * Collects image URLs from the street-style listing and dumps them.
 *
 * Steps:
 *  1. Request "<base url>/fashion/street-style/" and read the first JSON-LD script block.
 *  2. Take the article list from [0]['itemListElement'] of the decoded JSON.
 *  3. Request each article URL, read its first JSON-LD block and gather every
 *     ['item']['image'] found under ['about']['itemListElement'].
 *  4. var_dump() the gathered list.
 *
 * Pages without a JSON-LD block, articles without a URL and entries without an image
 * are skipped instead of raising undefined-index errors.
 */
public function spiderStart(): void
{
    $jsonLdPattern = '/<script id="json-ld" type="application\/ld\+json">([\s\S]*?)<\/script>/';

    [$listingHtml] = $this->request($this->getBaseUrl() . '/fashion/street-style/');

    // preg_match_all() always fills $matches with one array per group, so the only
    // reliable "nothing found" signal is its return value (0 matches, or false on error).
    if (!preg_match_all($jsonLdPattern, (string) $listingHtml, $matches)) {
        $this->logger->info(self::getPlatform() . " 数据获取失败。");
        return;
    }

    $listing = json_decode($matches[1][0], true);
    $articles = $listing[0]['itemListElement'] ?? [];

    if (!$articles) {
        $this->logger->info(self::getPlatform() . " 文章数据获取失败。");
        return;
    }

    $saveImages = [];
    foreach ($articles as $article) {
        $articleUrl = $article['url'] ?? null;
        if (!is_string($articleUrl) || $articleUrl === '') {
            // Nothing to request for this entry.
            continue;
        }

        [$articleHtml] = $this->request($articleUrl);

        if (!preg_match_all($jsonLdPattern, (string) $articleHtml, $matches)) {
            continue;
        }

        $detail = json_decode($matches[1][0], true);

        foreach ($detail['about']['itemListElement'] ?? [] as $image) {
            if (isset($image['item']['image'])) {
                $saveImages[] = $image['item']['image'];
            }
        }
    }

    // The collected list is only dumped, not persisted.
    var_dump($saveImages);
}
|
||||
|
||||
/**
 * Requests a page and returns the designer collections embedded in its preloaded state.
 *
 * The page is expected to contain "window.__PRELOADED_STATE__ = <json>;</script>".
 * The JSON is decoded and ['transformed']['runwayDesignerContent']['designerCollections']
 * is returned.
 *
 * @param string $url Page URL to request.
 *
 * @return array The collections, or an empty array when the response status is not 200,
 *               the state marker is absent, or the expected key path does not exist.
 */
private function _getShowsList($url)
{
    [$body, $httpCode] = $this->request($url);

    if ((int) $httpCode !== 200) {
        $this->logger->info('未找到数据.');
        return [];
    }

    // Only the first occurrence is needed; a miss is reported here rather than
    // feeding a non-string into json_decode().
    $found = preg_match('/window\.__PRELOADED_STATE__ = ([\s\S]*?);<\/script>/', (string) $body, $match);
    if ($found !== 1) {
        $this->logger->info('未找到数据.');
        return [];
    }

    $state = json_decode($match[1], true);

    return $state['transformed']['runwayDesignerContent']['designerCollections'] ?? [];
}
|
||||
|
||||
|
||||
private function _getDetail(int $brandId, array $info)
|
||||
{
|
||||
$model = $this->getArticleModel(['brand' => $brandId, 'title' => $info['hed']]);
|
||||
|
||||
$model->title = $info['hed'];
|
||||
$model->images = json_encode([]);
|
||||
$model->platform = self::getPlatform();
|
||||
|
||||
// 获取图片
|
||||
$pageUri = $info['url'];
|
||||
$requestUrl = $this->getBaseUrl() . $pageUri . '/slideshow/collection';
|
||||
$this->logger->info("正在匹配发布会详情 {$requestUrl}");
|
||||
|
||||
$matches = [];
|
||||
list($result, $httpCode) = $this->request($requestUrl);
|
||||
|
||||
if ($httpCode != 200 || !$result) {
|
||||
$this->logger->warning($requestUrl . '请求失败.');
|
||||
return;
|
||||
}
|
||||
|
||||
preg_match_all('/window\.__PRELOADED_STATE__ = (.*?);</s', $result, $matches);
|
||||
|
||||
$saveUrl = [];
|
||||
if (count($matches) > 1) {
|
||||
$val = json_decode(current($matches[1]), true);
|
||||
$images = $val['transformed']['runwayGalleries']['galleries'][0]['items'] ?? false;
|
||||
|
||||
if ($images === false) {
|
||||
$this->logger->warning($requestUrl . '获取图片失败.');
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (is_array($images) ? $images : [] as $img) {
|
||||
$saveUrl[] = [
|
||||
'src' => $img['image']['sources']['xxl']['url']
|
||||
];
|
||||
}
|
||||
$model->images = json_encode($saveUrl);
|
||||
}
|
||||
|
||||
$model->save();
|
||||
|
||||
$this->logger->info("end: {$requestUrl}");
|
||||
// curl_setopt($ch, CURLOPT_URL, $url);
|
||||
// curl_setopt($ch, CURLOPT_HEADER, false);
|
||||
// curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
|
||||
// curl_setopt($ch, CURLOPT_TIMEOUT, 10);
|
||||
$result = curl_exec($ch);
|
||||
preg_match_all('/Set-Cookie: (.*?);/i', $result, $matches);
|
||||
curl_close($ch);
|
||||
$httpCode = curl_getinfo($ch,CURLINFO_HTTP_CODE);
|
||||
return [$result, $httpCode, $matches[1][0] ?? ''];
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@
|
||||
namespace App\Controller\admin\api;
|
||||
|
||||
use App\Controller\AbstractController;
|
||||
use App\Model\AppNews;
|
||||
use App\Model\AppNewsColumn;
|
||||
use App\Model\AppNewsSecondColumn;
|
||||
use App\Model\AppWebsiteConfig;
|
||||
@ -21,6 +22,11 @@ class WebsiteController extends AbstractController
|
||||
public function config(): ResponseInterface
|
||||
{
|
||||
$query = AppWebsiteConfig::query()->where('is_delete', 0)->get()->toArray();
|
||||
foreach ($query as &$item) {
|
||||
$total = AppNews::query()->where('platform', $item['id'])->where('is_delete', 0)->count();
|
||||
$include = AppNews::query()->where('platform', $item['id'])->where('is_record', 1)->where('is_delete', 0)->count();
|
||||
$item['record'] = '百度:' . ($include ? round($include/$total, 2) : 0) . '%';
|
||||
}
|
||||
return $this->response->json([
|
||||
'code' => 0,
|
||||
'data' => $query
|
||||
|
Reference in New Issue
Block a user