A Simple Generic Crawler with jQuery and jsdom
- Author: 国Wei (@ygweric)
Background
We use a large language model in the project, but LLMs hallucinate, so Bing search results are used to cross-check the model's answers.
The search side uses Bing's official free API (1,000 calls per month; the search call itself is sketched after this list). What remains is a crawler that extracts the content of the result pages Bing returns. Its requirements:
- The extraction does not need to be precise; it only has to pull out the main body text to pass to the LLM.
- No need to work around each site's anti-crawling measures: if a site blocks crawling, just skip it. Roughly half of the links yielding content is good enough.
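For context, the search step is not part of this post; below is a minimal sketch of how the { link, title } list consumed by the crawler might be produced with the official Bing Web Search API v7. The endpoint and header are the standard ones; BING_API_KEY is a placeholder.
// Sketch: fetch search results from the official Bing Web Search API (v7).
// BING_API_KEY is a placeholder; the free tier allows 1,000 calls per month.
import axios from "axios";

const searchBing = async (query) => {
  const response = await axios.get("https://api.bing.microsoft.com/v7.0/search", {
    params: { q: query, count: 20 },
    headers: { "Ocp-Apim-Subscription-Key": process.env.BING_API_KEY },
  });
  // Map to the { link, title } shape that urls.json / bingResults uses below.
  return (response.data.webPages?.value || []).map((p) => ({
    link: p.url,
    title: p.name,
  }));
};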
The full code is at the very end; in between, I want to focus on a few small points.
Key points
1. Detect the page encoding correctly
Pages come in different encodings (GBK, UTF-8, and so on). The encoding has to be detected, otherwise the text comes out garbled; iconv-lite does the decoding.
let res_ = "";
try {
  const response = await axios.get(pageContentUrl, {
    responseType: "arraybuffer",
    headers: {
      "User-Agent": `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36`,
    },
  });
  // Decode the raw bytes as UTF-8 first, just to read the declared charset,
  // then re-decode with the encoding the page actually declares.
  const tmpRes = iconv.decode(response.data, "utf-8");
  const encoding = getHtmlEncoding(tmpRes) || "utf-8";
  res_ = iconv.decode(response.data, encoding);
} catch (err) {
  console.error(err.message);
  errorInfoStr += `\n${err.message}`;
}
const htmlString = String(res_);
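getHtmlEncoding lives in utils.js, which the post does not include; here is a minimal sketch of what such a helper might do, assuming it only inspects the page's meta charset declaration.
// Hypothetical sketch of getHtmlEncoding; the real helper in utils.js is not shown.
// It pulls the charset out of <meta charset="..."> or
// <meta http-equiv="Content-Type" content="...; charset=...">.
const getHtmlEncoding = (htmlString) => {
  const match = htmlString.match(/<meta[^>]+charset=["']?([\w-]+)/i);
  return match ? match[1].toLowerCase() : null;
};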
2. Exact body-text selectors
Most Bing results come from Baijiahao, Tencent News, Zhihu, official sites, and the like, so you can gather some real sample pages, hard-code the matching selectors, and loop over them; the first selector that matches is the final result.
const getExplicitMatchedContent = ($) => {
  const contentSelectors = [
    `[data-role="paragraph"]`, // https://new.qq.com/rain/a/20241016A08CP400
    "article",
    ".article",
    ".detail-content",
    ".content-article",
    ".article-content",
    ".main",
    ".content",
    ".index_centent",
    `[data-testid="article"]`, // Baijiahao https://baijiahao.baidu.com/s?id=1730584141402361753
  ];
  let maxContent = null;
  let maxSelector = null;
  // Take the first selector whose text is long enough to be a real article body.
  for (let i = 0; i < contentSelectors.length; i++) {
    const selector = contentSelectors[i];
    const contentText = $(selector).first().text().trim();
    if (contentText.length > 50) {
      maxContent = contentText;
      maxSelector = `$(${selector}).first().text().trim()`;
      break;
    }
  }
  return {
    content: maxContent,
    selector: maxSelector,
  };
};
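A quick way to try the function: wire jsdom and jquery together and feed it an HTML string. The markup below is a made-up example; "article" is the selector that should hit.
// Usage sketch with made-up markup.
const html = `<html><body><article>${"正文内容 ".repeat(20)}</article></body></html>`;
const dom = new JSDOM(html);
const $ = jquery(dom.window);
console.log(getExplicitMatchedContent($));
// -> { content: "正文内容 正文内容 ...", selector: "$(article).first().text().trim()" }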
3. Fuzzy (generic) body-text selectors
For sites with no exact match, a set of fuzzy selectors can be used to match article, content, main, and similar nodes, then compare them in a loop and keep the node with the longest text.
One possible improvement here: additionally detect or strip JS, CSS, and other junk inside the nodes so the comparison is more accurate.
/**
 * Fuzzy tag matching
 * @param {*} $
 * @returns
 */
const getImplicitMatchedContent = ($) => {
  // Common tags that may contain the article body
  const contentSelectors = [
    ".article", // common body-text class
    'div[class*="content"]',
    'div[class*="article"]',
    'div[id*="content"]',
    'div[id*="article"]',
    "main", // main page content
    "section", // sections that may hold the body
    "p", // paragraphs
  ];
  return extractMaxContent($, contentSelectors, {
    extraDiscardFn: (currentContent) => {
      // Too many long whitespace runs means the selection is too broad
      // and has swallowed navigation bars and the like.
      const consecutiveBlanksCount = countConsecutiveBlanksGreaterThanN(
        currentContent,
        5
      );
      console.log(`Long whitespace runs: ${consecutiveBlanksCount}`);
      const withTooManyBlank = consecutiveBlanksCount > 10;
      return withTooManyBlank;
    },
  });
};
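Both extractMaxContent and countConsecutiveBlanksGreaterThanN come from utils.js, which the post does not include. The sketches below are reconstructed from the call sites; the parameter names match the calls, but the bodies are my assumptions.
// Hypothetical sketch of extractMaxContent: walk every selector, measure each
// matched element's text, skip candidates rejected by extraDiscardFn, keep the longest.
const extractMaxContent = ($, contentSelectors, { limitedElementCount, extraDiscardFn } = {}) => {
  let maxContent = null;
  let maxSelector = null;
  for (const selector of contentSelectors) {
    let elements = $(selector);
    if (limitedElementCount) elements = elements.slice(0, limitedElementCount);
    elements.each((i, el) => {
      const currentContent = $(el).text().trim();
      if (extraDiscardFn && extraDiscardFn(currentContent)) return; // discard noisy candidates
      if (!maxContent || currentContent.length > maxContent.length) {
        maxContent = currentContent;
        maxSelector = `$(${selector}).eq(${i}).text().trim()`;
      }
    });
  }
  return { content: maxContent, selector: maxSelector };
};

// Hypothetical sketch of countConsecutiveBlanksGreaterThanN: count how many
// whitespace runs longer than n characters the text contains.
const countConsecutiveBlanksGreaterThanN = (str, n) => {
  const matches = str.match(new RegExp(`\\s{${n + 1},}`, "g"));
  return matches ? matches.length : 0;
};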
Further optimization
The following improvements are possible (doable, but I will probably not bother):
- Set per-site headers and cookies to get past anti-crawling checks as far as possible (see the sketch after this list)
- Configure different selectors per site
- Bring in NLP or other AI techniques to extract the body text more accurately
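For the first item, a minimal sketch of what per-site request headers could look like; the domain key, Referer, and cookie values are illustrative assumptions, not tested against the real sites.
// Illustrative per-site header overrides; the domains and values are made up.
const siteHeaderOverrides = {
  "zhuanlan.zhihu.com": {
    Referer: "https://www.zhihu.com/",
    Cookie: "", // a real logged-in cookie would go here
  },
};
const getRequestHeaders = (pageContentUrl) => {
  const { hostname } = new URL(pageContentUrl);
  return {
    "User-Agent": `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36`,
    ...(siteHeaderOverrides[hostname] || {}),
  };
};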
Crawler results
The final output is shown in the screenshot below; it looks decent and fairly clean.
Final JS file
/**
 * Crawler for Bing results: extracts the SEO-friendly page title and body text directly
 */
import fs from "fs";
import dayjs from "dayjs";
import { JSDOM } from "jsdom";
import jquery from "jquery";
import axios from "axios";
import iconv from "iconv-lite";
import * as path from "path";
import { JSONFilePreset } from "lowdb/node";
import {
  countConsecutiveBlanks,
  countConsecutiveBlanksGreaterThanN,
  createDirectorySync,
  createFileIfNotExist,
  delay,
  getHtmlEncoding,
  hasConsecutiveBlanks,
  sanitizeFileName,
  extractMaxContent,
  isDomainOnly,
  splitStringIntoChunks,
} from "./utils.js";

// const settingFilePath = `./res/llm/bing/settingDb.json`;
const resultUrlsPath = `./res/llm/bing/urls.json`;
const pageContentFolderPath = `./res/llm/bing/single-file-page-content/${dayjs().format(
  "DD_HH-mm-ss/"
)}`;
const jsonlFilePath = `./res/llm/bing/jsonl/${dayjs().format(
  "DD_HH-mm-ss"
)}.jsonl`;
let errorInfoStr = "";

// createDirectorySync(settingFilePath);
createDirectorySync(pageContentFolderPath);
createDirectorySync(jsonlFilePath);
createFileIfNotExist(jsonlFilePath);

// const settingDb = await JSONFilePreset(settingFilePath, []);

const resultUrlsTxt = fs.readFileSync(resultUrlsPath, "utf-8");
let bingResults = JSON.parse(resultUrlsTxt);
// Test override, remove later !!!!!!!!!!!!!!!! — crawls a single page instead of the full urls.json list
bingResults = [
  {
    link: "https://new.qq.com/rain/a/20241016A08CP400",
    title: "xxx",
  },
];
const getExplicitMatchedContent = ($) => {
  const contentSelectors = [
    `[data-role="paragraph"]`, // https://new.qq.com/rain/a/20241016A08CP400
    "article",
    ".article",
    ".detail-content",
    ".content-article",
    ".article-content",
    ".main",
    ".content",
    ".index_centent",
    `[data-testid="article"]`, // Baijiahao https://baijiahao.baidu.com/s?id=1730584141402361753
  ];
  let maxContent = null;
  let maxSelector = null;
  // Take the first selector whose text is long enough to be a real article body.
  for (let i = 0; i < contentSelectors.length; i++) {
    const selector = contentSelectors[i];
    const contentText = $(selector).first().text().trim();
    if (contentText.length > 50) {
      maxContent = contentText;
      maxSelector = `$(${selector}).first().text().trim()`;
      break;
    }
  }
  return {
    content: maxContent,
    selector: maxSelector,
  };
};
/**
 * Fuzzy tag matching
 * @param {*} $
 * @returns
 */
const getImplicitMatchedContent = ($) => {
  // Common tags that may contain the article body
  const contentSelectors = [
    ".article", // common body-text class
    'div[class*="content"]',
    'div[class*="article"]',
    'div[id*="content"]',
    'div[id*="article"]',
    "main", // main page content
    "section", // sections that may hold the body
    "p", // paragraphs
  ];
  return extractMaxContent($, contentSelectors, {
    extraDiscardFn: (currentContent) => {
      // Too many long whitespace runs means the selection is too broad
      // and has swallowed navigation bars and the like.
      const consecutiveBlanksCount = countConsecutiveBlanksGreaterThanN(
        currentContent,
        5
      );
      console.log(`Long whitespace runs: ${consecutiveBlanksCount}`);
      const withTooManyBlank = consecutiveBlanksCount > 10;
      return withTooManyBlank;
    },
  });
};
const getPageContentDataFromHtml = (htmlString) => {
  try {
    const dom = new JSDOM(htmlString);
    const $ = jquery(dom.window);
    const title =
      $("h1").text().replace(/\s+/g, "") ||
      $("h2").text().replace(/\s+/g, "") ||
      `${Date.now()}-${Math.round(Math.random() * 1000)}`;
    // Try the exact selectors first
    let pageContentText = "";
    let pageContentSelector = "";
    ({ content: pageContentText, selector: pageContentSelector } =
      getExplicitMatchedContent($));
    if (!pageContentText || pageContentText.length < 50) {
      // No exact hit; fall back to fuzzy matching
      ({ content: pageContentText, selector: pageContentSelector } =
        getImplicitMatchedContent($));
    }
    return {
      title,
      pageContentText,
      pageContentSelector,
    };
  } catch (error) {
    console.error(error);
    return {
      title: "",
      pageContentText: "",
    };
  }
};
const fetchContent = async (pageContentUrl) => {
  // if (settingDb.data.includes(pageContentUrl)) {
  //   console.log(`Page already processed: ${pageContentUrl}`);
  //   return;
  // }
  console.log("fetchContent url: ", pageContentUrl);
  let res_ = "";
  try {
    const response = await axios.get(pageContentUrl, {
      responseType: "arraybuffer",
      headers: {
        "User-Agent": `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36`,
      },
    });
    // Decode as UTF-8 first to read the declared charset, then re-decode correctly.
    const tmpRes = iconv.decode(response.data, "utf-8");
    const encoding = getHtmlEncoding(tmpRes) || "utf-8";
    res_ = iconv.decode(response.data, encoding);
  } catch (err) {
    console.error(err.message);
    errorInfoStr += `\n${err.message}`;
  }
  const htmlString = String(res_);
  const { title, pageContentText, pageContentSelector } =
    getPageContentDataFromHtml(htmlString);
  // Print the head and tail of the content to eyeball that it is correct
  console.log(
    `pageContentText: ${pageContentText.slice(
      0,
      1000
    )} \n ----- \n ${pageContentText.slice(
      pageContentText.length - 500,
      pageContentText.length
    )}`
  );
  // Don't validate the title: it is sometimes nonstandard, but that does not affect the result
  if (!pageContentText || pageContentText.length < 50) {
    errorInfoStr += `\nBad page, please check: ${pageContentUrl}`;
    return;
  }
  const fileName = title.slice(0, 60);
  const newTxtFilePath = path.join(pageContentFolderPath, `${fileName}.txt`);
  const finalPageContentText = `标题:${title}
原文地址:${pageContentUrl}
JQuery选择器:${pageContentSelector}
正文:
${pageContentText}`;
  fs.writeFileSync(newTxtFilePath, finalPageContentText, "utf-8");
  // jsonl -------------------------------
  const chunkedContentList = splitStringIntoChunks(pageContentText, 3.8 * 1000); // to be safe, cap chunks at ~3.8k chars
  for (let i = 0; i < chunkedContentList.length; i++) {
    const content = chunkedContentList[i];
    const newLine = JSON.stringify({
      title: title,
      pageContentUrl: pageContentUrl,
      releaseDate: dayjs().format("YYYY-MM-DD HH:mm:ss"),
      content: content,
    });
    const origJsonlData = fs.readFileSync(jsonlFilePath, "utf8");
    const needNewLine = origJsonlData && !origJsonlData.endsWith("\n");
    fs.appendFileSync(jsonlFilePath, `${needNewLine ? "\n" : ""}${newLine}`);
    console.log(`Wrote jsonl line`);
  }
  // settingDb.data.push(pageContentUrl);
  // await settingDb.write();
};
setTimeout(async () => {
  for (let i = 0; i < bingResults.length; i++) {
    const { link, title } = bingResults[i];
    console.log(bingResults[i].link);
    if (
      [
        // The sites below are problematic!
        "https://www.36kr.com", // returns a blob of JS
        "https://www.cbre.com.cn",
        "https://zhuanlan.zhihu.com", // anti-crawling, e.g. https://zhuanlan.zhihu.com/p/577551904
        "https://baijiahao.baidu.com/", // Baidu detects crawlers and returns null
        "https://www.xpu.edu.cn/", // https://www.xpu.edu.cn/index/zbgs/144.htm
        "https://www.douyin.com/", // https://www.douyin.com/search/%E5%BB%9D%BF
        "https://wenku.baidu.com/", // https://wenku.baidu.com/view/d66d1.html
        "https://www.vzkoo.com/", // content is fine but never comes back, https://www.vzkoo.com/read/204a68fd1c9.html
      ].some((url) => link.startsWith(url)) ||
      [
        ".pdf", // skip PDFs
      ].some((url) => link.endsWith(url)) ||
      isDomainOnly(link)
    ) {
      continue;
    }
    await fetchContent(link);
  }
  console.log("finished");
}, 0);
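utils.js itself is not included in the post. For completeness, here are hedged sketches of two more helpers the file imports, reconstructed from their call sites; the bodies are my assumptions.
// Hypothetical sketch: split a long string into fixed-size chunks
// (used above to keep each JSONL record under ~3.8k characters).
export const splitStringIntoChunks = (str, chunkSize) => {
  const chunks = [];
  for (let i = 0; i < str.length; i += chunkSize) {
    chunks.push(str.slice(i, i + chunkSize));
  }
  return chunks;
};

// Hypothetical sketch: true when a URL is just a bare domain with no article path,
// e.g. https://example.com/ — such links are skipped in the main loop above.
export const isDomainOnly = (url) => {
  try {
    const { pathname, search } = new URL(url);
    return (pathname === "/" || pathname === "") && !search;
  } catch {
    return false;
  }
};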
About me
国 wei (Eric) GitHub