nieheyong / blog Goto Github PK
View Code? Open in Web Editor NEW📝个人博客
📝个人博客
目前,绝大部分的爬虫教程都是基于Python和Node.js。其实,只要有Chrome浏览器,按F12打开Devtools就能随时随地轻轻松松写一个爬虫,完全不用安装其它语言环境。今天就介绍一个只使用Chrome Devtools来爬取网站 https://www.biqudu.com/31_31729/ 上的小说并保存为文本文件的爬虫。
Devtools提供了Snippets功能让我们可以在这里写JavaScript代码,步骤参考下图:
根据Url加载一个第三方库,可以用这个函数加载jquery,underscore等工具库,加载完成后就可以在代码中使用这些库了,本例中使用这个函数加载async异步并发控制库。
/**
 * Load a third-party script into the current page by appending a <script>
 * element to document.body.
 *
 * @param {string} url - Address of the script to load.
 * @returns {Promise<*>} Fulfils with whatever the element's `onload` handler
 *   receives; rejects with whatever its `onerror` handler receives.
 */
async function loadLibrary(url) {
  const scriptTag = document.createElement('script');
  // Wire both outcome handlers before the element is attached to the page.
  const settled = new Promise((onLoaded, onFailed) => {
    scriptTag.onload = onLoaded;
    scriptTag.onerror = onFailed;
  });
  scriptTag.src = url;
  document.body.appendChild(scriptTag);
  return settled;
}
将string下载到文本文件
/**
 * Offer a string to the user as a downloadable plain-text file.
 *
 * The text is wrapped in a Blob, exposed through a temporary object URL and
 * downloaded by programmatically clicking an <a download> element.
 *
 * @param {string} string - Text content of the file.
 * @param {string} fileName - Suggested name for the downloaded file.
 * @returns {void}
 */
function saveFile(string, fileName) {
  const blob = new Blob([string], { type: 'text/plain' });
  const blobUrl = window.URL.createObjectURL(blob);

  const link = document.createElement('a');
  link.download = fileName;
  link.href = blobUrl;
  link.click();

  // An object URL keeps its Blob alive until it is revoked. Revoke it after a
  // delay rather than immediately so the browser has time to start the download.
  setTimeout(() => window.URL.revokeObjectURL(blobUrl), 10000);
}
使用了Fetch api,根据url下载一个html文本文件并转换成DOM元素后返回,返回的元素具有DOM api,例如 querySelector,方便对节点的提取和分析。
/**
 * Fetch a page and parse it into a detached <html> element so it can be
 * queried with the DOM API (querySelector and friends).
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<HTMLElement>} A detached <html> element whose innerHTML is
 *   the downloaded markup.
 * @throws {Error} If the server answers with a non-2xx status. Failures of
 *   `fetch` itself propagate unchanged.
 */
async function getHtml(url) {
  const response = await fetch(url);
  // fetch() only rejects on network failure; an HTTP error page would
  // otherwise be parsed and returned as if it were the requested content.
  if (!response.ok) {
    throw new Error(`Request for ${url} failed with HTTP status ${response.status}`);
  }
  const htmlText = await response.text();
  const html = document.createElement('html');
  html.innerHTML = htmlText;
  return html;
}
分析小说主页https://www.biqudu.com/31_31729/,通过document.querySelectorAll('#list dd a')
可以获取包含所有章节名称和链接的a标签元素。
/**
 * Collect the chapter links from a novel's index page.
 *
 * The links are the elements matching `#list dd a`. The first `latestCount`
 * of them are dropped; the original snippet treats them as the "latest
 * chapters" shortcut list at the top of the page.
 *
 * @param {string} url - Address of the novel's index page.
 * @param {number} [latestCount=12] - Number of leading links to skip.
 * @returns {Promise<Element[]>} The remaining chapter anchors, in page order.
 */
async function getDirectory(url, latestCount = 12) {
  const page = await getHtml(url);
  const chapterLinks = Array.from(page.querySelectorAll('#list dd a'));
  return chapterLinks.slice(latestCount);
}
分析小说章节 https://www.biqudu.com/31_31729/2170175.html,章节内容位于ID为content
的DIV元素中
/**
 * Download one chapter and return it as plain text.
 *
 * The chapter body is taken from the element with id `content` on the chapter
 * page. The result is the title, a CRLF, the body text and a trailing CRLF.
 *
 * @param {{href: string, innerText: string}} section - Chapter link (an anchor
 *   from the directory); `innerText` is used as the chapter title.
 * @returns {Promise<string>} Title and body text of the chapter.
 * @throws {Error} If the page has no `#content` element, or if downloading
 *   the page fails.
 */
async function getSection({ href, innerText: title }) {
  console.log(`开始获取 ${title}`);
  const html = await getHtml(href);
  const content = html.querySelector('#content');
  if (!content) {
    throw new Error(`No #content element found in ${href}`);
  }
  // remove() works at any depth; content.removeChild() throws for a <script>
  // that is not a direct child of #content.
  for (const scriptTag of content.querySelectorAll('script')) {
    scriptTag.remove();
  }
  // The parsed page is detached from the document, so innerText degrades to
  // textContent and <br> would vanish. Turn line breaks into real newlines.
  for (const lineBreak of content.querySelectorAll('br')) {
    lineBreak.replaceWith('\r\n');
  }
  return title + '\r\n' + content.textContent + '\r\n';
}
因为小说有几百上千个章节,一个一个章节顺序下载速度太慢,也不能一下子全部同时下载,所以爬取时使用了async异步并发控制库(这个async和async function里面的async只是名字一样而已)。
并发数量设为6,设置得再大也没用,因为Chrome浏览器对同一域名的并发请求数量上限是6。
// Complete crawler snippet. Run it from a Chrome DevTools Snippet while the
// novel's index page is open: it downloads every chapter linked from the page
// and saves them as a single text file.
(async function () {
  // Example index page: https://www.biqudu.com/31_31729/

  /**
   * Load a third-party script into the page by appending a <script> element.
   * @param {string} url - Address of the script to load.
   * @returns {Promise<*>} Settles when the element's onload/onerror fires.
   */
  function loadLibrary(url) {
    return new Promise((resolve, reject) => {
      const script = document.createElement('script');
      script.onload = resolve;
      script.onerror = reject;
      script.src = url;
      document.body.appendChild(script);
    });
  }

  /**
   * Offer a string to the user as a downloadable plain-text file.
   * @param {string} string - Text content of the file.
   * @param {string} fileName - Suggested name for the downloaded file.
   */
  function saveFile(string, fileName) {
    const blob = new Blob([string], { type: 'text/plain' });
    const blobUrl = window.URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.download = fileName;
    link.href = blobUrl;
    link.click();
    // An object URL keeps its Blob alive until revoked; revoke after a delay
    // so the browser has time to start the download.
    setTimeout(() => window.URL.revokeObjectURL(blobUrl), 10000);
  }

  /**
   * Fetch a page and parse it into a detached <html> element.
   * @param {string} url - Address of the page to download.
   * @returns {Promise<HTMLElement>} Detached element holding the parsed markup.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function getHtml(url) {
    const response = await fetch(url);
    // fetch() only rejects on network failure, so check the status explicitly.
    if (!response.ok) {
      throw new Error(`Request for ${url} failed with HTTP status ${response.status}`);
    }
    const htmlText = await response.text();
    const html = document.createElement('html');
    html.innerHTML = htmlText;
    return html;
  }

  /**
   * Collect the chapter links (`#list dd a`) from the novel's index page.
   * @param {string} url - Address of the index page.
   * @param {number} [latestCount=12] - Leading links to skip; the snippet
   *   treats them as the "latest chapters" list at the top of the page.
   * @returns {Promise<Element[]>} Chapter anchors in page order.
   */
  async function getDirectory(url, latestCount = 12) {
    const page = await getHtml(url);
    const chapterLinks = Array.from(page.querySelectorAll('#list dd a'));
    return chapterLinks.slice(latestCount);
  }

  /**
   * Download one chapter and return "title CRLF body CRLF".
   * @param {{href: string, innerText: string}} section - Chapter anchor.
   * @returns {Promise<string>} Title and body text of the chapter.
   * @throws {Error} If the page has no `#content` element or cannot be fetched.
   */
  async function getSection({ href, innerText: title }) {
    console.log(`开始获取 ${title}`);
    const html = await getHtml(href);
    const content = html.querySelector('#content');
    if (!content) {
      throw new Error(`No #content element found in ${href}`);
    }
    // remove() works at any depth; content.removeChild() throws for a
    // <script> that is not a direct child of #content.
    for (const scriptTag of content.querySelectorAll('script')) {
      scriptTag.remove();
    }
    // The parsed page is detached, so innerText degrades to textContent and
    // <br> would vanish. Turn line breaks into real newlines first.
    for (const lineBreak of content.querySelectorAll('br')) {
      lineBreak.replaceWith('\r\n');
    }
    return title + '\r\n' + content.textContent + '\r\n';
  }

  /**
   * Download every chapter with at most 6 requests in flight, then save the
   * concatenated text as "<novel title>.txt".
   * @returns {Promise<void>} Resolves after the file download was triggered.
   */
  async function run() {
    const asyncLibUrl = 'https://cdn.bootcss.com/async/2.1.4/async.js';
    await loadLibrary(asyncLibUrl);
    const directory = await getDirectory(location.href);

    // Chapter texts are kept by index so the output order matches the page no
    // matter in which order the downloads finish. They are stored here rather
    // than on the anchors, because assigning `text` to an <a> element rewrites
    // the element's own text content.
    const texts = new Array(directory.length);

    const q = window.async.queue(async function ({ section, index }, taskDone) {
      try {
        texts[index] = await getSection(section);
      } catch (e) {
        console.error(e);
        // Keep going: a failed chapter is recorded in the output instead.
        texts[index] = "章节下载失败:" + e;
      } finally {
        taskDone();
      }
    }, 6); // concurrency of 6

    // Wait for the queue to drain so that run() settles only once every
    // chapter has been processed.
    await new Promise((resolve) => {
      q.drain = resolve;
      q.push(directory.map((section, index) => ({ section, index })));
    });

    const title = document.querySelector('#maininfo h1').innerText;
    console.log(`小说《${title}》下载完成`);
    saveFile(texts.join(''), `${title}.txt`);
  }

  await run();
}()).catch((error) => {
  // Surface start-up failures (library load, index page fetch) explicitly.
  console.error(error);
});
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.