我应该在哪里实现带分页的for循环?
我正在尝试用Puppeteer和Node.js抓取https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1
为此,我首先使用函数scrapeJobsInIndexPage(url)
获取每个作业的url,然后运行函数scrapeDescriptionPage(url, page)
,该函数迭代每个作业的url并抓取作业描述。
问题:代码获取每个作业的url并成功分页,但是它不执行scrapeDescriptionPage(url, page)
函数,并且我无法获取作业描述。
如何打开每个作业URL并获取每个作业的作业说明?
此部分代码起作用-它获取每个作业的url。
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
async function scrapeJobsInIndexPage(url) {
try {
const [page] = await browser.pages();
await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
waitUntil: "networkidle0",
});
await page.click("#close-cookies", {
delay: 200,
});
await page.type("[name='email_1']", "natan.chapman@gmail.com", {
delay: 200,
});
await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });
await page.click("#formContainer > form > div:nth-child(5) > input", {
delay: 200,
});
await page.waitForNavigation();
await page.goto(url, { waitUntil: "networkidle0" });
const totalPagesSelector = ".pag_txt_tot";
const currentPageSelector = ".pag_txt_current";
await page.waitForSelector(totalPagesSelector);
const totalPages = await page.$eval(totalPagesSelector, (el) =>
Number(el.innerText)
);
for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
await page.waitForFunction(
(sel, page) => document.querySelector(sel)?.innerText === String(page),
{},
currentPageSelector,
currentPage
);
const html = await page.evaluate(() => document.body.innerHTML);
const $ = await cheerio.load(html);
const jobs = $(".tsize a:even")
.map(
(i, element) =>
"https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
)
.get();
console.log(jobs);
const data = await page.evaluate(() => {
const firstDataCell =
document.querySelector("#news_tbl tr td")?.innerText;
return firstDataCell;
});
console.log(`${currentPage}: ${data}`);
await page.evaluate(() => {
document
.querySelector("span.current")
.nextElementSibling?.querySelector("a")
.click();
});
}
} catch (err) {
console.error(err);
}
}
此部件应在URL打开后获取作业详细信息,但是,我不知道如何将其连接到以前的函数。
async function scrapeDescriptionPage(url, page) {
let jobText;
try {
jobText = $("#formContainer").text();
const companyImage = await page.$eval(
".cs-media img",
(img) => img.src
);
const applyLinkRedirect = $(".ltp-btn").attr("href");
const jobDescription = $(
"#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
).html();
await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
const applyLink = await page.url();
let ventureLoopResult = new testVentureLoopDB({
url,
applyLink,
jobDescription,
companyImage,
});
ventureLoopResults.push(ventureLoopResult);
console.log(ventureLoopResults);
ventureLoopResult.save();
return ventureLoopResults;
} catch (err) {
console.log(err);
}
}
let browser;
这是将前两个函数连接在一起的最后一个函数(但scrapeDescriptionPage
不起作用)
async function main() {
browser = await puppeteer.launch({ headless: false });
const descriptionPage = await browser.newPage();
const jobs = await scrapeJobsInIndexPage(
"https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
);
for (var i = 1; i < jobs.length; i++) {
const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
console.log(result);
}
}
main();
解决方案
我不知道cheerio
,所以这些只是一些猜测。
- 要收集所有作业URL,需要在循环外声明
jobs
,循环后返回:
async function scrapeJobsInIndexPage(url) {
try {
//...
const jobs = [];
for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
// ...
const currentJobs = $(".tsize a:even")
.map(
(i, element) =>
"https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
)
.get();
console.log(currentJobs);
jobs.push(...currentJobs);
// ...
}
return jobs;
} catch (err) {
console.error(err);
}
}
- 那么,
scrapeDescriptionPage
似乎是从不同的上下文中采用的函数。但是,如果您需要对每个作业页面使用cheerio
,则需要添加以前已经使用过的内容:
async function scrapeDescriptionPage(url, page) {
await page.goto(url, { waitUntil: "networkidle0" });
const html = await page.evaluate(() => document.body.innerHTML);
const $ = await cheerio.load(html);
let jobText;
// ...
相关文章