22 lines
837 B
TypeScript
22 lines
837 B
TypeScript
import { PlaywrightCrawler, Dataset } from "crawlee";
|
|
import { router } from "./routes.js";
|
|
// PlaywrightCrawler crawls the web using a headless
|
|
// browser controlled by the Playwright library.
|
|
const crawler = new PlaywrightCrawler({
|
|
launchContext: {
|
|
userDataDir: "/home/bluub/.config/chromium",
|
|
},
|
|
maxConcurrency: 1,
|
|
requestHandler: router,
|
|
// This function is called if the page processing failed more than maxRequestRetries+1 times.
|
|
failedRequestHandler({ request, log }) {
|
|
log.info(`Request ${request.url} failed too many times.`);
|
|
},
|
|
headless: true,
|
|
});
|
|
|
|
// Add first URL to the queue and start the crawl.
|
|
await crawler.run([
|
|
"https://musescore.com/sheetmusic?complexity=1&instrument=2&instrumentation=114&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain&sort=rating",
|
|
]);
|