diff --git a/crawler/src/main.ts b/crawler/src/main.ts index 5568933..67120ca 100644 --- a/crawler/src/main.ts +++ b/crawler/src/main.ts @@ -1,5 +1,5 @@ -import { PlaywrightCrawler, Dataset } from 'crawlee'; -import { router } from './routes.js'; +import { PlaywrightCrawler, Dataset } from "crawlee"; +import { router } from "./routes.js"; // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. const crawler = new PlaywrightCrawler({ @@ -16,4 +16,6 @@ const crawler = new PlaywrightCrawler({ }); // Add first URL to the queue and start the crawl. -await crawler.run(['https://musescore.com/sheetmusic?license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain']); +await crawler.run([ + "https://musescore.com/sheetmusic?complexity=1&instrument=2&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain", +]); diff --git a/crawler/src/routes.ts b/crawler/src/routes.ts index c173c8c..0623209 100644 --- a/crawler/src/routes.ts +++ b/crawler/src/routes.ts @@ -1,57 +1,70 @@ -import { Dataset, createPlaywrightRouter } from 'crawlee'; -import * as fs from 'fs'; -import { sleep } from 'crawlee'; +import { Dataset, createPlaywrightRouter } from "crawlee"; +import * as fs from "fs"; +import { sleep } from "crawlee"; export const router = createPlaywrightRouter(); router.addDefaultHandler(async ({ enqueueLinks }) => { const songs = await enqueueLinks({ - selector: 'article a.xrntp', - label: 'SONG', + selector: "article a.xrntp", + label: "SONG", }); // Find a link to the next page and enqueue it if it exists. const lists = await enqueueLinks({ - selector: '.VECGt', - label: 'LIST', + selector: ".VECGt", + label: "LIST", }); }); -router.addHandler('SONG', async ({ request, page }) => { +router.addHandler("SONG", async ({ request, page }) => { await Dataset.pushData({ url: request.loadedUrl }); await page.waitForSelector('aside div div section button[name="download"]'); - const title = await page.locator('h1').textContent() - // const artist = 'a'; - const artist = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a').first().textContent() - //const genre = 'b'; - const genres = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a').allTextContents() - console.log("new song", title, artist, genres) - await page.locator('aside div div section button[name="download"]').click() - await page.waitForSelector('section.b_r17 button'); - console.log("downloading Mxl") + const title = await page.locator("h1").textContent(); + const artist = await page + .locator( + "body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a" + ) + .first() + .textContent(); + const genres = await page + .locator( + "body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a" + ) + .allTextContents(); + console.log("new song", title, artist, genres); + await page.locator('aside div div section button[name="download"]').click(); + await page.waitForSelector("section.b_r17 button"); + console.log("downloading Mxl"); const [downloadMxl] = await Promise.all([ // Start waiting for the download - page.waitForEvent('download'), + page.waitForEvent("download"), // Perform the action that initiates download - page.locator('section.b_r17 section section div:nth-child(3) button').click(), + page + .locator("section.b_r17 section section div:nth-child(3) button") + .click(), ]); // Save downloaded file somewhere await downloadMxl.saveAs(`../musics/a/${title}/${title}.mxl`); - await page.locator('body > article > section > button').click(); + await page.locator("body > article > section > button").click(); await page.waitForTimeout(1000); - await page.locator('aside div div section button[name="download"]').click() - await page.waitForSelector('section.b_r17 button'); - console.log("downloading Midi") + await page.locator('aside div div section button[name="download"]').click(); + await page.waitForSelector("section.b_r17 button"); + console.log("downloading Midi"); const [downloadMidi] = await Promise.all([ // Start waiting for the download - page.waitForEvent('download'), + page.waitForEvent("download"), // Perform the action that initiates download - page.locator('section.b_r17 section section div:nth-child(4) button').click(), + page + .locator("section.b_r17 section section div:nth-child(4) button") + .click(), ]); // Save downloaded file somewhere await downloadMidi.saveAs(`../musics/a/${title}/${title}.midi`); - fs.writeFile(`../musics/a/${title}/${title}.ini`, ` + fs.writeFile( + `../musics/a/${title}/${title}.ini`, + ` [Metadata] Name=${title} Artist=${artist} @@ -72,9 +85,11 @@ ChordTiming=0 Length=0 PedalPoint=0 Precision=0 -`, () => { }) - console.log("done downloading") +`, + () => { } + ); + console.log("done downloading"); - console.log("sleeping for 10k seconds") + //console.log("sleeping for 10k seconds") //await sleep(10_000_000); });