crawler piano only

This commit is contained in:
GitBluub
2023-05-16 00:44:58 +09:00
committed by Clément Le Bihan
parent 3335516f0e
commit 9b12c76978
2 changed files with 49 additions and 32 deletions

View File

@@ -1,5 +1,5 @@
import { PlaywrightCrawler, Dataset } from 'crawlee'; import { PlaywrightCrawler, Dataset } from "crawlee";
import { router } from './routes.js'; import { router } from "./routes.js";
// PlaywrightCrawler crawls the web using a headless // PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library. // browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({ const crawler = new PlaywrightCrawler({
@@ -16,4 +16,6 @@ const crawler = new PlaywrightCrawler({
}); });
// Add first URL to the queue and start the crawl. // Add first URL to the queue and start the crawl.
await crawler.run(['https://musescore.com/sheetmusic?license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain']); await crawler.run([
"https://musescore.com/sheetmusic?complexity=1&instrument=2&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain",
]);

View File

@@ -1,57 +1,70 @@
import { Dataset, createPlaywrightRouter } from 'crawlee'; import { Dataset, createPlaywrightRouter } from "crawlee";
import * as fs from 'fs'; import * as fs from "fs";
import { sleep } from 'crawlee'; import { sleep } from "crawlee";
export const router = createPlaywrightRouter(); export const router = createPlaywrightRouter();
router.addDefaultHandler(async ({ enqueueLinks }) => { router.addDefaultHandler(async ({ enqueueLinks }) => {
const songs = await enqueueLinks({ const songs = await enqueueLinks({
selector: 'article a.xrntp', selector: "article a.xrntp",
label: 'SONG', label: "SONG",
}); });
// Find a link to the next page and enqueue it if it exists. // Find a link to the next page and enqueue it if it exists.
const lists = await enqueueLinks({ const lists = await enqueueLinks({
selector: '.VECGt', selector: ".VECGt",
label: 'LIST', label: "LIST",
}); });
}); });
router.addHandler('SONG', async ({ request, page }) => { router.addHandler("SONG", async ({ request, page }) => {
await Dataset.pushData({ url: request.loadedUrl }); await Dataset.pushData({ url: request.loadedUrl });
await page.waitForSelector('aside div div section button[name="download"]'); await page.waitForSelector('aside div div section button[name="download"]');
const title = await page.locator('h1').textContent() const title = await page.locator("h1").textContent();
// const artist = 'a'; const artist = await page
const artist = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a').first().textContent() .locator(
//const genre = 'b'; "body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a"
const genres = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a').allTextContents() )
console.log("new song", title, artist, genres) .first()
await page.locator('aside div div section button[name="download"]').click() .textContent();
await page.waitForSelector('section.b_r17 button'); const genres = await page
console.log("downloading Mxl") .locator(
"body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"
)
.allTextContents();
console.log("new song", title, artist, genres);
await page.locator('aside div div section button[name="download"]').click();
await page.waitForSelector("section.b_r17 button");
console.log("downloading Mxl");
const [downloadMxl] = await Promise.all([ const [downloadMxl] = await Promise.all([
// Start waiting for the download // Start waiting for the download
page.waitForEvent('download'), page.waitForEvent("download"),
// Perform the action that initiates download // Perform the action that initiates download
page.locator('section.b_r17 section section div:nth-child(3) button').click(), page
.locator("section.b_r17 section section div:nth-child(3) button")
.click(),
]); ]);
// Save downloaded file somewhere // Save downloaded file somewhere
await downloadMxl.saveAs(`../musics/a/${title}/${title}.mxl`); await downloadMxl.saveAs(`../musics/a/${title}/${title}.mxl`);
await page.locator('body > article > section > button').click(); await page.locator("body > article > section > button").click();
await page.waitForTimeout(1000); await page.waitForTimeout(1000);
await page.locator('aside div div section button[name="download"]').click() await page.locator('aside div div section button[name="download"]').click();
await page.waitForSelector('section.b_r17 button'); await page.waitForSelector("section.b_r17 button");
console.log("downloading Midi") console.log("downloading Midi");
const [downloadMidi] = await Promise.all([ const [downloadMidi] = await Promise.all([
// Start waiting for the download // Start waiting for the download
page.waitForEvent('download'), page.waitForEvent("download"),
// Perform the action that initiates download // Perform the action that initiates download
page.locator('section.b_r17 section section div:nth-child(4) button').click(), page
.locator("section.b_r17 section section div:nth-child(4) button")
.click(),
]); ]);
// Save downloaded file somewhere // Save downloaded file somewhere
await downloadMidi.saveAs(`../musics/a/${title}/${title}.midi`); await downloadMidi.saveAs(`../musics/a/${title}/${title}.midi`);
fs.writeFile(`../musics/a/${title}/${title}.ini`, ` fs.writeFile(
`../musics/a/${title}/${title}.ini`,
`
[Metadata] [Metadata]
Name=${title} Name=${title}
Artist=${artist} Artist=${artist}
@@ -72,9 +85,11 @@ ChordTiming=0
Length=0 Length=0
PedalPoint=0 PedalPoint=0
Precision=0 Precision=0
`, () => { }) `,
console.log("done downloading") () => { }
);
console.log("done downloading");
console.log("sleeping for 10k seconds") //console.log("sleeping for 10k seconds")
//await sleep(10_000_000); //await sleep(10_000_000);
}); });