From 7edc5b75ef14c528b3c31f99ee9ad07e068997df Mon Sep 17 00:00:00 2001 From: GitBluub Date: Sat, 25 Feb 2023 11:30:59 +0900 Subject: [PATCH] random fixes --- crawler/src/main.ts | 20 +++++----- crawler/src/routes.ts | 89 +++++++++++++++++++++++-------------------- 2 files changed, 58 insertions(+), 51 deletions(-) diff --git a/crawler/src/main.ts b/crawler/src/main.ts index f158f8e..5568933 100644 --- a/crawler/src/main.ts +++ b/crawler/src/main.ts @@ -3,16 +3,16 @@ import { router } from './routes.js'; // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. const crawler = new PlaywrightCrawler({ - launchContext: { - userDataDir: "/home/bluub/.config/chromium", - }, - maxConcurrency: 1, - requestHandler: router, - // This function is called if the page processing failed more than maxRequestRetries+1 times. - failedRequestHandler({ request, log }) { - log.info(`Request ${request.url} failed too many times.`); - }, -// headless: false, + launchContext: { + userDataDir: "/home/bluub/.config/chromium", + }, + maxConcurrency: 1, + requestHandler: router, + // This function is called if the page processing failed more than maxRequestRetries+1 times. + failedRequestHandler({ request, log }) { + log.info(`Request ${request.url} failed too many times.`); + }, + headless: true, }); // Add first URL to the queue and start the crawl. diff --git a/crawler/src/routes.ts b/crawler/src/routes.ts index 292c43d..644ceff 100644 --- a/crawler/src/routes.ts +++ b/crawler/src/routes.ts @@ -1,53 +1,57 @@ import { Dataset, createPlaywrightRouter } from 'crawlee'; import * as fs from 'fs'; +import { sleep } from 'crawlee'; export const router = createPlaywrightRouter(); router.addDefaultHandler(async ({ enqueueLinks }) => { - const songs = await enqueueLinks({ - selector: 'article a.xrntp', - label: 'SONG', - }); - // Find a link to the next page and enqueue it if it exists. - const lists = await enqueueLinks({ - selector: '.VECGt', - label: 'LIST', - }); + const songs = await enqueueLinks({ + selector: 'article a.xrntp', + label: 'SONG', + }); + // Find a link to the next page and enqueue it if it exists. + const lists = await enqueueLinks({ + selector: '.VECGt', + label: 'LIST', + }); }); router.addHandler('SONG', async ({ request, page }) => { - await Dataset.pushData({ url: request.loadedUrl }); - await page.waitForSelector('aside div div section button[name="download"]'); - const title = await page.locator('h1').textContent() - // const artist = 'a'; - const artist = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a').textContent() - //const genre = 'b'; - const genre = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a').textContent() - await page.locator('aside div div section button[name="download"]').click() - await page.waitForSelector('section.b_r17 button'); - const [ downloadMxl ] = await Promise.all([ - // Start waiting for the download - page.waitForEvent('download'), - // Perform the action that initiates download - page.locator('section.b_r17 section section div:nth-child(3) button').click(), - ]); - // Save downloaded file somewhere - await downloadMxl.saveAs(`../musics/a/${title}/${title}.mxl`); + await Dataset.pushData({ url: request.loadedUrl }); + await page.waitForSelector('aside div div section button[name="download"]'); + const title = await page.locator('h1').textContent() + // const artist = 'a'; + const artist = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a').first().textContent() + //const genre = 'b'; + const genre = await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a').first().textContent() + console.log("new song", title, artist, genre) + await page.locator('aside div div section button[name="download"]').click() + await page.waitForSelector('section.b_r17 button'); + console.log("downloading Mxl") + const [downloadMxl] = await Promise.all([ + // Start waiting for the download + page.waitForEvent('download'), + // Perform the action that initiates download + page.locator('section.b_r17 section section div:nth-child(3) button').click(), + ]); + // Save downloaded file somewhere + await downloadMxl.saveAs(`../musics/a/${title}/${title}.mxl`); - await page.locator('body > article > section > button').click(); + await page.locator('body > article > section > button').click(); - await page.waitForTimeout(15000); - await page.locator('aside div div section button[name="download"]').click() - await page.waitForSelector('section.b_r17 button'); - const [ downloadMidi ] = await Promise.all([ - // Start waiting for the download - page.waitForEvent('download'), - // Perform the action that initiates download - page.locator('section.b_r17 section section div:nth-child(4) button').click(), - ]); - // Save downloaded file somewhere - await downloadMidi.saveAs(`../musics/a/${title}/${title}.midi`); + await page.waitForTimeout(1000); + await page.locator('aside div div section button[name="download"]').click() + await page.waitForSelector('section.b_r17 button'); + console.log("downloading Midi") + const [downloadMidi] = await Promise.all([ + // Start waiting for the download + page.waitForEvent('download'), + // Perform the action that initiates download + page.locator('section.b_r17 section section div:nth-child(4) button').click(), + ]); + // Save downloaded file somewhere + await downloadMidi.saveAs(`../musics/a/${title}/${title}.midi`); - fs.writeFile(`../musics/a/${title}/${title}.ini`, ` + fs.writeFile(`../musics/a/${title}/${title}.ini`, ` [Metadata] Name=${title} Artist=${artist} @@ -68,6 +72,9 @@ ChordTiming=0 Length=0 PedalPoint=0 Precision=0 -`, () => {}) - await page.waitForTimeout(15000); +`, () => { }) + console.log("done downloading") + + console.log("sleeping for 10k seconds") + await sleep(10_000_000); });