Music Crawler (#117)
This commit is contained in:
10
crawler/.dockerignore
Normal file
10
crawler/.dockerignore
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# configurations
|
||||||
|
.idea
|
||||||
|
|
||||||
|
# crawlee and apify storage folders
|
||||||
|
apify_storage
|
||||||
|
crawlee_storage
|
||||||
|
storage
|
||||||
|
|
||||||
|
# installed files
|
||||||
|
node_modules
|
||||||
8
crawler/.gitignore
vendored
Normal file
8
crawler/.gitignore
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# This file tells Git which files shouldn't be added to source control
|
||||||
|
|
||||||
|
.idea
|
||||||
|
dist
|
||||||
|
node_modules
|
||||||
|
apify_storage
|
||||||
|
crawlee_storage
|
||||||
|
storage
|
||||||
50
crawler/Dockerfile
Normal file
50
crawler/Dockerfile
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# Specify the base Docker image. You can read more about
|
||||||
|
# the available images at https://crawlee.dev/docs/guides/docker-images
|
||||||
|
# You can also use any other image from Docker Hub.
|
||||||
|
FROM apify/actor-node:16 AS builder
|
||||||
|
|
||||||
|
# Copy just package.json and package-lock.json
|
||||||
|
# to speed up the build using Docker layer cache.
|
||||||
|
COPY package*.json ./
|
||||||
|
|
||||||
|
# Install all dependencies. Don't audit to speed up the installation.
|
||||||
|
RUN npm install --include=dev --audit=false
|
||||||
|
|
||||||
|
# Next, copy the source files using the user set
|
||||||
|
# in the base image.
|
||||||
|
COPY . ./
|
||||||
|
|
||||||
|
# Build the project: compile the TypeScript sources
|
||||||
|
# into dist/ (dependencies were already installed above).
|
||||||
|
RUN npm run build
|
||||||
|
|
||||||
|
# Create final image
|
||||||
|
FROM apify/actor-node:16
|
||||||
|
|
||||||
|
# Copy only built JS files from builder image
|
||||||
|
COPY --from=builder /usr/src/app/dist ./dist
|
||||||
|
|
||||||
|
# Copy just package.json and package-lock.json
|
||||||
|
# to speed up the build using Docker layer cache.
|
||||||
|
COPY package*.json ./
|
||||||
|
|
||||||
|
# Install NPM packages, skip optional and development dependencies to
|
||||||
|
# keep the image small. Avoid logging too much and print the dependency
|
||||||
|
# tree for debugging
|
||||||
|
RUN npm --quiet set progress=false \
|
||||||
|
&& npm install --omit=dev --omit=optional \
|
||||||
|
&& echo "Installed NPM packages:" \
|
||||||
|
&& (npm list --omit=dev --all || true) \
|
||||||
|
&& echo "Node.js version:" \
|
||||||
|
&& node --version \
|
||||||
|
&& echo "NPM version:" \
|
||||||
|
&& npm --version
|
||||||
|
|
||||||
|
# Next, copy the remaining files and directories with the source code.
|
||||||
|
# Since we do this after NPM install, quick build will be really fast
|
||||||
|
# for most source file changes.
|
||||||
|
COPY . ./
|
||||||
|
|
||||||
|
|
||||||
|
# Run the image.
|
||||||
|
CMD npm run start:prod --silent
|
||||||
7
crawler/README.md
Normal file
7
crawler/README.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Crawler
|
||||||
|
|
||||||
|
To run the crawler, first log in to MuseScore in your browser, then run:
|
||||||
|
```npm i```
|
||||||
|
```npm run start```
|
||||||
|
|
||||||
|
The crawler saves everything it downloads to `../musics/a/`.
|
||||||
5613
crawler/package-lock.json
generated
Normal file
5613
crawler/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
25
crawler/package.json
Normal file
25
crawler/package.json
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"name": "music-crawler",
|
||||||
|
"version": "0.0.1",
|
||||||
|
"type": "module",
|
||||||
|
"description": "This is an example of a Crawlee project.",
|
||||||
|
"dependencies": {
|
||||||
|
"crawlee": "^3.0.0",
|
||||||
|
"fs": "^0.0.1-security",
|
||||||
|
"playwright": "^1.28.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@apify/tsconfig": "^0.1.0",
|
||||||
|
"ts-node": "^10.8.0",
|
||||||
|
"typescript": "^4.7.4"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"start": "npm run start:dev",
|
||||||
|
"start:prod": "node dist/main.js",
|
||||||
|
"start:dev": "ts-node-esm -T src/main.ts",
|
||||||
|
"build": "tsc",
|
||||||
|
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "It's not you it's me",
|
||||||
|
"license": "ISC"
|
||||||
|
}
|
||||||
19
crawler/src/main.ts
Normal file
19
crawler/src/main.ts
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import { PlaywrightCrawler, Dataset } from 'crawlee';
|
||||||
|
import { router } from './routes.js';
|
||||||
|
// Browser profile directory handed to Playwright as `userDataDir`.
// NOTE(review): presumably this must be the profile that holds the logged-in
// MuseScore session mentioned in the README — confirm.
// Set CHROMIUM_USER_DATA_DIR to point at your own profile; when it is unset the
// original hard-coded path is used so existing setups keep working.
const userDataDir = process.env.CHROMIUM_USER_DATA_DIR ?? '/home/bluub/.config/chromium';

// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({
    launchContext: {
        userDataDir,
    },
    // Process one page at a time.
    maxConcurrency: 1,
    requestHandler: router,
    // This function is called if the page processing failed more than maxRequestRetries+1 times.
    failedRequestHandler({ request, log }) {
        log.info(`Request ${request.url} failed too many times.`);
    },
    // Add `headless: false` here to watch the browser while debugging.
});

// Add first URL to the queue and start the crawl.
await crawler.run(['https://musescore.com/sheetmusic?license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain']);
|
||||||
73
crawler/src/routes.ts
Normal file
73
crawler/src/routes.ts
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import { Dataset, createPlaywrightRouter } from 'crawlee';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
export const router = createPlaywrightRouter();
|
||||||
|
|
||||||
|
// Default route: runs for the start URL and for any request without a
// dedicated handler. It only enqueues further requests.
router.addDefaultHandler(async ({ enqueueLinks }) => {
    // Links matching this selector are queued for the 'SONG' handler.
    await enqueueLinks({
        selector: 'article a.xrntp',
        label: 'SONG',
    });

    // Find a link to the next page and enqueue it if it exists.
    // NOTE(review): no 'LIST' handler is registered in this file; presumably
    // these requests fall back to this default handler — confirm against the
    // Crawlee router documentation.
    await enqueueLinks({
        selector: '.VECGt',
        label: 'LIST',
    });
});
|
||||||
|
|
||||||
|
/** Selector of the sidebar button that opens the download dialog. */
const DOWNLOAD_BUTTON_SELECTOR = 'aside div div section button[name="download"]';

/** Selector used to detect that the download dialog has rendered. */
const DOWNLOAD_DIALOG_SELECTOR = 'section.b_r17 button';

/**
 * Fixed wait (in milliseconds) applied after each download.
 * NOTE(review): presumably a throttle to avoid hammering the site — confirm.
 */
const PAUSE_AFTER_DOWNLOAD_MS = 15000;

/**
 * Turns a page title into a name that is safe to use as a single path segment.
 *
 * Path separators, characters reserved on common file systems and control
 * characters are replaced by `_`. Names that end up empty or consist only of
 * dots (such as `.` or `..`) fall back to `untitled`.
 *
 * @param rawTitle Title text read from the page, possibly `null`.
 * @returns A non-empty name without path separators.
 */
function toSafeFileName(rawTitle: string | null): string {
    const cleaned = (rawTitle ?? '')
        .replace(/[\\/:*?"<>|\u0000-\u001f]/g, '_')
        .trim();
    return /^\.*$/.test(cleaned) ? 'untitled' : cleaned;
}

/**
 * Builds the content of the per-song `.ini` metadata file.
 * All difficulty values are written as 0.
 *
 * @param title Song title as shown on the page.
 * @param artist Artist name as shown on the page.
 * @param genre Genre as shown on the page.
 * @returns The complete file content.
 */
function buildMetadataIni(title: string, artist: string, genre: string): string {
    return `
[Metadata]
Name=${title}
Artist=${artist}
Genre=${genre}
Album=

[Difficulties]
TwoHands=0
Rhythm=0
NoteCombo=0
Arpeggio=0
Distance=0
LeftHand=0
RightHand=0
LeadHandChange=0
ChordComplexity=0
ChordTiming=0
Length=0
PedalPoint=0
Precision=0
`;
}

// 'SONG' route: records the page URL in the dataset, downloads two files from
// the download dialog (saved as .mxl and .midi) and writes an .ini metadata
// file next to them under ../musics/a/<title>/.
router.addHandler('SONG', async ({ request, page }) => {
    await Dataset.pushData({ url: request.loadedUrl });
    await page.waitForSelector(DOWNLOAD_BUTTON_SELECTOR);

    // textContent() may resolve to null; fall back to an empty string so the
    // literal text "null" never ends up in a path or in the metadata file.
    const title = ((await page.locator('h1').textContent()) ?? '').trim();
    const artist = ((await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a').textContent()) ?? '').trim();
    const genre = ((await page.locator('body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a').textContent()) ?? '').trim();

    // The title comes from the page, so sanitise it before using it in a path.
    const baseName = toSafeFileName(title);
    const songDir = `../musics/a/${baseName}`;
    await fs.promises.mkdir(songDir, { recursive: true });

    // Opens the download dialog, clicks the button in the given dialog row and
    // saves the resulting download to `destination`.
    const downloadOption = async (optionIndex: number, destination: string): Promise<void> => {
        await page.locator(DOWNLOAD_BUTTON_SELECTOR).click();
        await page.waitForSelector(DOWNLOAD_DIALOG_SELECTOR);
        const [download] = await Promise.all([
            // Start waiting for the download
            page.waitForEvent('download'),
            // Perform the action that initiates download
            page.locator(`section.b_r17 section section div:nth-child(${optionIndex}) button`).click(),
        ]);
        await download.saveAs(destination);
    };

    // Dialog row 3 is saved as the .mxl file.
    await downloadOption(3, `${songDir}/${baseName}.mxl`);

    // NOTE(review): presumably this closes the dialog left open after the
    // first download — confirm against the live page.
    await page.locator('body > article > section > button').click();
    await page.waitForTimeout(PAUSE_AFTER_DOWNLOAD_MS);

    // Dialog row 4 is saved as the .midi file.
    await downloadOption(4, `${songDir}/${baseName}.midi`);

    // Await the write so a failure fails the request instead of being dropped.
    await fs.promises.writeFile(`${songDir}/${baseName}.ini`, buildMetadataIni(title, artist, genre));

    await page.waitForTimeout(PAUSE_AFTER_DOWNLOAD_MS);
});
|
||||||
13
crawler/tsconfig.json
Normal file
13
crawler/tsconfig.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"extends": "@apify/tsconfig",
|
||||||
|
"compilerOptions": {
|
||||||
|
"module": "ES2022",
|
||||||
|
"target": "ES2022",
|
||||||
|
"outDir": "dist",
|
||||||
|
"noUnusedLocals": false,
|
||||||
|
"lib": ["DOM"]
|
||||||
|
},
|
||||||
|
"include": [
|
||||||
|
"./src/**/*"
|
||||||
|
]
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user