2 Commits

Author SHA1 Message Date
GitBluub
cfd4a8acec sort by rating and slug and normal title 2024-01-15 01:29:05 +01:00
GitBluub
e63789cbc1 fix: skip if no artist or song name 2023-11-13 22:55:22 +01:00
4 changed files with 40 additions and 6 deletions

View File

@@ -11,10 +11,12 @@
"dependencies": {
"crawlee": "^3.0.0",
"fs": "^0.0.1-security",
"playwright": "^1.28.0"
"playwright": "^1.28.0",
"slug": "^8.2.3"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/slug": "^5.0.5",
"ts-node": "^10.8.0",
"typescript": "^4.7.4"
}
@@ -778,6 +780,12 @@
"@types/node": "*"
}
},
"node_modules/@types/slug": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
"dev": true
},
"node_modules/@types/tough-cookie": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
@@ -2760,6 +2768,14 @@
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
},
"node_modules/slug": {
"version": "8.2.3",
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ==",
"bin": {
"slug": "cli.js"
}
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
@@ -3848,6 +3864,12 @@
"@types/node": "*"
}
},
"@types/slug": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
"dev": true
},
"@types/tough-cookie": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
@@ -5233,6 +5255,11 @@
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
},
"slug": {
"version": "8.2.3",
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ=="
},
"source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",

View File

@@ -6,10 +6,12 @@
"dependencies": {
"crawlee": "^3.0.0",
"fs": "^0.0.1-security",
"playwright": "^1.28.0"
"playwright": "^1.28.0",
"slug": "^8.2.3"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/slug": "^5.0.5",
"ts-node": "^10.8.0",
"typescript": "^4.7.4"
},

View File

@@ -17,5 +17,5 @@ const crawler = new PlaywrightCrawler({
// Add first URL to the queue and start the crawl.
await crawler.run([
"https://musescore.com/sheetmusic?complexity=1&instrument=2&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain",
"https://musescore.com/sheetmusic?complexity=1&instrument=2&instrumentation=114&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain&sort=rating",
]);

View File

@@ -2,6 +2,7 @@ import { Dataset, createPlaywrightRouter } from "crawlee";
import * as fs from "fs";
import { sleep } from "crawlee";
export const router = createPlaywrightRouter();
import slug from "slug";
router.addDefaultHandler(async ({ enqueueLinks }) => {
const songs = await enqueueLinks({
@@ -18,13 +19,17 @@ router.addDefaultHandler(async ({ enqueueLinks }) => {
router.addHandler("SONG", async ({ request, page }) => {
await Dataset.pushData({ url: request.loadedUrl });
await page.waitForSelector('aside div div section button[name="download"]');
const title = await page.locator("h1").textContent();
const artist = await page
let og_title = await page.locator("h1").textContent();
if (og_title == null) return
let title = slug(og_title);
let artist = await page
.locator(
"body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a"
)
.first()
.textContent();
if (artist == null) return
artist = slug(artist);
const genres = await page
.locator(
"body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"
@@ -66,7 +71,7 @@ router.addHandler("SONG", async ({ request, page }) => {
`../musics/a/${title}/${title}.ini`,
`
[Metadata]
Name=${title}
Name=${og_title}
Artist=${artist}
Genre=${genres}
Album=