2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| GitBluub | cfd4a8acec | sort by rating and slug and normal title | 2024-01-15 01:29:05 +01:00 |
| GitBluub | e63789cbc1 | fix: skip if no artist or song name | 2023-11-13 22:55:22 +01:00 |
4 changed files with 40 additions and 6 deletions

View File

@@ -11,10 +11,12 @@
"dependencies": { "dependencies": {
"crawlee": "^3.0.0", "crawlee": "^3.0.0",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"playwright": "^1.28.0" "playwright": "^1.28.0",
"slug": "^8.2.3"
}, },
"devDependencies": { "devDependencies": {
"@apify/tsconfig": "^0.1.0", "@apify/tsconfig": "^0.1.0",
"@types/slug": "^5.0.5",
"ts-node": "^10.8.0", "ts-node": "^10.8.0",
"typescript": "^4.7.4" "typescript": "^4.7.4"
} }
@@ -778,6 +780,12 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/slug": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
"dev": true
},
"node_modules/@types/tough-cookie": { "node_modules/@types/tough-cookie": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz", "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
@@ -2760,6 +2768,14 @@
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
}, },
"node_modules/slug": {
"version": "8.2.3",
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ==",
"bin": {
"slug": "cli.js"
}
},
"node_modules/source-map": { "node_modules/source-map": {
"version": "0.6.1", "version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
@@ -3848,6 +3864,12 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"@types/slug": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
"dev": true
},
"@types/tough-cookie": { "@types/tough-cookie": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz", "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
@@ -5233,6 +5255,11 @@
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
}, },
"slug": {
"version": "8.2.3",
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ=="
},
"source-map": { "source-map": {
"version": "0.6.1", "version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",

View File

@@ -6,10 +6,12 @@
"dependencies": { "dependencies": {
"crawlee": "^3.0.0", "crawlee": "^3.0.0",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"playwright": "^1.28.0" "playwright": "^1.28.0",
"slug": "^8.2.3"
}, },
"devDependencies": { "devDependencies": {
"@apify/tsconfig": "^0.1.0", "@apify/tsconfig": "^0.1.0",
"@types/slug": "^5.0.5",
"ts-node": "^10.8.0", "ts-node": "^10.8.0",
"typescript": "^4.7.4" "typescript": "^4.7.4"
}, },

View File

@@ -17,5 +17,5 @@ const crawler = new PlaywrightCrawler({
// Add first URL to the queue and start the crawl. // Add first URL to the queue and start the crawl.
await crawler.run([ await crawler.run([
"https://musescore.com/sheetmusic?complexity=1&instrument=2&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain", "https://musescore.com/sheetmusic?complexity=1&instrument=2&instrumentation=114&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain&sort=rating",
]); ]);

View File

@@ -2,6 +2,7 @@ import { Dataset, createPlaywrightRouter } from "crawlee";
import * as fs from "fs"; import * as fs from "fs";
import { sleep } from "crawlee"; import { sleep } from "crawlee";
export const router = createPlaywrightRouter(); export const router = createPlaywrightRouter();
import slug from "slug";
router.addDefaultHandler(async ({ enqueueLinks }) => { router.addDefaultHandler(async ({ enqueueLinks }) => {
const songs = await enqueueLinks({ const songs = await enqueueLinks({
@@ -18,13 +19,17 @@ router.addDefaultHandler(async ({ enqueueLinks }) => {
router.addHandler("SONG", async ({ request, page }) => { router.addHandler("SONG", async ({ request, page }) => {
await Dataset.pushData({ url: request.loadedUrl }); await Dataset.pushData({ url: request.loadedUrl });
await page.waitForSelector('aside div div section button[name="download"]'); await page.waitForSelector('aside div div section button[name="download"]');
const title = await page.locator("h1").textContent(); let og_title = await page.locator("h1").textContent();
const artist = await page if (og_title == null) return
let title = slug(og_title);
let artist = await page
.locator( .locator(
"body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a" "body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a"
) )
.first() .first()
.textContent(); .textContent();
if (artist == null) return
artist = slug(artist);
const genres = await page const genres = await page
.locator( .locator(
"body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a" "body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"
@@ -66,7 +71,7 @@ router.addHandler("SONG", async ({ request, page }) => {
`../musics/a/${title}/${title}.ini`, `../musics/a/${title}/${title}.ini`,
` `
[Metadata] [Metadata]
Name=${title} Name=${og_title}
Artist=${artist} Artist=${artist}
Genre=${genres} Genre=${genres}
Album= Album=