Compare commits
2 Commits
main
...
feat/crawl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cfd4a8acec | ||
|
|
e63789cbc1 |
29
crawler/package-lock.json
generated
29
crawler/package-lock.json
generated
@@ -11,10 +11,12 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"crawlee": "^3.0.0",
|
"crawlee": "^3.0.0",
|
||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
"playwright": "^1.28.0"
|
"playwright": "^1.28.0",
|
||||||
|
"slug": "^8.2.3"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@apify/tsconfig": "^0.1.0",
|
"@apify/tsconfig": "^0.1.0",
|
||||||
|
"@types/slug": "^5.0.5",
|
||||||
"ts-node": "^10.8.0",
|
"ts-node": "^10.8.0",
|
||||||
"typescript": "^4.7.4"
|
"typescript": "^4.7.4"
|
||||||
}
|
}
|
||||||
@@ -778,6 +780,12 @@
|
|||||||
"@types/node": "*"
|
"@types/node": "*"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@types/slug": {
|
||||||
|
"version": "5.0.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
|
||||||
|
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"node_modules/@types/tough-cookie": {
|
"node_modules/@types/tough-cookie": {
|
||||||
"version": "4.0.2",
|
"version": "4.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
|
||||||
@@ -2760,6 +2768,14 @@
|
|||||||
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
|
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
|
||||||
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
|
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
|
||||||
},
|
},
|
||||||
|
"node_modules/slug": {
|
||||||
|
"version": "8.2.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
|
||||||
|
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ==",
|
||||||
|
"bin": {
|
||||||
|
"slug": "cli.js"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/source-map": {
|
"node_modules/source-map": {
|
||||||
"version": "0.6.1",
|
"version": "0.6.1",
|
||||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||||
@@ -3848,6 +3864,12 @@
|
|||||||
"@types/node": "*"
|
"@types/node": "*"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"@types/slug": {
|
||||||
|
"version": "5.0.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz",
|
||||||
|
"integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==",
|
||||||
|
"dev": true
|
||||||
|
},
|
||||||
"@types/tough-cookie": {
|
"@types/tough-cookie": {
|
||||||
"version": "4.0.2",
|
"version": "4.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz",
|
||||||
@@ -5233,6 +5255,11 @@
|
|||||||
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
|
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
|
||||||
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
|
"integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ=="
|
||||||
},
|
},
|
||||||
|
"slug": {
|
||||||
|
"version": "8.2.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz",
|
||||||
|
"integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ=="
|
||||||
|
},
|
||||||
"source-map": {
|
"source-map": {
|
||||||
"version": "0.6.1",
|
"version": "0.6.1",
|
||||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||||
|
|||||||
@@ -6,10 +6,12 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"crawlee": "^3.0.0",
|
"crawlee": "^3.0.0",
|
||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
"playwright": "^1.28.0"
|
"playwright": "^1.28.0",
|
||||||
|
"slug": "^8.2.3"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@apify/tsconfig": "^0.1.0",
|
"@apify/tsconfig": "^0.1.0",
|
||||||
|
"@types/slug": "^5.0.5",
|
||||||
"ts-node": "^10.8.0",
|
"ts-node": "^10.8.0",
|
||||||
"typescript": "^4.7.4"
|
"typescript": "^4.7.4"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -17,5 +17,5 @@ const crawler = new PlaywrightCrawler({
|
|||||||
|
|
||||||
// Add first URL to the queue and start the crawl.
|
// Add first URL to the queue and start the crawl.
|
||||||
await crawler.run([
|
await crawler.run([
|
||||||
"https://musescore.com/sheetmusic?complexity=1&instrument=2&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain",
|
"https://musescore.com/sheetmusic?complexity=1&instrument=2&instrumentation=114&license=to_modify_commercially%2Cto_use_commercially&recording_type=public-domain&sort=rating",
|
||||||
]);
|
]);
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import { Dataset, createPlaywrightRouter } from "crawlee";
|
|||||||
import * as fs from "fs";
|
import * as fs from "fs";
|
||||||
import { sleep } from "crawlee";
|
import { sleep } from "crawlee";
|
||||||
export const router = createPlaywrightRouter();
|
export const router = createPlaywrightRouter();
|
||||||
|
import slug from "slug";
|
||||||
|
|
||||||
router.addDefaultHandler(async ({ enqueueLinks }) => {
|
router.addDefaultHandler(async ({ enqueueLinks }) => {
|
||||||
const songs = await enqueueLinks({
|
const songs = await enqueueLinks({
|
||||||
@@ -18,13 +19,17 @@ router.addDefaultHandler(async ({ enqueueLinks }) => {
|
|||||||
router.addHandler("SONG", async ({ request, page }) => {
|
router.addHandler("SONG", async ({ request, page }) => {
|
||||||
await Dataset.pushData({ url: request.loadedUrl });
|
await Dataset.pushData({ url: request.loadedUrl });
|
||||||
await page.waitForSelector('aside div div section button[name="download"]');
|
await page.waitForSelector('aside div div section button[name="download"]');
|
||||||
const title = await page.locator("h1").textContent();
|
let og_title = await page.locator("h1").textContent();
|
||||||
const artist = await page
|
if (og_title == null) return
|
||||||
|
let title = slug(og_title);
|
||||||
|
let artist = await page
|
||||||
.locator(
|
.locator(
|
||||||
"body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a"
|
"body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a"
|
||||||
)
|
)
|
||||||
.first()
|
.first()
|
||||||
.textContent();
|
.textContent();
|
||||||
|
if (artist == null) return
|
||||||
|
artist = slug(artist);
|
||||||
const genres = await page
|
const genres = await page
|
||||||
.locator(
|
.locator(
|
||||||
"body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"
|
"body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"
|
||||||
@@ -66,7 +71,7 @@ router.addHandler("SONG", async ({ request, page }) => {
|
|||||||
`../musics/a/${title}/${title}.ini`,
|
`../musics/a/${title}/${title}.ini`,
|
||||||
`
|
`
|
||||||
[Metadata]
|
[Metadata]
|
||||||
Name=${title}
|
Name=${og_title}
|
||||||
Artist=${artist}
|
Artist=${artist}
|
||||||
Genre=${genres}
|
Genre=${genres}
|
||||||
Album=
|
Album=
|
||||||
|
|||||||
Reference in New Issue
Block a user