From e63789cbc1595f307da1c1beb2cd51c2ced6cccf Mon Sep 17 00:00:00 2001 From: GitBluub Date: Mon, 13 Nov 2023 22:55:22 +0100 Subject: [PATCH] fix: skip if no artist or song name --- crawler/package-lock.json | 29 ++++++++++++++++++++++++++++- crawler/package.json | 4 +++- crawler/src/routes.ts | 9 +++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/crawler/package-lock.json b/crawler/package-lock.json index 73e9797..d610f1b 100644 --- a/crawler/package-lock.json +++ b/crawler/package-lock.json @@ -11,10 +11,12 @@ "dependencies": { "crawlee": "^3.0.0", "fs": "^0.0.1-security", - "playwright": "^1.28.0" + "playwright": "^1.28.0", + "slug": "^8.2.3" }, "devDependencies": { "@apify/tsconfig": "^0.1.0", + "@types/slug": "^5.0.5", "ts-node": "^10.8.0", "typescript": "^4.7.4" } @@ -778,6 +780,12 @@ "@types/node": "*" } }, + "node_modules/@types/slug": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz", + "integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==", + "dev": true + }, "node_modules/@types/tough-cookie": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz", @@ -2760,6 +2768,14 @@ "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" }, + "node_modules/slug": { + "version": "8.2.3", + "resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz", + "integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ==", + "bin": { + "slug": "cli.js" + } + }, "node_modules/source-map": { "version": "0.6.1", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", @@ -3848,6 +3864,12 @@ "@types/node": "*" } }, + "@types/slug": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/@types/slug/-/slug-5.0.5.tgz", + "integrity": "sha512-vcHM79Xu5ALOC90kf5S1B4XGbRl8VW6f1+6jpBmK/FLHi4AyWKAVENgMOyHFyjHV5vDbNRPtjsNJuPRqrLBOxw==", + "dev": true + }, "@types/tough-cookie": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.2.tgz", @@ -5233,6 +5255,11 @@ "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" }, + "slug": { + "version": "8.2.3", + "resolved": "https://registry.npmjs.org/slug/-/slug-8.2.3.tgz", + "integrity": "sha512-fXjhAZszNecz855GUNIwW0+sFPi9WV4bMiEKDOCA4wcq1ts1UnUVNy/F78B0Aat7/W3rA+se//33ILKNMrbeYQ==" + }, "source-map": { "version": "0.6.1", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", diff --git a/crawler/package.json b/crawler/package.json index efc880f..234ab92 100644 --- a/crawler/package.json +++ b/crawler/package.json @@ -6,10 +6,12 @@ "dependencies": { "crawlee": "^3.0.0", "fs": "^0.0.1-security", - "playwright": "^1.28.0" + "playwright": "^1.28.0", + "slug": "^8.2.3" }, "devDependencies": { "@apify/tsconfig": "^0.1.0", + "@types/slug": "^5.0.5", "ts-node": "^10.8.0", "typescript": "^4.7.4" }, diff --git a/crawler/src/routes.ts b/crawler/src/routes.ts index 0623209..bb6946f 100644 --- a/crawler/src/routes.ts +++ b/crawler/src/routes.ts @@ -2,6 +2,7 @@ import { Dataset, createPlaywrightRouter } from "crawlee"; import * as fs from "fs"; import { sleep } from "crawlee"; export const router = createPlaywrightRouter(); +import slug from "slug"; router.addDefaultHandler(async ({ enqueueLinks }) => { const songs = await enqueueLinks({ @@ -18,13 +19,17 @@ router.addDefaultHandler(async ({ enqueueLinks }) => { router.addHandler("SONG", async ({ request, page }) => { await Dataset.pushData({ url: request.loadedUrl }); await page.waitForSelector('aside div div section button[name="download"]'); - const title = await page.locator("h1").textContent(); - const artist = await page + let title = await page.locator("h1").textContent(); + if (title == null) return + title = slug(title); + let artist = await page .locator( "body > div.js-page.react-container > div > section > aside > div:nth-child(5) > div > section > h3:nth-child(2) > a" ) .first() .textContent(); + if (artist == null) return + artist = slug(artist); const genres = await page .locator( "body > div.js-page.react-container > div > section > aside > div:nth-child(6) > div > table > tbody > tr:nth-child(5) > td > div > a"