Skip to content

Commit e281413

Browse files
committed
fix(learningsuite): filter out API proxy URLs, capture real CDN URLs
- Exclude api.learningsuite.io URLs from video capture
- Only capture real Bunny CDN URLs (b-cdn.net, mediadelivery.net, vz-*)
- Add response interception to extract CDN URLs from API responses
- Filter DOM-extracted URLs to exclude API proxies
- Fix invalid 4-second video downloads caused by API proxy responses
1 parent 45e8d9e commit e281413

File tree

1 file changed

+85
-26
lines changed

1 file changed

+85
-26
lines changed

src/scraper/learningsuite/extractor.ts

Lines changed: 85 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ export function detectVideoType(url: string): LearningSuiteVideoInfo["type"] {
6060
* Extracts video information from a lesson page.
6161
*/
6262
export async function extractVideoFromPage(page: Page): Promise<LearningSuiteVideoInfo | null> {
63+
// Helper to check if a URL is a valid CDN URL (not an API proxy).
//
// The decision is made on the parsed hostname rather than on raw substrings:
// a marker such as "vz-" or "b-cdn.net" that merely appears in a path or query
// string no longer makes an unrelated URL look like a CDN URL, and a real CDN
// URL is not rejected just because its query string mentions the API host.
//
// Returns true for Bunny CDN hosts (b-cdn.net, mediadelivery.net, vz-* labels)
// and for any other non-LearningSuite URL that references an .m3u8 playlist.
const isValidCdnUrl = (url: string): boolean => {
  // True when `host` is `domain` itself or one of its subdomains.
  const isHostOrSubdomain = (host: string, domain: string): boolean =>
    host === domain || host.endsWith(`.${domain}`);

  let parsed: URL;
  try {
    // Protocol-relative URLs ("//host/path") carry no scheme; assume https.
    parsed = new URL(url.startsWith("//") ? `https:${url}` : url);
  } catch {
    // Relative or malformed value: there is no hostname to inspect, so keep
    // the permissive playlist rule and only reject obvious LearningSuite URLs.
    return !url.includes("learningsuite.io") && url.includes(".m3u8");
  }

  const host = parsed.hostname.toLowerCase();

  // Skip everything served from LearningSuite itself, including the API proxy.
  if (isHostOrSubdomain(host, "learningsuite.io")) {
    return false;
  }

  // Only accept actual CDN hosts...
  const isBunnyHost =
    isHostOrSubdomain(host, "b-cdn.net") ||
    isHostOrSubdomain(host, "mediadelivery.net") ||
    host.split(".").some((label) => label.startsWith("vz-"));
  if (isBunnyHost) {
    return true;
  }

  // ...or URLs on other hosts that reference an HLS playlist.
  return `${parsed.pathname}${parsed.search}`.toLowerCase().includes(".m3u8");
};
78+
6379
// Check for HLS video
6480
const hlsUrl = await page.evaluate(() => {
6581
// Look for video elements with HLS source
@@ -80,8 +96,19 @@ export async function extractVideoFromPage(page: Page): Promise<LearningSuiteVid
8096
if (src) return src;
8197
}
8298

83-
// Look for HLS URLs in script tags
99+
// Look for HLS URLs in script tags - prefer CDN URLs
84100
const scripts = Array.from(document.querySelectorAll("script"));
101+
for (const script of scripts) {
102+
const content = script.textContent ?? "";
103+
// Look for Bunny CDN URLs first
104+
const cdnMatch =
105+
/(https?:\/\/[^"'\s]*(?:b-cdn\.net|mediadelivery\.net|vz-)[^"'\s]*\.m3u8[^"'\s]*)/i.exec(
106+
content
107+
);
108+
if (cdnMatch?.[1]) return cdnMatch[1];
109+
}
110+
111+
// Fallback to any m3u8 URL in scripts (will be filtered later)
85112
for (const script of scripts) {
86113
const content = script.textContent ?? "";
87114
const hlsMatch = /"(https?:\/\/[^"]+\.m3u8[^"]*)"/i.exec(content);
@@ -91,7 +118,8 @@ export async function extractVideoFromPage(page: Page): Promise<LearningSuiteVid
91118
return null;
92119
});
93120

94-
if (hlsUrl) {
121+
// Filter out API proxy URLs
122+
if (hlsUrl && isValidCdnUrl(hlsUrl)) {
95123
return {
96124
type: "hls",
97125
url: hlsUrl,
@@ -355,23 +383,64 @@ export async function extractLearningSuitePostContent(
355383
): Promise<LearningSuitePostContent | null> {
356384
// Set up request interception to capture HLS video URLs
357385
const hlsUrls: string[] = [];
386+
387+
// Handler for requests - capture direct CDN URLs
358388
const requestHandler = (request: { url: () => string }) => {
359389
const url = request.url();
360-
// Capture actual HLS playlists from Bunny CDN or direct m3u8 files
361-
// Prioritize actual .m3u8 files over API endpoints
390+
391+
// Only capture real Bunny CDN URLs, not API proxies
392+
if (url.includes("api.learningsuite.io")) {
393+
return;
394+
}
395+
396+
// Capture actual HLS playlists from Bunny CDN
362397
if (
363-
url.includes(".m3u8") || // Direct HLS playlist
364-
url.includes("b-cdn.net") || // Bunny CDN
365-
url.includes("mediadelivery.net") // Bunny video delivery
398+
(url.includes(".m3u8") && url.includes("b-cdn.net")) ||
399+
(url.includes(".m3u8") && url.includes("mediadelivery.net")) ||
400+
(url.includes(".m3u8") && url.includes("vz-"))
366401
) {
367-
// Skip API responses, only capture actual playlist URLs
368-
if (!url.includes("/embed/") && !url.includes("/play/")) {
402+
if (!hlsUrls.includes(url)) {
369403
hlsUrls.push(url);
370404
}
371405
}
372406
};
373407

408+
// Handler for responses - capture Bunny CDN URLs from API responses.
//
// Only responses whose URL contains both "api.learningsuite.io" and "/bunny/"
// are inspected. The first CDN playlist URL found in the body of a 200
// response is appended to `hlsUrls` unless it is already present. Failures
// while reading the body are ignored on purpose: this is a best-effort capture.
const responseHandler = async (response: {
  url: () => string;
  status: () => number;
  text: () => Promise<string>;
}): Promise<void> => {
  const url = response.url();

  // Check if this is a Bunny API response that might contain the real playlist URL
  if (!url.includes("api.learningsuite.io") || !url.includes("/bunny/")) {
    return;
  }

  try {
    // Redirects (3xx) and error responses are skipped; only a 200 body is
    // searched for the playlist URL.
    if (response.status() !== 200) {
      return;
    }

    const text = await response.text();

    // JSON encoders may escape "/" as "\/"; undo that so the URL pattern,
    // which requires a literal "//", can still match.
    const normalized = text.replace(/\\\//g, "/");

    // Look for Bunny CDN URLs in the response
    const cdnUrlRegex =
      /(https?:\/\/[^"'\s]*(?:b-cdn\.net|mediadelivery\.net)[^"'\s]*\.m3u8[^"'\s]*)/;
    const cdnUrl = cdnUrlRegex.exec(normalized)?.[1];
    if (cdnUrl && !hlsUrls.includes(cdnUrl)) {
      hlsUrls.push(cdnUrl);
    }
  } catch {
    // Response body might not be readable
  }
};
441+
374442
page.on("request", requestHandler);
443+
page.on("response", responseHandler);
375444

376445
// Navigate to lesson page
377446
await page.goto(lessonUrl, { timeout: 30000 });
@@ -406,30 +475,20 @@ export async function extractLearningSuitePostContent(
406475
await page.waitForTimeout(2000);
407476
}
408477

409-
// Remove handler
478+
// Remove handlers
410479
page.off("request", requestHandler);
480+
page.off("response", responseHandler);
411481

412482
// Try to get video from intercepted requests first
413483
let video: LearningSuiteVideoInfo | null = null;
414484

415-
// Prioritize actual .m3u8 files from CDN
416-
const actualPlaylist = hlsUrls.find(
417-
(url) =>
418-
url.includes(".m3u8") && (url.includes("b-cdn.net") || url.includes("mediadelivery.net"))
419-
);
420-
421-
if (actualPlaylist) {
422-
video = {
423-
type: "hls",
424-
url: actualPlaylist,
425-
hlsUrl: actualPlaylist,
426-
};
427-
} else if (hlsUrls.length > 0 && hlsUrls[0]) {
428-
// Fallback to any captured HLS URL
485+
// Only use CDN URLs we captured (API proxy URLs are filtered out above)
486+
const firstHlsUrl = hlsUrls[0];
487+
if (firstHlsUrl) {
429488
video = {
430489
type: "hls",
431-
url: hlsUrls[0],
432-
hlsUrl: hlsUrls[0],
490+
url: firstHlsUrl,
491+
hlsUrl: firstHlsUrl,
433492
};
434493
}
435494

0 commit comments

Comments
 (0)