Skip to content

Commit 822db9a

Browse files
committed
fix(learningsuite): capture all HLS segments by seeking through video
1 parent b80959c commit 822db9a

File tree

1 file changed

+100
-39
lines changed

1 file changed

+100
-39
lines changed

src/scraper/learningsuite/extractor.ts

Lines changed: 100 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -387,6 +387,7 @@ export async function extractLearningSuitePostContent(
387387

388388
// Handler for requests - capture segment URLs with tokens
389389
const segmentUrls: string[] = [];
390+
390391
const requestHandler = (request: { url: () => string }) => {
391392
const url = request.url();
392393

@@ -428,7 +429,6 @@ export async function extractLearningSuitePostContent(
428429
const headers = response.headers();
429430
const location = headers.location;
430431
if (location?.includes("b-cdn.net")) {
431-
console.log(`[DEBUG] Redirect to CDN: ${location.substring(0, 100)}...`);
432432
if (!hlsUrls.includes(location)) {
433433
hlsUrls.unshift(location); // Priority
434434
}
@@ -437,27 +437,57 @@ export async function extractLearningSuitePostContent(
437437
}
438438

439439
if (status === 200) {
440-
const contentType = response.headers()["content-type"] ?? "";
441-
442-
// If it's a direct playlist response
443-
if (contentType.includes("mpegurl") || contentType.includes("m3u8")) {
444-
// The API proxy is serving the playlist directly - construct CDN URL from .ts requests
445-
// We'll capture the CDN base URL from .ts segment requests instead
446-
return;
447-
}
448-
449440
const text = await response.text();
450441

451442
// Check if it's HLS playlist content
452443
if (text.startsWith("#EXTM3U")) {
453-
console.log(`[DEBUG] Got HLS playlist from API proxy`);
444+
// Extract ALL segment URLs from the playlist
445+
// Lines that end with .ts and include tokens are segments
446+
const lines = text.split("\n");
447+
let baseUrl = "";
448+
449+
// First, try to find the base URL from full URLs in the playlist
450+
for (const line of lines) {
451+
const trimmed = line.trim();
452+
if (trimmed.startsWith("http") && trimmed.includes("b-cdn.net")) {
453+
const match = /(https?:\/\/[^/]+\/[^/]+\/)/.exec(trimmed);
454+
if (match?.[1]) {
455+
baseUrl = match[1];
456+
break;
457+
}
458+
}
459+
}
460+
461+
for (const line of lines) {
462+
const trimmed = line.trim();
463+
// Skip comment lines and empty lines
464+
if (trimmed.startsWith("#") || trimmed === "") continue;
465+
466+
// Check if this line is a segment URL (contains .ts)
467+
if (trimmed.includes(".ts")) {
468+
let segmentUrl = trimmed;
469+
470+
// If it's a full URL, add it directly
471+
if (trimmed.startsWith("http")) {
472+
if (!segmentUrls.includes(segmentUrl)) {
473+
segmentUrls.push(segmentUrl);
474+
}
475+
} else if (baseUrl && trimmed.includes("token=")) {
476+
// Relative URL with token - construct full URL
477+
segmentUrl = baseUrl + trimmed;
478+
if (!segmentUrls.includes(segmentUrl)) {
479+
segmentUrls.push(segmentUrl);
480+
}
481+
}
482+
}
483+
}
484+
454485
// Extract CDN base URL from playlist content
455486
const cdnMatch = /(https?:\/\/vz-[^"'\s]+\.b-cdn\.net\/[^"'\s]+)/g.exec(text);
456487
if (cdnMatch?.[1]) {
457-
const baseUrl = cdnMatch[1].replace(/\/[^/]+\.ts.*$/, "/playlist.m3u8");
458-
console.log(`[DEBUG] Extracted CDN base: ${baseUrl}`);
459-
if (!hlsUrls.includes(baseUrl)) {
460-
hlsUrls.unshift(baseUrl);
488+
const extractedBase = cdnMatch[1].replace(/\/[^/]+\.ts.*$/, "/playlist.m3u8");
489+
if (!hlsUrls.includes(extractedBase)) {
490+
hlsUrls.unshift(extractedBase);
461491
}
462492
}
463493
}
@@ -468,7 +498,6 @@ export async function extractLearningSuitePostContent(
468498
while ((match = cdnUrlRegex.exec(text)) !== null) {
469499
const cdnUrl = match[1];
470500
if (cdnUrl && !hlsUrls.includes(cdnUrl)) {
471-
console.log(`[DEBUG] Found CDN URL in response: ${cdnUrl.substring(0, 80)}...`);
472501
hlsUrls.push(cdnUrl);
473502
}
474503
}
@@ -494,45 +523,77 @@ export async function extractLearningSuitePostContent(
494523
.then(() => true)
495524
.catch(() => false);
496525

497-
// If video player exists but no HLS URL captured yet, try to trigger video load
498-
if (hasVideoPlayer && hlsUrls.length === 0) {
499-
// Try multiple approaches to trigger video loading
500-
501-
// 1. Try clicking play button
526+
// If video player exists, trigger video load and seek to capture ALL segments
527+
if (hasVideoPlayer) {
528+
// Try clicking play button first
502529
const playButton = page.locator(
503530
'[aria-label*="play" i], [class*="play" i], button[class*="Play"], [data-testid*="play"]'
504531
);
505532
try {
506533
await playButton.first().click({ timeout: 2000 });
507-
await page.waitForTimeout(3000);
534+
await page.waitForTimeout(2000);
508535
} catch {
509-
// Play button not found
510-
}
511-
512-
// 2. Try clicking the video element directly
513-
if (hlsUrls.length === 0) {
536+
// Play button not found, try clicking video directly
514537
try {
515538
await page.locator("video").first().click({ timeout: 2000 });
516-
await page.waitForTimeout(3000);
539+
await page.waitForTimeout(2000);
517540
} catch {
518541
// Video not clickable
519542
}
520543
}
521544

522-
// 3. Try hovering over video to trigger autoplay
523-
if (hlsUrls.length === 0) {
524-
try {
525-
await page.locator("video, [class*='video']").first().hover({ timeout: 2000 });
526-
await page.waitForTimeout(3000);
527-
} catch {
528-
// Hover failed
545+
// Get video duration and seek to multiple positions to capture all segments
546+
// HLS players load segments on-demand, so we need to seek to trigger loading
547+
try {
548+
const videoDuration = await page.evaluate(() => {
549+
const video = document.querySelector("video");
550+
return video?.duration ?? 0;
551+
});
552+
553+
if (videoDuration > 0) {
554+
// Calculate expected segment count (assuming ~4s per segment for Bunny CDN)
555+
const segmentDuration = 4; // Bunny CDN uses ~4 second segments
556+
557+
// For longer videos, we need to seek to MORE positions
558+
// HLS players typically buffer ~3-4 segments ahead
559+
// So we need to seek every ~12-16 seconds to capture all segments
560+
const seekInterval = segmentDuration * 3; // Seek every ~12 seconds
561+
const seekPositions: number[] = [];
562+
563+
// Generate seek positions throughout the video
564+
for (let t = 0; t < videoDuration; t += seekInterval) {
565+
seekPositions.push(t);
566+
}
567+
// Always include near the end
568+
seekPositions.push(videoDuration - 2);
569+
seekPositions.push(videoDuration - 0.5);
570+
571+
for (const seekTime of seekPositions) {
572+
await page.evaluate((time) => {
573+
const video = document.querySelector("video");
574+
if (video) {
575+
video.currentTime = time;
576+
}
577+
}, seekTime);
578+
// Wait for segments to load - shorter wait since we have many positions
579+
await page.waitForTimeout(800);
580+
}
581+
582+
// Seek back to start
583+
await page.evaluate(() => {
584+
const video = document.querySelector("video");
585+
if (video) {
586+
video.currentTime = 0;
587+
}
588+
});
589+
await page.waitForTimeout(500);
529590
}
591+
} catch {
592+
// Seek failed
530593
}
531-
}
532594

533-
// Give more time for lazy-loaded videos and CDN URL extraction
534-
if (hlsUrls.length === 0 && hasVideoPlayer) {
535-
await page.waitForTimeout(5000);
595+
// Give more time for all segment requests to complete
596+
await page.waitForTimeout(1500);
536597
}
537598

538599
// Remove handlers

0 commit comments

Comments
 (0)