Skip to content

Commit 8fc1f24

Browse files
fix: youtube transcript collector not work well with non en or non asr caption (#4442)
* fix: youtube transcript collector not work well with non en or non asr caption * stub YT test in Github actions --------- Co-authored-by: Timothy Carambat <[email protected]>
1 parent c8f13d5 commit 8fc1f24

File tree

2 files changed

+115
-14
lines changed

2 files changed

+115
-14
lines changed
Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,32 @@
11
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
22

33
describe("YoutubeTranscript", () => {
4-
it("should fetch transcript from YouTube video", async () => {
5-
const videoId = "BJjsfNO5JTo";
6-
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
7-
lang: "en",
8-
});
4+
if (process.env.GITHUB_ACTIONS) {
5+
console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
6+
it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
7+
} else {
8+
it("should fetch transcript from YouTube video", async () => {
9+
const videoId = "BJjsfNO5JTo";
10+
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
11+
lang: "en",
12+
});
913

10-
expect(transcript).toBeDefined();
11-
expect(typeof transcript).toBe("string");
12-
expect(transcript.length).toBeGreaterThan(0);
13-
// console.log("Success! Transcript length:", transcript.length);
14-
// console.log("First 200 characters:", transcript.substring(0, 200) + "...");
15-
}, 30000);
14+
expect(transcript).toBeDefined();
15+
expect(typeof transcript).toBe("string");
16+
expect(transcript.length).toBeGreaterThan(0);
17+
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
18+
}, 30000);
19+
20+
it("should fetch non asr transcript from YouTube video", async () => {
21+
const videoId = "D111ao6wWH0";
22+
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
23+
lang: "zh-HK",
24+
});
25+
26+
expect(transcript).toBeDefined();
27+
expect(typeof transcript).toBe("string");
28+
expect(transcript.length).toBeGreaterThan(0);
29+
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
30+
}, 30000);
31+
}
1632
});

collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,85 @@ class YoutubeTranscript {
8585
.replace(/\s+/g, " ");
8686
}
8787

88+
/**
89+
* Calculates a preference score for a caption track to determine the best match
90+
* @param {Object} track - The caption track object from YouTube
91+
* @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es')
92+
* @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed)
93+
* @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en'])
94+
* @returns {number} Preference score (lower is better)
95+
*/
96+
static #calculatePreferenceScore(track, preferredLanguages) {
97+
// Language preference: index in preferredLanguages array (0 = most preferred)
98+
const languagePreference = preferredLanguages.indexOf(track.languageCode);
99+
const languageScore = languagePreference === -1 ? 9999 : languagePreference;
100+
101+
// Kind bonus: prefer human-transcribed (undefined) over auto-generated ('asr')
102+
const kindBonus = track.kind === "asr" ? 0.5 : 0;
103+
104+
return languageScore + kindBonus;
105+
}
106+
107+
/**
108+
* Finds the most suitable caption track based on preferred languages
109+
* @param {string} videoBody - The raw HTML response from YouTube
110+
* @param {string[]} preferredLanguages - Array of language codes in preference order
111+
* @returns {Object|null} The selected caption track or null if none found
112+
*/
113+
static #findPreferredCaptionTrack(videoBody, preferredLanguages) {
114+
const captionsConfigJson = videoBody.match(
115+
/"captions":(.*?),"videoDetails":/s
116+
);
117+
118+
const captionsConfig = captionsConfigJson?.[1]
119+
? JSON.parse(captionsConfigJson[1])
120+
: null;
121+
122+
const captionTracks = captionsConfig
123+
? captionsConfig.playerCaptionsTracklistRenderer.captionTracks
124+
: null;
125+
126+
if (!captionTracks || captionTracks.length === 0) {
127+
return null;
128+
}
129+
130+
const sortedTracks = [...captionTracks].sort((a, b) => {
131+
const scoreA = this.#calculatePreferenceScore(a, preferredLanguages);
132+
const scoreB = this.#calculatePreferenceScore(b, preferredLanguages);
133+
return scoreA - scoreB;
134+
});
135+
136+
return sortedTracks[0];
137+
}
138+
139+
/**
140+
* Fetches video page content and finds the preferred caption track
141+
* @param {string} videoId - YouTube video ID
142+
* @param {string[]} preferredLanguages - Array of preferred language codes
143+
* @returns {Promise<Object>} The preferred caption track
144+
* @throws {YoutubeTranscriptError} If no suitable caption track is found
145+
*/
146+
static async #getPreferredCaptionTrack(videoId, preferredLanguages) {
147+
const videoResponse = await fetch(
148+
`https://www.youtube.com/watch?v=${videoId}`,
149+
{ credentials: "omit" }
150+
);
151+
const videoBody = await videoResponse.text();
152+
153+
const preferredCaptionTrack = this.#findPreferredCaptionTrack(
154+
videoBody,
155+
preferredLanguages
156+
);
157+
158+
if (!preferredCaptionTrack) {
159+
throw new YoutubeTranscriptError(
160+
"No suitable caption track found for the video"
161+
);
162+
}
163+
164+
return preferredCaptionTrack;
165+
}
166+
88167
/**
89168
* Fetch transcript from YouTube video
90169
* @param {string} videoId - Video URL or video identifier
@@ -93,14 +172,20 @@ class YoutubeTranscript {
93172
* @returns {Promise<string>} Video transcript text
94173
*/
95174
static async fetchTranscript(videoId, config = {}) {
175+
const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"];
96176
const identifier = this.retrieveVideoId(videoId);
97-
const lang = config?.lang ?? "en";
98177

99178
try {
179+
const preferredCaptionTrack = await this.#getPreferredCaptionTrack(
180+
identifier,
181+
preferredLanguages
182+
);
183+
100184
const innerProto = this.#getBase64Protobuf({
101-
param1: "asr",
102-
param2: lang,
185+
param1: preferredCaptionTrack.kind || "",
186+
param2: preferredCaptionTrack.languageCode,
103187
});
188+
104189
const params = this.#getBase64Protobuf({
105190
param1: identifier,
106191
param2: innerProto,

0 commit comments

Comments
 (0)