Skip to content

Commit d361968

Browse files
Refactor loadYouTubeTranscript() to include YouTube Video Metadata in Content When parseOnly is true (#4552)
* Enhance YouTube transcript loading to include video metadata in parsed content when parseOnly is true
* Extract to function

Co-authored-by: timothycarambat <[email protected]>
1 parent 8c97240 commit d361968

File tree

1 file changed

+40
-3
lines changed
  • collector/utils/extensions/YoutubeTranscript

1 file changed

+40
-3
lines changed

collector/utils/extensions/YoutubeTranscript/index.js

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ const { validYoutubeVideoUrl } = require("../../url");
1414
/**
1515
* Fetch the transcript content for a YouTube video
1616
* @param {string} url - The URL of the YouTube video
17-
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: Object}>} - The transcript content for the YouTube video
17+
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: TranscriptMetadata}>} - The transcript content for the YouTube video
1818
*/
1919
async function fetchVideoTranscriptContent({ url }) {
2020
if (!validYoutubeVideoUrl(url)) {
@@ -64,11 +64,20 @@ async function fetchVideoTranscriptContent({ url }) {
6464
};
6565
}
6666

67+
/**
68+
* @typedef {Object} TranscriptMetadata
69+
* @property {string} title - The title of the video
70+
* @property {string} author - The author of the video
71+
* @property {string} description - The description of the video
72+
* @property {string} view_count - The view count of the video
73+
* @property {string} source - The source of the video (videoId)
74+
*/
75+
6776
/**
6877
* @typedef {Object} TranscriptAsDocument
6978
* @property {boolean} success - Whether the transcript was loaded successfully
7079
* @property {string|null} reason - The reason the transcript could not be loaded, or null on success
71-
* @property {{title: string, author: string, destination: string}} data - The data from the transcript
80+
* @property {TranscriptMetadata} metadata - The metadata from the transcript
7281
*/
7382

7483
/**
@@ -104,11 +113,12 @@ async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
104113
}
105114

106115
const { content, metadata } = transcriptResults;
116+
107117
if (options.parseOnly) {
108118
return {
109119
success: true,
110120
reason: null,
111-
content,
121+
content: buildTranscriptContentWithMetadata(content, metadata),
112122
documents: [],
113123
saveAsDocument: options.parseOnly,
114124
data: {},
@@ -154,6 +164,33 @@ async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
154164
};
155165
}
156166

167+
/**
 * Combine the transcript content and selected video metadata into a single string.
 *
 * Why? For ephemeral documents where we just want the content, the metadata is
 * prepended as tagged keys so that the LLM has context about the video. This gives
 * it a better understanding of the video and allows it to use the metadata in the
 * conversation if relevant.
 * Examples:
 * - How many views does <LINK> have?
 * - Checkout <LINK> and tell me the key points and if it is performing well
 * - Summarize this video <LINK>? -> description could have links and references
 *
 * Only `title`, `author`, `description` and `view_count` are emitted, in that order,
 * each wrapped as `<key>value</key>`. Keys outside that allow-list (such as `source`)
 * are ignored. If the content is empty, the metadata is missing, or no allow-listed
 * key carries a value, the content is returned unchanged.
 *
 * @param {string} content - The content of the transcript
 * @param {TranscriptMetadata} metadata - The metadata from the transcript
 * @returns {string} - The metadata tags followed by the transcript, or `content` as-is when there is nothing to add
 */
function buildTranscriptContentWithMetadata(content = "", metadata = {}) {
  const VALID_METADATA_KEYS = ["title", "author", "description", "view_count"];
  if (!content || !metadata) return content;

  const metadataTags = VALID_METADATA_KEYS.filter((key) => {
    const value = metadata[key];
    // Skip only absent/empty values; a numeric 0 (e.g. view_count) is real data.
    return value !== null && value !== undefined && value !== "";
  })
    .map((key) => `<${key}>${metadata[key]}</${key}>`)
    .join("");

  // Nothing usable in the metadata: avoid emitting a dangling "Transcript:" header.
  if (!metadataTags) return content;
  return `${metadataTags}\nTranscript:\n${content}`;
}
193+
157194
module.exports = {
158195
loadYouTubeTranscript,
159196
fetchVideoTranscriptContent,

0 commit comments

Comments
 (0)