Skip to content

Commit 3ca7b42

Browse files
committed
Refactor YouTube playlist configuration to use only playlist ID. Update environment variables and documentation accordingly.
1 parent 5dbf45c commit 3ca7b42

File tree

6 files changed

+41
-36
lines changed

6 files changed

+41
-36
lines changed

.env.example

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,7 @@ R2_SECRET_ACCESS_KEY=MOCK_R2_SECRET_ACCESS_KEY
124124
SEMANTIC_SEARCH_IGNORE_LIST_KEY=manifests/ignore-list.json
125125

126126
# Feature: YouTube playlist semantic indexing + /youtube route
127-
# Optional: set either playlist URL or playlist ID.
128-
YOUTUBE_PLAYLIST_URL=https://www.youtube.com/watch?v=wSEUlS8WcQs&list=PLV5CVI1eNcJgNqzNwcs4UKrlJdhfDjshf
127+
# Optional
129128
YOUTUBE_PLAYLIST_ID=PLV5CVI1eNcJgNqzNwcs4UKrlJdhfDjshf
130129
# Optional: helps bypass YouTube anti-bot checks for captions in some envs.
131130
YOUTUBE_COOKIE=MOCK_YOUTUBE_COOKIE

.github/workflows/index-semantic-youtube.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,8 @@ jobs:
4747
${{ secrets.CLOUDFLARE_AI_EMBEDDING_MODEL }}
4848
R2_BUCKET: ${{ secrets.R2_BUCKET }}
4949

50-
# Prefer setting one of these as repository variables.
51-
# If neither is set, the script uses a built-in default playlist ID.
52-
YOUTUBE_PLAYLIST_URL: ${{ vars.YOUTUBE_PLAYLIST_URL }}
50+
# Prefer setting this as a repository variable.
51+
# If it is not set, the script uses a built-in default playlist ID.
5352
YOUTUBE_PLAYLIST_ID: ${{ vars.YOUTUBE_PLAYLIST_ID }}
5453

5554
# Optional: force-index specific videos. Can be set either as a

app/routes/youtube.tsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@ function parsePlaylistId(value: string | undefined) {
4848

4949
export async function loader() {
5050
const env = getEnv()
51-
const playlistInput = env.YOUTUBE_PLAYLIST_URL ?? env.YOUTUBE_PLAYLIST_ID
5251
const configuredPlaylistId =
53-
parsePlaylistId(playlistInput) ?? DEFAULT_PLAYLIST_ID
52+
parsePlaylistId(env.YOUTUBE_PLAYLIST_ID) ?? DEFAULT_PLAYLIST_ID
5453
return json({
5554
playlistId: configuredPlaylistId,
5655
})

app/utils/env.server.ts

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,7 @@ const schemaBase = z.object({
9797
GITHUB_REF: z.string().trim().optional().default('main'),
9898

9999
// Optional: /youtube route + indexing scripts.
100-
YOUTUBE_PLAYLIST_URL: z.string().trim().optional(),
101100
YOUTUBE_PLAYLIST_ID: z.string().trim().optional(),
102-
YOUTUBE_COOKIE: z.string().trim().optional(),
103-
YOUTUBE_USER_AGENT: z.string().trim().optional(),
104101
})
105102

106103
const schema = schemaBase.superRefine((values, ctx) => {
@@ -217,10 +214,21 @@ export function init() {
217214
'❌ Invalid environment variables:',
218215
z.flattenError(error).fieldErrors,
219216
)
220-
} else {
221-
console.error('❌ Unexpected error while validating environment:', error)
217+
throw new Error('Invalid environment variables')
222218
}
223-
throw new Error('Invalid environment variables')
219+
220+
// Preserve non-Zod failures from `getEnv()` so callers see the real cause.
221+
if (error instanceof Error) {
222+
console.error(
223+
'❌ Unexpected error while validating environment:',
224+
error.message,
225+
)
226+
if (error.stack) console.error(error.stack)
227+
throw error
228+
}
229+
230+
console.error('❌ Unexpected error while validating environment:', error)
231+
throw new Error(`Unexpected error while validating environment: ${String(error)}`)
224232
}
225233
// Keep unused warning quiet (and make debugging easier if needed).
226234
void parsedEnv

other/semantic-search/index-youtube-playlist.ts

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -830,7 +830,8 @@ async function fetchTranscriptFromTrack(track: CaptionTrack, label: string) {
830830
? event.tStartMs
831831
: 0
832832
const durationMs =
833-
typeof event.dDurationMs === 'number' && Number.isFinite(event.dDurationMs)
833+
typeof event.dDurationMs === 'number' &&
834+
Number.isFinite(event.dDurationMs)
834835
? event.dDurationMs
835836
: 0
836837
const line = (event.segs ?? [])
@@ -904,10 +905,7 @@ function chunkTranscriptEvents(
904905
}
905906

906907
if (startMs === null) startMs = Math.max(0, Math.floor(e.startMs))
907-
const eventEnd = Math.max(
908-
0,
909-
Math.floor(e.startMs + (e.durationMs || 0)),
910-
)
908+
const eventEnd = Math.max(0, Math.floor(e.startMs + (e.durationMs || 0)))
911909
endMs = Math.max(endMs, eventEnd)
912910
currentLines.push(line)
913911
currentLen = currentLen + (currentLines.length > 1 ? 1 : 0) + line.length
@@ -1198,10 +1196,7 @@ async function main() {
11981196
dryRun,
11991197
} = parseArgs()
12001198
const playlistInput =
1201-
playlistArg ??
1202-
process.env.YOUTUBE_PLAYLIST_URL ??
1203-
process.env.YOUTUBE_PLAYLIST_ID ??
1204-
DEFAULT_PLAYLIST_ID
1199+
playlistArg ?? process.env.YOUTUBE_PLAYLIST_ID ?? DEFAULT_PLAYLIST_ID
12051200
const playlistId = getPlaylistId(playlistInput)
12061201
if (!playlistId) {
12071202
throw new Error(
@@ -1275,8 +1270,9 @@ async function main() {
12751270
const appearancesSourceCount = videos.filter((video) =>
12761271
video.sources.includes('appearances'),
12771272
).length
1278-
const talksSourceCount = videos.filter((video) => video.sources.includes('talks'))
1279-
.length
1273+
const talksSourceCount = videos.filter((video) =>
1274+
video.sources.includes('talks'),
1275+
).length
12801276
console.log(
12811277
`Total unique videos to process: ${videos.length} (playlist: ${playlistSourceCount}, appearances: ${appearancesSourceCount}, talks: ${talksSourceCount})${maxVideos ? `, capped by --max-videos=${maxVideos}` : ''}`,
12821278
)
@@ -1515,12 +1511,14 @@ async function main() {
15151511
? chunkTranscriptEvents(details.transcriptEvents, {
15161512
targetChars: 3500,
15171513
maxChunkChars: 5500,
1518-
}).map((c): YoutubeChunkItem => ({
1519-
kind: 'transcript',
1520-
body: c.body,
1521-
startSeconds: Math.floor(c.startMs / 1000),
1522-
endSeconds: Math.ceil(c.endMs / 1000),
1523-
}))
1514+
}).map(
1515+
(c): YoutubeChunkItem => ({
1516+
kind: 'transcript',
1517+
body: c.body,
1518+
startSeconds: Math.floor(c.startMs / 1000),
1519+
endSeconds: Math.ceil(c.endMs / 1000),
1520+
}),
1521+
)
15241522
: transcript
15251523
? chunkText(transcript, {
15261524
targetChars: 3500,
@@ -1550,13 +1548,16 @@ async function main() {
15501548
chunkIndex: index,
15511549
chunkCount,
15521550
kind: item.kind,
1553-
startSeconds: item.kind === 'transcript' ? item.startSeconds : undefined,
1551+
startSeconds:
1552+
item.kind === 'transcript' ? item.startSeconds : undefined,
15541553
endSeconds: item.kind === 'transcript' ? item.endSeconds : undefined,
15551554
})
15561555
if (oldChunksById.get(vectorId)?.hash === hash) continue
15571556

1558-
const startSeconds = item.kind === 'transcript' ? item.startSeconds : undefined
1559-
const endSeconds = item.kind === 'transcript' ? item.endSeconds : undefined
1557+
const startSeconds =
1558+
item.kind === 'transcript' ? item.startSeconds : undefined
1559+
const endSeconds =
1560+
item.kind === 'transcript' ? item.endSeconds : undefined
15601561

15611562
toUpsert.push({
15621563
vectorId,

other/semantic-search/readme.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ For podcast indexing:
3535

3636
For YouTube playlist indexing (optional but recommended as repo variables):
3737

38-
- `YOUTUBE_PLAYLIST_URL` (full URL with `list=...`) **or**
3938
- `YOUTUBE_PLAYLIST_ID` (playlist ID only)
4039
- Optional (helps when YouTube returns anti-bot `LOGIN_REQUIRED`):
4140
- `YOUTUBE_COOKIE` (cookie header value from a logged-in browser session)
@@ -89,8 +88,8 @@ Script:
8988

9089
Optional flags:
9190

92-
- `--playlist "<url-or-id>"` (defaults to `YOUTUBE_PLAYLIST_URL`,
93-
`YOUTUBE_PLAYLIST_ID`, or a built-in default playlist ID)
91+
- `--playlist "<url-or-id>"` (defaults to `YOUTUBE_PLAYLIST_ID` or a built-in
92+
default playlist ID)
9493
- `--max-videos 50` (helpful for staged/backfill runs)
9594
- `--include-auto-captions false` (manual captions only)
9695
- `--manifest-key manifests/youtube-my-playlist.json`

0 commit comments

Comments
 (0)