wwdc-rag/extract-transcripts.py at main · JoeCotellese/wwdc-rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import logging
import os
import re
import time
from urllib.parse import urljoin

import click
import requests
from bs4 import BeautifulSoup

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
# Configure the root logger at import time with the INFO threshold, so the
# progress messages logged below are not filtered out.
logging.basicConfig(level=logging.INFO)


def extract_transcript(url, timeout=30):
    """Download a video page and return its transcript as a dict.

    The year and session number are the two path segments that follow
    ``/play/`` in *url*. The title is read from the page's ``og:title``
    meta tag and the text from the ``<p>`` elements inside
    ``<section id="transcript-content">``.

    Args:
        url: Address of the video page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        A dict with the keys ``year``, ``number``, ``talk_title``,
        ``transcript`` (paragraphs joined by newlines) and ``url``, or
        ``None`` when the URL has no ``/play/<year>/<number>`` part or the
        page has no transcript section.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Validate the URL before touching the network: a URL without the
    # expected path can never produce a result, so fetching it is wasted work.
    try:
        path_after_play = url.split("/play/")[1]
        year, number = path_after_play.split("/")[:2]
    except (IndexError, ValueError):
        logger.error("Failed to extract year or number from URL: %s", url)
        return None

    logger.info("Fetching transcript from URL: %s", url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Fall back to a placeholder when the tag, or its content attribute, is
    # missing or empty.
    og_title_tag = soup.find("meta", property="og:title")
    talk_title = (
        og_title_tag.get("content") if og_title_tag else None
    ) or "Unknown Title"

    transcript_section = soup.find("section", id="transcript-content")
    if not transcript_section:
        logger.warning("No transcript section found for URL: %s", url)
        return None

    # One entry per <p>, with surrounding whitespace stripped.
    paragraphs = [
        p.get_text(strip=True) for p in transcript_section.find_all("p")
    ]

    logger.info("Transcript successfully extracted for URL: %s", url)

    return {
        "year": year,
        "number": number,
        "talk_title": talk_title,
        "transcript": "\n".join(paragraphs),
        "url": url,
    }


# Heading of a code sample: "<timestamp> - <title>", where the timestamp is
# m:ss / mm:ss, optionally followed by a third ":ss" group (h:mm:ss).
_SAMPLE_HEADING_RE = re.compile(r"^(\d{1,2}:\d{2}(?::\d{2})?) - (.+)")


def extract_code_samples(soup):
    """Collect the code samples listed on a parsed video page.

    Looks for ``<li class="supplement sample-code">`` and, inside it, every
    ``<li class="sample-code-main-container">``. For each container the
    first ``<p>`` is parsed as ``"<timestamp> - <title>"`` and the
    ``<pre class="code-source">`` element supplies the code.

    Args:
        soup: Parsed page (or any object offering the same ``find`` /
            ``find_all`` / ``get_text`` methods).

    Returns:
        A list of dicts with the keys ``timestamp``, ``title`` and ``code``.
        Containers without code are skipped. When the heading is missing or
        does not match the expected format, ``timestamp`` is ``""`` and
        ``title`` is ``"Untitled"``. The list is empty when the page has no
        sample-code section.
    """
    code_section = soup.find("li", class_="supplement sample-code")
    if not code_section:
        return []

    samples = []
    for container in code_section.find_all(
        "li", class_="sample-code-main-container"
    ):
        code_tag = container.find("pre", class_="code-source")
        code_text = code_tag.get_text() if code_tag else ""
        if not code_text:
            # Nothing to record without the code itself.
            continue

        timestamp, title = "", "Untitled"
        info_tag = container.find("p")
        if info_tag:
            heading = info_tag.get_text(" ", strip=True)
            match = _SAMPLE_HEADING_RE.match(heading)
            if match:
                timestamp, title = match.groups()

        samples.append(
            {"timestamp": timestamp, "title": title, "code": code_text}
        )
    return samples


def get_video_links(base_url, timeout=30):
    """Return the absolute URLs of the video cards linked from *base_url*.

    Args:
        base_url: Address of the listing page. Relative hrefs found on the
            page are resolved against it.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the run.

    Returns:
        A list of absolute URLs in page order, with repeated links removed
        so that no video is processed twice.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    logger.info("Fetching video links from base URL: %s", base_url)
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # A multi-word class_ string is compared with the whole class attribute,
    # so only anchors carrying exactly this class list are selected.
    anchors = soup.find_all(
        "a",
        class_="vc-card tile tile-rounded grid-item large-span-4 medium-span-6 small-span-12",
    )
    hrefs = (anchor.get("href") for anchor in anchors)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    video_links = list(
        dict.fromkeys(urljoin(base_url, href) for href in hrefs if href)
    )

    logger.info(
        "Found %d video links at base URL: %s", len(video_links), base_url
    )
    return video_links


def save_transcript(transcript_object, output_dir="./transcripts"):
    """Write a transcript, plus any code samples, to a Markdown file.

    The file is named ``<year>-<number>-<sanitized title>.md``. It starts
    with a ``YEAR`` / ``TITLE`` / ``URL`` / ``CONTENT`` header followed by
    the transcript text. When *transcript_object* has a ``code_samples``
    key, a ``CODE SAMPLES`` section with one fenced block per sample is
    appended.

    The file is written whether or not ``code_samples`` is present.
    Previously the write was nested under that check, so a transcript
    without the key was silently dropped.

    Args:
        transcript_object: Dict with the keys ``year``, ``number``,
            ``talk_title``, ``transcript`` and ``url``, and optionally
            ``code_samples`` (an iterable of dicts with ``timestamp``,
            ``title`` and ``code``).
        output_dir: Directory to write into; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Replace every character outside [a-zA-Z0-9_-] with "_" to get a
    # filename-safe title, then trim underscores from both ends.
    safe_title = re.sub(
        r"[^a-zA-Z0-9_\-]", "_", transcript_object["talk_title"]
    ).strip("_")
    filename = (
        f"{transcript_object['year']}-{transcript_object['number']}-{safe_title}.md"
    )
    filepath = os.path.join(output_dir, filename)

    parts = [
        f"YEAR: {transcript_object['year']}\n"
        f"TITLE: {transcript_object['talk_title']}\n"
        f"URL: {transcript_object['url']}\n"
        f"CONTENT:\n\n"
        f"{transcript_object['transcript']}\n"
    ]

    if "code_samples" in transcript_object:
        parts.append("\n\nCODE SAMPLES:\n")
        parts.extend(
            f"\n--- Code Sample {idx} ---\n"
            f"**Time**: {sample['timestamp']}\n"
            f"**Title**: {sample['title']}\n\n"
            f"```swift\n{sample['code']}\n```\n"
            for idx, sample in enumerate(transcript_object["code_samples"], 1)
        )

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    logger.info("Transcript saved to %s", filepath)


def _process_video(url):
    """Extract, enrich and save one video page; return True if it was saved.

    Returns False when ``extract_transcript`` yields nothing for *url*.
    Exceptions raised by ``extract_transcript`` propagate to the caller.
    """
    transcript_object = extract_transcript(url)
    if not transcript_object:
        return False

    # extract_transcript does not hand back the parsed page, so the page is
    # requested again for its code samples. This second fetch is best-effort:
    # if it fails, the transcript is still saved, just without samples.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        logger.warning(
            "Could not fetch code samples for %s; saving transcript only",
            url,
            exc_info=True,
        )
        code_samples = []
    else:
        code_samples = extract_code_samples(
            BeautifulSoup(response.text, "html.parser")
        )

    transcript_object["code_samples"] = code_samples
    save_transcript(transcript_object)
    return True


@click.command()
@click.option("--base_url", required=False, help="Base URL to fetch video links.")
@click.option("--url", required=False, help="Single video page URL to extract.")
def main(base_url, url):
    """Main function to extract transcripts from a base URL or a single video URL."""
    # NOTE: click shows the docstring above as the command's --help text, so
    # it is kept verbatim.
    if url and base_url:
        logger.error("Please specify only one of --url or --base_url.")
        return
    if url:
        _process_video(url)
        return
    if not base_url:
        logger.error("Please specify either --url or --base_url.")
        return

    logger.info(
        "Starting transcript extraction process for base URL: %s", base_url
    )
    for link in get_video_links(base_url):
        # One unreachable page must not abort the rest of the batch.
        try:
            if not _process_video(link):
                logger.warning("Transcript extraction failed for %s", link)
        except requests.RequestException:
            logger.exception(
                "Request failed for %s; continuing with the next link", link
            )
        # Pause between pages to avoid hammering the server.
        time.sleep(1)


if __name__ == "__main__":
    # main is a click command: it reads --url / --base_url from the command
    # line itself, so it is called without arguments here.
    main()
    # Usage: pass --base_url https://developer.apple.com/videos/wwdc2025/ to
    # process every video linked from that page, or --url <video page URL>
    # to process a single video.