Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 23 additions & 155 deletions recipe-extractor.py
Original file line number Diff line number Diff line change
@@ -1,123 +1,34 @@
import yt_dlp
import openai
import os
import sys
import argparse
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, parse_qs
from urllib.parse import parse_qs, urlparse
from dotenv import load_dotenv

try:
from youtube_transcript_api import YouTubeTranscriptApi
except Exception: # pragma: no cover - optional dep may not be installed
YouTubeTranscriptApi = None
import openai
import yt_dlp

from video_transcripts import (
is_youtube_url,
download_audio_with_ytdlp,
fetch_video_info,
get_youtube_transcript,
get_post_text,
get_caption_languages,
transcribe_whisper,
extract_video_transcript,
AUDIO_FILE,
)

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
print("Error: OPENAI_API_KEY not set")
sys.exit(1)
AUDIO_FILE = "audio.mp3"


def is_youtube_url(url: str) -> bool:
"""Return True if the URL points to YouTube."""
host = urlparse(url).netloc.lower()
return "youtube.com" in host or "youtu.be" in host

def download_audio_with_ytdlp(url, out_file=AUDIO_FILE):
# Remove extension from out_file since FFmpegExtractAudio will add it
base_name = out_file.rsplit('.', 1)[0] if '.' in out_file else out_file
ydl_opts = {
'format': 'bestaudio/best',
'outtmpl': base_name,
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '192',
}],
'quiet': False,
'noplaylist': True
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])

def fetch_video_info(url):
"""Return video metadata without downloading the file."""
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
return ydl.extract_info(url, download=False)

def get_youtube_transcript(video_id, languages=None):
"""Fetch transcript text from YouTube if available."""
if not YouTubeTranscriptApi:
print("⚠️ youtube-transcript-api not installed; skipping transcript fetch")
return None

ytt_api = YouTubeTranscriptApi()
try:
transcript_list = ytt_api.list(video_id)
except Exception as e: # pragma: no cover - network dependent
print(f"⚠️ Could not list transcripts: {e}")
return None

languages = list(languages or [])

def fetch_text(transcript):
try:
segments = transcript.fetch()
except Exception as e:
print("⚠️ Issue while getting transcripts: ", e)
return None
return " ".join(seg.text for seg in segments)

# First try preferred languages
for lang in languages:
try:
t = transcript_list.find_transcript([lang])
except Exception:
t = None
if t:
text = fetch_text(t)
if text:
return text

# Fall back to the first available transcript
for t in transcript_list:
text = fetch_text(t)
if text:
return text

return None

def get_post_text(info):
"""Return video description or caption."""
for key in ("description", "caption", "summary"):
text = info.get(key)
if text:
return text
return ""

def get_caption_languages(info):
"""Return list of caption language codes from video metadata."""
languages = []
for key in ("subtitles", "automatic_captions"):
for lang in info.get(key, {}):
if lang not in languages:
languages.append(lang)
if info.get("language") and info["language"] not in languages:
languages.append(info["language"])
return languages

def transcribe_whisper(file_path):
openai.api_key = OPENAI_API_KEY
with open(file_path, "rb") as audio_file:
transcript = openai.audio.transcriptions.create(
# model="whisper-1",
model="gpt-4o-mini-transcribe",
file=audio_file
)
return transcript.text



def extract_recipe_with_gpt(transcript, language="english"):
openai.api_key = OPENAI_API_KEY
Expand Down Expand Up @@ -252,23 +163,10 @@ def extract_recipe(url, language="english", output_format="json", save_transcrip
"""High-level helper to extract a recipe from a URL and return it as a string."""
print(f"🎯 Extracting from URL: {url}")

print("⬇️ Downloading audio...")
download_audio_with_ytdlp(url)

print("🎙️ Transcribing audio...")
transcript = transcribe_whisper(AUDIO_FILE)

if save_transcript:
with open(save_transcript, "w", encoding="utf-8") as f:
f.write(transcript)

try:
os.remove(AUDIO_FILE)
except OSError:
pass
combined = extract_video_transcript(url, save_transcript=save_transcript)

print(f"🤖 Extracting recipe using AI (language: {language})...")
structured_recipe = extract_recipe_with_gpt(transcript, language)
structured_recipe = extract_recipe_with_gpt(combined, language)

if output_format == "markdown":
return convert_to_markdown(structured_recipe, language)
Expand Down Expand Up @@ -410,39 +308,9 @@ def main():
print(f"💾 Output: {args.output or 'structured_recipe'}")
print()

info = fetch_video_info(args.url)
post_text = get_post_text(info)

transcript = None
if is_youtube_url(args.url):
caption_langs = get_caption_languages(info)
transcript = get_youtube_transcript(info.get('id'), caption_langs)
if transcript:
print("📝 Using existing YouTube transcript")

if not transcript:
print("⬇️ Downloading audio...")
download_audio_with_ytdlp(args.url)

print("🎙️ Transcribing audio...")
transcript = transcribe_whisper(AUDIO_FILE)
print(f"📏 Transcription length: {len(transcript)} characters")

combined = (post_text + "\n\n" + transcript).strip()

# Save transcription if requested
if args.save_transcript:
with open(args.save_transcript, "w", encoding="utf-8") as f:
f.write(transcript)
print(f"📝 Transcription saved to {args.save_transcript} for review")
print()

# Clean up audio file after transcription
try:
os.remove(AUDIO_FILE)
print("🧹 Audio file cleaned up.")
except OSError:
print("⚠️ Warning: Could not delete audio file.")
combined = extract_video_transcript(
args.url, save_transcript=args.save_transcript
)

print(f"🤖 Extracting recipe using AI (language: {args.language})...")
structured_recipe = extract_recipe_with_gpt(combined, args.language)
Expand Down
14 changes: 7 additions & 7 deletions tests/test_transcript_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
recipe_extractor = importlib.util.module_from_spec(spec)
spec.loader.exec_module(recipe_extractor)
video_transcripts = sys.modules["video_transcripts"]

def test_is_youtube_url_detection():
assert recipe_extractor.is_youtube_url("https://www.youtube.com/watch?v=abc")
Expand All @@ -28,19 +29,18 @@ def test_main_only_uses_youtube_transcripts(tmp_path, monkeypatch):
def fake_fetch_info(url):
return {"id": "abc"}

monkeypatch.setattr(recipe_extractor, "fetch_video_info", fake_fetch_info)
monkeypatch.setattr(recipe_extractor, "get_post_text", lambda info: "")
monkeypatch.setattr(recipe_extractor, "get_caption_languages", lambda info: [])
monkeypatch.setattr(video_transcripts, "fetch_video_info", fake_fetch_info)
monkeypatch.setattr(video_transcripts, "get_post_text", lambda info: "")
monkeypatch.setattr(video_transcripts, "get_caption_languages", lambda info: [])

def fake_get_transcript(video_id, langs=None):
calls["yt"] += 1
return "transcript"

monkeypatch.setattr(recipe_extractor, "get_youtube_transcript", fake_get_transcript)
monkeypatch.setattr(recipe_extractor, "download_audio_with_ytdlp", lambda url: None)
monkeypatch.setattr(recipe_extractor, "transcribe_whisper", lambda path: "audio")
monkeypatch.setattr(video_transcripts, "get_youtube_transcript", fake_get_transcript)
monkeypatch.setattr(video_transcripts, "download_audio_with_ytdlp", lambda url: None)
monkeypatch.setattr(video_transcripts, "transcribe_whisper", lambda path: "audio")
monkeypatch.setattr(recipe_extractor, "extract_recipe_with_gpt", lambda t, l: "{}")
monkeypatch.setattr(recipe_extractor.os, "remove", lambda path: None)

# YouTube URL should trigger transcript fetch
monkeypatch.setattr(sys, "argv", [
Expand Down
149 changes: 149 additions & 0 deletions video_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import yt_dlp
import openai
import os
from urllib.parse import urlparse
from dotenv import load_dotenv

try:
from youtube_transcript_api import YouTubeTranscriptApi
except Exception: # pragma: no cover - optional dependency
YouTubeTranscriptApi = None

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AUDIO_FILE = "audio.mp3"


def is_youtube_url(url: str) -> bool:
    """Return True if *url* points at a YouTube host.

    Accepts ``youtube.com`` / ``youtu.be`` and any of their subdomains
    (``www.``, ``m.``, ...).  An exact or suffix host comparison is used
    instead of a plain substring test so that look-alike hosts such as
    ``youtube.com.evil.example`` are rejected.
    """
    host = urlparse(url).netloc.lower()
    # Drop an explicit port before comparing host names.
    host = host.split(":", 1)[0]
    return (
        host in ("youtube.com", "youtu.be")
        or host.endswith(".youtube.com")
        or host.endswith(".youtu.be")
    )


def download_audio_with_ytdlp(url: str, out_file: str = AUDIO_FILE) -> None:
    """Download a video's audio track to *out_file* as MP3 via yt-dlp.

    The FFmpegExtractAudio post-processor appends the ``.mp3`` extension
    itself, so the output template must be the target path with any
    extension stripped.
    """
    stem, dot, _ext = out_file.rpartition(".")
    template = stem if dot else out_file
    options = {
        "format": "bestaudio/best",
        "outtmpl": template,
        "noplaylist": True,
        "quiet": False,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])


def fetch_video_info(url: str) -> dict:
    """Return yt-dlp metadata for *url* without downloading any media."""
    options = {"quiet": True}
    with yt_dlp.YoutubeDL(options) as probe:
        info = probe.extract_info(url, download=False)
    return info


def get_youtube_transcript(video_id: str, languages=None) -> str | None:
    """Return transcript text for a YouTube video, or None if unavailable.

    Transcripts in the preferred *languages* are tried first, in order;
    failing that, the first listed transcript that fetches successfully is
    used.  Requires the optional youtube-transcript-api package.
    """
    if YouTubeTranscriptApi is None:
        print("⚠️ youtube-transcript-api not installed; skipping transcript fetch")
        return None

    api = YouTubeTranscriptApi()
    try:
        available = api.list(video_id)
    except Exception as exc:  # pragma: no cover - network dependent
        print(f"⚠️ Could not list transcripts: {exc}")
        return None

    def as_text(candidate):
        # Fetch one transcript and join its segments; None on any failure.
        try:
            segments = candidate.fetch()
        except Exception as exc:  # pragma: no cover - network dependent
            print("⚠️ Issue while getting transcripts: ", exc)
            return None
        return " ".join(segment.text for segment in segments)

    # Preferred languages first.
    for lang in list(languages or []):
        try:
            candidate = available.find_transcript([lang])
        except Exception:
            continue
        text = as_text(candidate)
        if text:
            return text

    # Otherwise, take whatever fetches successfully first.
    for candidate in available:
        text = as_text(candidate)
        if text:
            return text
    return None


def get_post_text(info: dict) -> str:
    """Return the first non-empty description-like field of *info*, else ''.

    Checks ``description``, ``caption`` and ``summary`` in that order.
    """
    candidates = (info.get(key) for key in ("description", "caption", "summary"))
    return next((text for text in candidates if text), "")


def get_caption_languages(info: dict) -> list:
    """Return caption language codes from video metadata, de-duplicated.

    Collects the keys of the manual and automatic caption maps (in that
    order) and finally the video's own ``language`` code, if present.
    Order of first appearance is preserved.
    """
    # dict.fromkeys de-duplicates while keeping insertion order.
    seen = dict.fromkeys(
        lang
        for key in ("subtitles", "automatic_captions")
        for lang in info.get(key, {})
    )
    primary = info.get("language")
    if primary:
        seen.setdefault(primary, None)
    return list(seen)


def transcribe_whisper(file_path: str) -> str:
    """Transcribe the audio file at *file_path* via OpenAI's transcription API."""
    openai.api_key = OPENAI_API_KEY
    with open(file_path, "rb") as audio:
        response = openai.audio.transcriptions.create(
            file=audio,
            model="gpt-4o-mini-transcribe",
        )
    return response.text


def extract_video_transcript(url: str, *, save_transcript: str | None = None) -> str:
    """Return post text plus transcript for *url*, joined by a blank line.

    For YouTube URLs an existing caption transcript is preferred; otherwise
    the audio is downloaded and transcribed with Whisper, and the temporary
    audio file is removed afterwards.

    When *save_transcript* is a path, the transcript text is written there
    regardless of how it was obtained.  (Previously the save only happened
    in the Whisper branch, so a YouTube-sourced transcript was never saved.)
    """
    info = fetch_video_info(url)
    post_text = get_post_text(info)

    transcript = None
    if is_youtube_url(url):
        caption_langs = get_caption_languages(info)
        transcript = get_youtube_transcript(info.get("id"), caption_langs)
        if transcript:
            print("📝 Using existing YouTube transcript")

    if not transcript:
        print("⬇️ Downloading audio...")
        download_audio_with_ytdlp(url)
        print("🎙️ Transcribing audio...")
        transcript = transcribe_whisper(AUDIO_FILE)
        # Best-effort cleanup of the temporary audio file; a failure to
        # delete it should not abort the extraction.
        try:
            os.remove(AUDIO_FILE)
        except OSError:
            pass

    if save_transcript:
        with open(save_transcript, "w", encoding="utf-8") as f:
            f.write(transcript)

    return (post_text + "\n\n" + transcript).strip()