Skip to content

Commit f852dd1

Browse files
authored
Merge pull request #77 from ks6088ts-labs/feature/issue-76_youtube-scraper
add YouTube scraper
2 parents c3c1aca + 5074175 commit f852dd1

File tree

4 files changed

+70
-9
lines changed

4 files changed

+70
-9
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dependencies = [
2929
"qdrant-client>=1.15.1",
3030
"streamlit>=1.48.0",
3131
"typer>=0.16.0",
32+
"youtube-transcript-api>=1.2.2",
3233
]
3334

3435
[dependency-groups]

scripts/agent_operator.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,23 @@
99
from template_langgraph.agents.image_classifier_agent.models import Results
1010
from template_langgraph.agents.issue_formatter_agent.agent import graph as issue_formatter_agent_graph
1111
from template_langgraph.agents.kabuto_helpdesk_agent.agent import graph as kabuto_helpdesk_agent_graph
12+
from template_langgraph.agents.news_summarizer_agent.agent import MockNotifier, NewsSummarizerAgent
1213
from template_langgraph.agents.news_summarizer_agent.agent import (
1314
graph as news_summarizer_agent_graph,
1415
)
15-
from template_langgraph.agents.news_summarizer_agent.models import Article
16+
from template_langgraph.agents.news_summarizer_agent.models import (
17+
AgentInputState,
18+
AgentState,
19+
Article,
20+
)
21+
from template_langgraph.agents.news_summarizer_agent.scrapers import (
22+
BaseScraper,
23+
HttpxScraper,
24+
YouTubeTranscriptScraper,
25+
)
26+
from template_langgraph.agents.news_summarizer_agent.summarizers import (
27+
LlmSummarizer,
28+
)
1629
from template_langgraph.agents.task_decomposer_agent.agent import graph as task_decomposer_agent_graph
1730
from template_langgraph.loggers import get_logger
1831

@@ -43,6 +56,18 @@ def get_agent_graph(name: str):
4356
raise ValueError(f"Unknown agent name: {name}")
4457

4558

59+
def get_scraper(scraper_type: str) -> BaseScraper:
    """Build the scraper implementation selected by name.

    Args:
        scraper_type: Case-sensitive scraper name; ``"Httpx"`` or
            ``"YouTubeTranscript"``.

    Returns:
        A newly constructed scraper instance.

    Raises:
        ValueError: If ``scraper_type`` is not one of the known names.
    """
    scraper_classes = {
        "Httpx": HttpxScraper,
        "YouTubeTranscript": YouTubeTranscriptScraper,
    }
    scraper_class = scraper_classes.get(scraper_type)
    # Check the lookup result explicitly instead of the truthiness of an
    # instance, so "unknown name" is never confused with a falsy scraper.
    if scraper_class is None:
        raise ValueError(f"Unknown scraper type: {scraper_type}")
    return scraper_class()
69+
70+
4671
@app.command()
4772
def png(
4873
name: str = typer.Option(
@@ -134,23 +159,28 @@ def news_summarizer_agent(
134159
"-u",
135160
help="Comma-separated list of URLs to summarize",
136161
),
162+
scraper: str = typer.Option(
163+
"Httpx", # YouTubeTranscript
164+
"--scraper",
165+
"-s",
166+
help="Scraper to use for fetching content",
167+
),
137168
verbose: bool = typer.Option(
138169
False,
139170
"--verbose",
140171
"-v",
141172
help="Enable verbose output",
142173
),
143174
):
144-
from template_langgraph.agents.news_summarizer_agent.models import (
145-
AgentInputState,
146-
AgentState,
147-
)
148-
149175
# Set up logging
150176
if verbose:
151177
logger.setLevel(logging.DEBUG)
152178

153-
graph = news_summarizer_agent_graph
179+
graph = NewsSummarizerAgent(
180+
notifier=MockNotifier(),
181+
scraper=get_scraper(scraper),
182+
summarizer=LlmSummarizer(),
183+
).create_graph()
154184
for event in graph.stream(
155185
input=AgentState(
156186
input=AgentInputState(

template_langgraph/agents/news_summarizer_agent/scrapers.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from abc import ABC, abstractmethod
1111

1212
import httpx
13+
from youtube_transcript_api import YouTubeTranscriptApi
1314

1415
from template_langgraph.loggers import get_logger
1516

@@ -39,24 +40,38 @@ def scrape(self, url: str) -> str: # pragma: no cover - interface
3940
class MockScraper(BaseScraper):
4041
"""Deterministic scraper for tests / offline development."""
4142

42-
def scrape(self, url: str) -> str: # noqa: D401
43+
def scrape(self, url: str) -> str:
4344
logger.info(f"Mock scrape for URL: {url}")
4445
return "<html><body><h1>Mocked web content</h1></body></html>"
4546

4647

4748
class HttpxScraper(BaseScraper):
4849
"""Simple httpx based scraper."""
4950

50-
def scrape(self, url: str) -> str: # noqa: D401
51+
def scrape(self, url: str) -> str:
5152
logger.info(f"Fetching URL via httpx: {url}")
5253
with httpx.Client() as client:
5354
response = client.get(url)
5455
response.raise_for_status()
5556
return response.text
5657

5758

59+
class YouTubeTranscriptScraper(BaseScraper):
    """Scraper that returns a YouTube video's transcript as plain text."""

    # Path prefixes whose following segment is the video ID,
    # e.g. ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``, ``/v/<id>``.
    _ID_PATH_PREFIXES = ("shorts", "embed", "live", "v")

    @staticmethod
    def _extract_video_id(url: str) -> str:
        """Return the video ID contained in ``url``.

        Handles ``watch?v=<id>`` URLs, ``youtu.be/<id>`` short links,
        ``/shorts/``, ``/embed/``, ``/live/`` and ``/v/`` paths, and a bare
        video ID. Anything else falls back to the original ``v=`` split so
        previously accepted inputs keep producing the same ID.

        Raises:
            ValueError: If no non-empty video ID can be derived.
        """
        # Imported locally so the block does not depend on a file-level import.
        from urllib.parse import parse_qs, urlparse

        text = url.strip()
        candidate = text
        if "://" not in candidate and ("/" in candidate or "?" in candidate):
            # Scheme-less URL such as "youtu.be/<id>": add a scheme so that
            # urlparse splits host, path and query correctly.
            candidate = f"https://{candidate}"

        video_id = ""
        if "://" in candidate:
            parsed = urlparse(candidate)
            query_ids = parse_qs(parsed.query).get("v")
            segments = [segment for segment in parsed.path.split("/") if segment]
            host = parsed.netloc.lower()
            if query_ids:
                video_id = query_ids[0]
            elif host.endswith("youtu.be") and segments:
                video_id = segments[0]
            elif len(segments) >= 2 and segments[0] in YouTubeTranscriptScraper._ID_PATH_PREFIXES:
                video_id = segments[1]

        if not video_id:
            # Legacy behavior: a bare ID or an unrecognized URL shape.
            video_id = text.split("v=")[-1].split("&")[0]

        if not video_id:
            raise ValueError(f"Could not extract a YouTube video ID from URL: {url}")
        return video_id

    def scrape(self, url: str) -> str:
        """Fetch the transcript for the video at ``url``.

        Args:
            url: A YouTube video URL (or a bare video ID).

        Returns:
            The transcript snippets joined by single spaces.

        Raises:
            ValueError: If no video ID can be extracted from ``url``.
        """
        video_id = self._extract_video_id(url)
        logger.info(f"Fetching YouTube transcript for video ID: {video_id}")
        # Languages are tried in order: Japanese first, then English.
        transcript = YouTubeTranscriptApi().fetch(
            video_id=video_id,
            languages=["ja", "en"],
        )
        return " ".join(item.text for item in transcript)
70+
71+
5872
__all__ = [
5973
"BaseScraper",
6074
"MockScraper",
6175
"HttpxScraper",
76+
"YouTubeTranscriptScraper",
6277
]

uv.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)