Merge pull request #77 from ks6088ts-labs/feature/issue-76_youtube-scraper

ks6088ts · web-flow · commit f852dd139e31 · 2025-08-13T08:08:24.000+09:00
add YouTube scraper
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
     "qdrant-client>=1.15.1",
     "streamlit>=1.48.0",
     "typer>=0.16.0",
+    "youtube-transcript-api>=1.2.2",
 ]
 
 [dependency-groups]
diff --git a/scripts/agent_operator.py b/scripts/agent_operator.py
@@ -9,10 +9,23 @@
 from template_langgraph.agents.image_classifier_agent.models import Results
 from template_langgraph.agents.issue_formatter_agent.agent import graph as issue_formatter_agent_graph
 from template_langgraph.agents.kabuto_helpdesk_agent.agent import graph as kabuto_helpdesk_agent_graph
+from template_langgraph.agents.news_summarizer_agent.agent import MockNotifier, NewsSummarizerAgent
 from template_langgraph.agents.news_summarizer_agent.agent import (
     graph as news_summarizer_agent_graph,
 )
-from template_langgraph.agents.news_summarizer_agent.models import Article
+from template_langgraph.agents.news_summarizer_agent.models import (
+    AgentInputState,
+    AgentState,
+    Article,
+)
+from template_langgraph.agents.news_summarizer_agent.scrapers import (
+    BaseScraper,
+    HttpxScraper,
+    YouTubeTranscriptScraper,
+)
+from template_langgraph.agents.news_summarizer_agent.summarizers import (
+    LlmSummarizer,
+)
 from template_langgraph.agents.task_decomposer_agent.agent import graph as task_decomposer_agent_graph
 from template_langgraph.loggers import get_logger
 
@@ -43,6 +56,18 @@ def get_agent_graph(name: str):
         raise ValueError(f"Unknown agent name: {name}")
 
 
+def get_scraper(scraper_type: str) -> BaseScraper:
+    """Create the scraper selected by ``scraper_type``.
+
+    Args:
+        scraper_type: Either ``"Httpx"`` or ``"YouTubeTranscript"``
+            (matched case-sensitively).
+
+    Returns:
+        A new instance of the matching scraper class.
+
+    Raises:
+        ValueError: If ``scraper_type`` is not one of the names above.
+    """
+    registry = {
+        "Httpx": HttpxScraper,
+        "YouTubeTranscript": YouTubeTranscriptScraper,
+    }
+    scraper_cls = registry.get(scraper_type)
+    instance = scraper_cls() if scraper_cls is not None else None
+
+    if not instance:
+        raise ValueError(f"Unknown scraper type: {scraper_type}")
+    return instance
+
+
 @app.command()
 def png(
     name: str = typer.Option(
@@ -134,23 +159,28 @@ def news_summarizer_agent(
         "-u",
         help="Comma-separated list of URLs to summarize",
     ),
+    scraper: str = typer.Option(
+        "Httpx",  # YouTubeTranscript
+        "--scraper",
+        "-s",
+        help="Scraper to use for fetching content",
+    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
         "-v",
         help="Enable verbose output",
     ),
 ):
-    from template_langgraph.agents.news_summarizer_agent.models import (
-        AgentInputState,
-        AgentState,
-    )
-
     # Set up logging
     if verbose:
         logger.setLevel(logging.DEBUG)
 
-    graph = news_summarizer_agent_graph
+    graph = NewsSummarizerAgent(
+        notifier=MockNotifier(),
+        scraper=get_scraper(scraper),
+        summarizer=LlmSummarizer(),
+    ).create_graph()
     for event in graph.stream(
         input=AgentState(
             input=AgentInputState(
diff --git a/template_langgraph/agents/news_summarizer_agent/scrapers.py b/template_langgraph/agents/news_summarizer_agent/scrapers.py
@@ -10,6 +10,7 @@
 from abc import ABC, abstractmethod
 
 import httpx
+from youtube_transcript_api import YouTubeTranscriptApi
 
 from template_langgraph.loggers import get_logger
 
@@ -39,24 +40,38 @@ def scrape(self, url: str) -> str:  # pragma: no cover - interface
 class MockScraper(BaseScraper):
     """Deterministic scraper for tests / offline development."""
 
-    def scrape(self, url: str) -> str:  # noqa: D401
+    def scrape(self, url: str) -> str:
+        """Log the requested URL and return a fixed HTML snippet; no request is made."""
         logger.info(f"Mock scrape for URL: {url}")
         return "<html><body><h1>Mocked web content</h1></body></html>"
 
 
 class HttpxScraper(BaseScraper):
     """Simple httpx based scraper."""
 
-    def scrape(self, url: str) -> str:  # noqa: D401
+    def scrape(self, url: str) -> str:
+        """Fetch ``url`` with a GET request and return the response body as text.
+
+        A new ``httpx.Client`` is opened for the call and closed on exit.
+        The error raised by ``response.raise_for_status()`` is propagated
+        to the caller.
+        """
         logger.info(f"Fetching URL via httpx: {url}")
         with httpx.Client() as client:
             response = client.get(url)
             response.raise_for_status()
             return response.text
 
 
+class YouTubeTranscriptScraper(BaseScraper):
+    """Scraper that returns the transcript of a YouTube video as plain text."""
+
+    def scrape(self, url: str) -> str:
+        """Fetch the transcript for the video referenced by ``url``.
+
+        The video id is taken from the ``v`` query parameter when present,
+        otherwise from the path of ``youtu.be/<id>`` short links and of
+        ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` URLs.
+        Any other input falls back to the text after the last ``v=`` (up to
+        the next ``&``), so a bare video id is passed through unchanged.
+
+        Args:
+            url: YouTube video URL or bare video id.
+
+        Returns:
+            The text of every transcript snippet, joined with single spaces.
+        """
+        # Function-scope import keeps this block self-contained.
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(url)
+        host = (parsed.hostname or "").lower()
+        segments = [segment for segment in parsed.path.split("/") if segment]
+        query_ids = parse_qs(parsed.query).get("v")
+
+        if query_ids:
+            # Proper query parsing ignores fragments and look-alike keys such as "rv=".
+            video_id = query_ids[0]
+        elif host in {"youtu.be", "www.youtu.be"} and segments:
+            video_id = segments[0]
+        elif len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
+            video_id = segments[1]
+        else:
+            # Fallback for bare ids and unrecognised URL shapes.
+            video_id = url.split("v=")[-1].split("&")[0]
+
+        logger.info(f"Fetching YouTube transcript for video: {video_id}")
+        transcript = YouTubeTranscriptApi().fetch(
+            video_id=video_id,
+            languages=["ja", "en"],  # language codes passed to the API, in this order
+        )
+        return " ".join(item.text for item in transcript)
+
+
 __all__ = [
     "BaseScraper",
     "MockScraper",
     "HttpxScraper",
+    "YouTubeTranscriptScraper",
 ]
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ dependencies = [`
`29`	`29`	`"qdrant-client>=1.15.1",`
`30`	`30`	`"streamlit>=1.48.0",`
`31`	`31`	`"typer>=0.16.0",`
	`32`	`+ "youtube-transcript-api>=1.2.2",`
`32`	`33`	`]`
`33`	`34`
`34`	`35`	`[dependency-groups]`