diff --git a/pyproject.toml b/pyproject.toml index 921fdc8..01ec327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "qdrant-client>=1.15.1", "streamlit>=1.48.0", "typer>=0.16.0", + "youtube-transcript-api>=1.2.2", ] [dependency-groups] diff --git a/scripts/agent_operator.py b/scripts/agent_operator.py index d8065ee..0600cc7 100644 --- a/scripts/agent_operator.py +++ b/scripts/agent_operator.py @@ -9,10 +9,23 @@ from template_langgraph.agents.image_classifier_agent.models import Results from template_langgraph.agents.issue_formatter_agent.agent import graph as issue_formatter_agent_graph from template_langgraph.agents.kabuto_helpdesk_agent.agent import graph as kabuto_helpdesk_agent_graph +from template_langgraph.agents.news_summarizer_agent.agent import MockNotifier, NewsSummarizerAgent from template_langgraph.agents.news_summarizer_agent.agent import ( graph as news_summarizer_agent_graph, ) -from template_langgraph.agents.news_summarizer_agent.models import Article +from template_langgraph.agents.news_summarizer_agent.models import ( + AgentInputState, + AgentState, + Article, +) +from template_langgraph.agents.news_summarizer_agent.scrapers import ( + BaseScraper, + HttpxScraper, + YouTubeTranscriptScraper, +) +from template_langgraph.agents.news_summarizer_agent.summarizers import ( + LlmSummarizer, +) from template_langgraph.agents.task_decomposer_agent.agent import graph as task_decomposer_agent_graph from template_langgraph.loggers import get_logger @@ -43,6 +56,18 @@ def get_agent_graph(name: str): raise ValueError(f"Unknown agent name: {name}") +def get_scraper(scraper_type: str) -> BaseScraper: + scraper = None + if scraper_type == "Httpx": + scraper = HttpxScraper() + elif scraper_type == "YouTubeTranscript": + scraper = YouTubeTranscriptScraper() + + if not scraper: + raise ValueError(f"Unknown scraper type: {scraper_type}") + return scraper + + @app.command() def png( name: str = typer.Option( @@ -134,6 +159,12 @@ def news_summarizer_agent( "-u", help="Comma-separated list of URLs to summarize", ), + scraper: str = typer.Option( + "Httpx", # YouTubeTranscript + "--scraper", + "-s", + help="Scraper to use for fetching content", + ), verbose: bool = typer.Option( False, "--verbose", @@ -141,16 +172,15 @@ def news_summarizer_agent( help="Enable verbose output", ), ): - from template_langgraph.agents.news_summarizer_agent.models import ( - AgentInputState, - AgentState, - ) - # Set up logging if verbose: logger.setLevel(logging.DEBUG) - graph = news_summarizer_agent_graph + graph = NewsSummarizerAgent( + notifier=MockNotifier(), + scraper=get_scraper(scraper), + summarizer=LlmSummarizer(), + ).create_graph() for event in graph.stream( input=AgentState( input=AgentInputState( diff --git a/template_langgraph/agents/news_summarizer_agent/scrapers.py b/template_langgraph/agents/news_summarizer_agent/scrapers.py index 99fdcc2..0ece189 100644 --- a/template_langgraph/agents/news_summarizer_agent/scrapers.py +++ b/template_langgraph/agents/news_summarizer_agent/scrapers.py @@ -10,6 +10,7 @@ from abc import ABC, abstractmethod import httpx +from youtube_transcript_api import YouTubeTranscriptApi from template_langgraph.loggers import get_logger @@ -39,7 +40,7 @@ def scrape(self, url: str) -> str: # pragma: no cover - interface class MockScraper(BaseScraper): """Deterministic scraper for tests / offline development.""" - def scrape(self, url: str) -> str: # noqa: D401 + def scrape(self, url: str) -> str: logger.info(f"Mock scrape for URL: {url}") return "

Mocked web content

" @@ -47,7 +48,7 @@ def scrape(self, url: str) -> str: # noqa: D401 class HttpxScraper(BaseScraper): """Simple httpx based scraper.""" - def scrape(self, url: str) -> str: # noqa: D401 + def scrape(self, url: str) -> str: logger.info(f"Fetching URL via httpx: {url}") with httpx.Client() as client: response = client.get(url) @@ -55,8 +56,22 @@ def scrape(self, url: str) -> str: # noqa: D401 return response.text +class YouTubeTranscriptScraper(BaseScraper): + """YouTube transcript scraper.""" + + def scrape(self, url: str) -> str: + video_id = url.split("v=")[-1].split("&")[0] + transcript = YouTubeTranscriptApi().fetch( + video_id=video_id, + languages=["ja", "en"], + ) + text_list = [item.text for item in transcript] + return " ".join(text_list) + + __all__ = [ "BaseScraper", "MockScraper", "HttpxScraper", + "YouTubeTranscriptScraper", ] diff --git a/uv.lock b/uv.lock index e6e02d0..088a8fa 100644 --- a/uv.lock +++ b/uv.lock @@ -4679,6 +4679,7 @@ dependencies = [ { name = "qdrant-client" }, { name = "streamlit" }, { name = "typer" }, + { name = "youtube-transcript-api" }, ] [package.dev-dependencies] @@ -4723,6 +4724,7 @@ requires-dist = [ { name = "qdrant-client", specifier = ">=1.15.1" }, { name = "streamlit", specifier = ">=1.48.0" }, { name = "typer", specifier = ">=0.16.0" }, + { name = "youtube-transcript-api", specifier = ">=1.2.2" }, ] [package.metadata.requires-dev] @@ -5489,6 +5491,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] +[[package]] +name = "youtube-transcript-api" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8f/f8/5e12d3d0c7001c3b3078697b9918241022bdb1ae12715e9debb00a83e16e/youtube_transcript_api-1.2.2.tar.gz", hash = "sha256:5f67cfaff3621d969778817a3d7b2172c16784855f45fcaed4f0529632e2fef4", size = 469634, upload-time = "2025-08-04T12:22:52.158Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/92/3d1a580f0efcad926f45876cf6cb92b2c260e84ae75dae5463bbf38f92e7/youtube_transcript_api-1.2.2-py3-none-any.whl", hash = "sha256:feca8c7f7c9d65188ef6377fc0e01cf466e6b68f1b3e648019646ab342f994d2", size = 485047, upload-time = "2025-08-04T12:22:50.836Z" }, +] + [[package]] name = "zipp" version = "3.23.0"