Skip to content

Commit da0aebb

Browse files
committed
move scraper modules to internals
1 parent 8841a11 commit da0aebb

File tree

4 files changed

+48
-35
lines changed

4 files changed

+48
-35
lines changed

.env.template

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,3 +63,6 @@ PDF_LOADER_DATA_DIR_PATH="./data"
6363
## OpenTelemetry Settings
6464
OTEL_SERVICE_NAME="template-langgraph"
6565
OTEL_COLLECTOR_ENDPOINT="http://localhost:4317"
66+
67+
## Scraper Settings
68+
SCRAPER_TYPE="mock" # Options: "mock", "httpx", "youtube_transcript"

scripts/agent_operator.py

Lines changed: 2 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,11 @@
1818
AgentState,
1919
Article,
2020
)
21-
from template_langgraph.agents.news_summarizer_agent.scrapers import (
22-
BaseScraper,
23-
HttpxScraper,
24-
YouTubeTranscriptScraper,
25-
)
2621
from template_langgraph.agents.news_summarizer_agent.summarizers import (
2722
LlmSummarizer,
2823
)
2924
from template_langgraph.agents.task_decomposer_agent.agent import graph as task_decomposer_agent_graph
25+
from template_langgraph.internals.scrapers import get_scraper
3026
from template_langgraph.loggers import get_logger
3127

3228
# Initialize the Typer application
@@ -56,18 +52,6 @@ def get_agent_graph(name: str):
5652
raise ValueError(f"Unknown agent name: {name}")
5753

5854

59-
def get_scraper(scraper_type: str) -> BaseScraper:
60-
scraper = None
61-
if scraper_type == "Httpx":
62-
scraper = HttpxScraper()
63-
elif scraper_type == "YouTubeTranscript":
64-
scraper = YouTubeTranscriptScraper()
65-
66-
if not scraper:
67-
raise ValueError(f"Unknown scraper type: {scraper_type}")
68-
return scraper
69-
70-
7155
@app.command()
7256
def png(
7357
name: str = typer.Option(
@@ -159,12 +143,6 @@ def news_summarizer_agent(
159143
"-u",
160144
help="Comma-separated list of URLs to summarize",
161145
),
162-
scraper: str = typer.Option(
163-
"Httpx", # YouTubeTranscript
164-
"--scraper",
165-
"-s",
166-
help="Scraper to use for fetching content",
167-
),
168146
verbose: bool = typer.Option(
169147
False,
170148
"--verbose",
@@ -178,7 +156,7 @@ def news_summarizer_agent(
178156

179157
graph = NewsSummarizerAgent(
180158
notifier=MockNotifier(),
181-
scraper=get_scraper(scraper),
159+
scraper=get_scraper(),
182160
summarizer=LlmSummarizer(),
183161
).create_graph()
184162
for event in graph.stream(

template_langgraph/agents/news_summarizer_agent/agent.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,16 +8,16 @@
88
StructuredArticle,
99
SummarizeWebContentState,
1010
)
11-
from template_langgraph.agents.news_summarizer_agent.scrapers import (
12-
BaseScraper,
13-
HttpxScraper,
14-
MockScraper,
15-
)
1611
from template_langgraph.agents.news_summarizer_agent.summarizers import (
1712
BaseSummarizer,
1813
LlmSummarizer,
1914
MockSummarizer,
2015
)
16+
from template_langgraph.internals.scrapers import (
17+
BaseScraper,
18+
HttpxScraper,
19+
MockScraper,
20+
)
2121
from template_langgraph.llms.azure_openais import AzureOpenAiWrapper
2222
from template_langgraph.loggers import get_logger
2323

template_langgraph/agents/news_summarizer_agent/scrapers.py renamed to template_langgraph/internals/scrapers.py

Lines changed: 38 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,15 +8,40 @@
88
from __future__ import annotations
99

1010
from abc import ABC, abstractmethod
11+
from enum import Enum
12+
from functools import lru_cache
1113

1214
import httpx
15+
from pydantic_settings import BaseSettings, SettingsConfigDict
1316
from youtube_transcript_api import YouTubeTranscriptApi
1417

1518
from template_langgraph.loggers import get_logger
1619

1720
logger = get_logger(__name__)
1821

1922

23+
class ScraperType(str, Enum):
24+
MOCK = "mock"
25+
HTTPX = "httpx"
26+
YOUTUBE_TRANSCRIPT = "youtube_transcript"
27+
28+
29+
class Settings(BaseSettings):
30+
scraper_type: ScraperType = ScraperType.MOCK
31+
32+
model_config = SettingsConfigDict(
33+
env_file=".env",
34+
env_ignore_empty=True,
35+
extra="ignore",
36+
)
37+
38+
39+
@lru_cache
40+
def get_scraper_settings() -> Settings:
41+
"""Get scraper settings."""
42+
return Settings()
43+
44+
2045
class BaseScraper(ABC):
2146
"""Abstract base scraper.
2247
@@ -60,6 +85,7 @@ class YouTubeTranscriptScraper(BaseScraper):
6085
"""YouTube transcript scraper."""
6186

6287
def scrape(self, url: str) -> str:
88+
logger.info(f"Fetching YouTube transcript for URL: {url}")
6389
video_id = url.split("v=")[-1].split("&")[0]
6490
transcript = YouTubeTranscriptApi().fetch(
6591
video_id=video_id,
@@ -69,9 +95,15 @@ def scrape(self, url: str) -> str:
6995
return " ".join(text_list)
7096

7197

72-
__all__ = [
73-
"BaseScraper",
74-
"MockScraper",
75-
"HttpxScraper",
76-
"YouTubeTranscriptScraper",
77-
]
98+
def get_scraper(settings: Settings = None) -> BaseScraper:
99+
if settings is None:
100+
settings = get_scraper_settings()
101+
102+
if settings.scraper_type == ScraperType.MOCK:
103+
return MockScraper()
104+
elif settings.scraper_type == ScraperType.HTTPX:
105+
return HttpxScraper()
106+
elif settings.scraper_type == ScraperType.YOUTUBE_TRANSCRIPT:
107+
return YouTubeTranscriptScraper()
108+
else:
109+
raise ValueError(f"Unknown scraper type: {settings.scraper_type}")

0 commit comments

Comments
 (0)