diff --git a/requirements-dev.lock b/requirements-dev.lock
index 61bd3e2b..b2d32e41 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -30,6 +30,8 @@ anyio==4.4.0
 astroid==3.2.4
     # via pylint
 async-timeout==4.0.3
+    # via aiohttp
+    # via langchain
     # via scrapegraphai
 attrs==24.2.0
     # via aiohttp
@@ -78,6 +80,9 @@ distro==1.9.0
     # via openai
 docutils==0.19
     # via sphinx
+exceptiongroup==1.2.2
+    # via anyio
+    # via pytest
 fastapi==0.112.0
     # via burr
 fastapi-pagination==0.12.26
@@ -131,7 +136,6 @@ graphviz==0.20.3
     # via burr
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 grpcio==1.65.4
     # via google-api-core
     # via grpcio-status
@@ -500,6 +504,9 @@ tokenizers==0.19.1
     # via transformers
 toml==0.10.2
     # via streamlit
+tomli==2.1.0
+    # via pylint
+    # via pytest
 tomlkit==0.13.0
     # via pylint
 tornado==6.4.1
@@ -517,6 +524,8 @@ transformers==4.44.2
     # via scrapegraphai
 typing-extensions==4.12.2
     # via altair
+    # via anyio
+    # via astroid
     # via fastapi
     # via fastapi-pagination
     # via google-generativeai
@@ -531,6 +540,7 @@ typing-extensions==4.12.2
     # via sqlalchemy
     # via streamlit
     # via typing-inspect
+    # via uvicorn
 typing-inspect==0.9.0
     # via dataclasses-json
     # via sf-hamilton
diff --git a/requirements.lock b/requirements.lock
index c2c40996..38be6e68 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -19,6 +19,8 @@ anyio==4.4.0
     # via httpx
     # via openai
 async-timeout==4.0.3
+    # via aiohttp
+    # via langchain
     # via scrapegraphai
 attrs==23.2.0
     # via aiohttp
@@ -48,6 +50,8 @@ dill==0.3.8
     # via multiprocess
 distro==1.9.0
     # via openai
+exceptiongroup==1.2.2
+    # via anyio
 fastembed==0.3.6
     # via scrapegraphai
 filelock==3.15.4
@@ -87,7 +91,6 @@ googlesearch-python==1.2.5
     # via scrapegraphai
 greenlet==3.0.3
     # via playwright
-    # via sqlalchemy
 grpcio==1.65.1
     # via google-api-core
     # via grpcio-status
@@ -368,6 +371,7 @@ tqdm==4.66.4
 transformers==4.44.2
     # via scrapegraphai
 typing-extensions==4.12.2
+    # via anyio
     # via google-generativeai
     # via huggingface-hub
     # via langchain-core
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 55f05ab6..f964eb8b 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -80,28 +80,30 @@ def __init__(
             None if node_config is None else node_config.get("scrape_do", None)
         )
 
+    def is_valid_url(self, source: str) -> bool:
+        """
+        Validates if the source string is a valid URL using regex.
+
+        Parameters:
+            source (str): The URL string to validate
+
+        Raises:
+            ValueError: If the URL is invalid
+        """
+        import re
+        url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
+        if not bool(re.match(url_pattern, source)):
+            raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
+        return True
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
         update the state with this content.
-
-        Args:
-            state (dict): The current state of the graph. The input keys will be used
-                            to fetch the correct data types from the state.
-
-        Returns:
-            dict: The updated state with a new output key containing the fetched HTML content.
-
-        Raises:
-            KeyError: If the input key is not found in the state, indicating that the
-                        necessary information to perform the operation is missing.
         """
 
-        self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        # Interpret input keys based on the provided input expression
         input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
 
         source = input_data[0]
@@ -124,10 +126,16 @@ def execute(self, state):
             return handlers[input_type](state, input_type, source)
         elif self.input == "pdf_dir":
             return state
-        elif not source.startswith("http") and not source.startswith("www"):
-            return self.handle_local_source(state, source)
-        else:
-            return self.handle_web_source(state, source)
+
+        # For web sources, validate URL before proceeding
+        try:
+            if self.is_valid_url(source):
+                return self.handle_web_source(state, source)
+        except ValueError as e:
+            # Re-raise the exception from is_valid_url
+            raise
+
+        return self.handle_local_source(state, source)
 
     def handle_directory(self, state, input_type, source):
         """