diff --git a/.well-known/funding-manifest-urls/funding.json b/.well-known/funding-manifest-urls/funding.json
new file mode 100644
index 00000000..8bcd746f
--- /dev/null
+++ b/.well-known/funding-manifest-urls/funding.json
@@ -0,0 +1,87 @@
+{
+    "version": "v1.0.0",
+    "entity": {
+        "type": "individual",
+        "role": "maintainer",
+        "name": "Marco Vinciguerra",
+        "email": "mvincig11@gmail.com",
+        "phone": "",
+        "description": "I'm dedicated to advancing web scraping and data extraction through AI-powered tools, focusing on making data access more accessible and ethical. My mission is to create solutions that uphold digital freedoms and support open internet principles.",
+        "webpageUrl": {
+            "url": "https://scrapegraphai.com"
+        }
+    },
+    "projects": [
+        {
+            "guid": "scrapegraph-core",
+            "name": "ScrapeGraphAI Core",
+            "description": "An AI-powered web scraping framework that intelligently extracts structured data from websites with automatic pattern recognition, adaptive scraping strategies, and built-in rate limiting. Recognized as a top 200 open-source AI project globally.",
+            "webpageUrl": {
+                "url": "https://scrapegraphai.com/projects/core"
+            },
+            "repositoryUrl": {
+                "url": "https://github.com/ScrapeGraphAI/Scrapegraph-ai"
+            },
+            "licenses": ["spdx:MIT"],
+            "tags": ["web-scraping", "ai", "data-extraction", "python", "machine-learning", "open-source", "llm"]
+        }
+    ],
+    "funding": {
+        "channels": [
+            {
+                "guid": "mybank",
+                "type": "bank",
+                "address": "",
+                "description": "Will accept direct bank transfers. Please e-mail me for details."
+            },
+            {
+                "guid": "mypay",
+                "type": "payment-provider",
+                "address": "https://example.com/payme/@myid",
+                "description": "Pay with your debit/credit card through this gateway and set up recurring subscriptions."
+            }
+        ],
+        "plans": [
+            {
+                "guid": "infrastructure",
+                "status": "active",
+                "name": "Infrastructure Support",
+                "description": "Help cover monthly cloud infrastructure costs, including API servers, model hosting, and data storage.",
+                "amount": 750,
+                "currency": "USD",
+                "frequency": "monthly",
+                "channels": ["mybank"]
+            },
+            {
+                "guid": "developer-compensation",
+                "status": "active",
+                "name": "Developer Compensation",
+                "description": "Provides financial support for developers working on maintenance, updates, and feature additions for the projects.",
+                "amount": 2500,
+                "currency": "USD",
+                "frequency": "monthly",
+                "channels": ["mybank"]
+            },
+            {
+                "guid": "community-backer",
+                "status": "active",
+                "name": "Community Backer",
+                "description": "Support our open-source efforts with any contribution amount. Every donation helps!",
+                "amount": 5,
+                "currency": "USD",
+                "frequency": "monthly",
+                "channels": ["mypay"]
+            }
+        ],
+        "history": [
+            {
+                "year": 2024,
+                "income": 15000,
+                "expenses": 15000,
+                "taxes": 0,
+                "currency": "USD",
+                "description": "Experienced a temporary dip in donations, with improvements expected."
+ } + ] + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 7be1274a..5292abd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,50 @@ -## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29) +## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26) + + +### Features + +* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3)) +* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635)) +* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76)) +* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254)) +* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4)) +* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703)) +* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4)) +* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887)) + ### Bug Fixes +* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422)) +* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa)) +* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918)) + + +### chore + +* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361)) + + +### Test + +* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3)) +* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612)) + + +### CI + +* **release:** 1.26.6-beta.1 [skip ci] ([e0fc457](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e0fc457d1a850f3306d473fbde55dd800133b404)) +* **release:** 1.27.0-beta.1 [skip ci] ([9266a36](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9266a36b2efdf7027470d59aa14b654d68f7cb51)) +* **release:** 1.27.0-beta.10 [skip ci] ([eee131e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eee131e959a36a4471f72610eefbc1764808b6be)) +* **release:** 1.27.0-beta.2 [skip ci] ([d84d295](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d84d29538985ef8d04badfed547c6fdc73d7774d)) +* **release:** 1.27.0-beta.3 [skip ci] ([f576afa](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f576afaf0c1dd6d1dbf79fd5e642f6dca9dbe862)) +* **release:** 1.27.0-beta.4 [skip ci] 
([3d6bbcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3d6bbcdaa3828ff257adb22f2f7c1a46343de5b5))
+* **release:** 1.27.0-beta.5 [skip ci] ([5002c71](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5002c713d5a76b2c2e4313f888d9768e3f3142e1))
+* **release:** 1.27.0-beta.6 [skip ci] ([94b9836](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/94b9836ef6cd9c24bb8c04d7049d5477cc8ed807))
+* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
+* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
+* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
 
 * **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)
 
 ## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)
diff --git a/docs/source/getting_started/examples.rst b/docs/source/getting_started/examples.rst
index af746b26..1bed0a6e 100644
--- a/docs/source/getting_started/examples.rst
+++ b/docs/source/getting_started/examples.rst
@@ -22,7 +22,7 @@ OpenAI models
    graph_config = {
        "llm": {
            "api_key": openai_key,
-           "model": "openai/gpt-3.5-turbo",
+           "model": "openai/gpt-4o",
        },
    }
@@ -67,11 +67,6 @@ After that, you can run the following code, using only your machine resources br
           "format": "json", # Ollama needs the format to be specified explicitly
           "model_tokens": 2000, # depending on the model set context length
           "base_url": "http://localhost:11434", # set ollama URL of the local host (YOU CAN CHANGE IT, if you have a different endpoint
-       },
-       "embeddings": {
-           "model": "ollama/nomic-embed-text",
-           "temperature": 0,
-           "base_url": "http://localhost:11434", # set ollama URL
        }
    }
 
diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst
index a37bbacc..4e2bd604 100644
--- a/docs/source/introduction/overview.rst
+++ b/docs/source/introduction/overview.rst
@@ -32,12 +32,16 @@ OpenAI Models
 - GPT-3.5 Turbo (16,385 tokens)
 - GPT-4 (8,192 tokens)
 - GPT-4 Turbo Preview (128,000 tokens)
+- GPT-4o (128,000 tokens)
+- GPT-4o-mini (128,000 tokens)
 
 Azure OpenAI Models
 -------------------
 - GPT-3.5 Turbo (16,385 tokens)
 - GPT-4 (8,192 tokens)
 - GPT-4 Turbo Preview (128,000 tokens)
+- GPT-4o (128,000 tokens)
+- GPT-4o-mini (128,000 tokens)
 
 Google AI Models
 ----------------
diff --git a/pyproject.toml b/pyproject.toml
index 8a6d894a..810f871b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,7 @@
 [project]
 name = "scrapegraphai"
+
 version = "1.27.0b13"
diff --git a/scrapegraphai/__init__.py b/scrapegraphai/__init__.py
index 448d6511..52b4d951 100644
--- a/scrapegraphai/__init__.py
+++ b/scrapegraphai/__init__.py
@@ -1,3 +1,3 @@
 """
- __init__.py file for scrapegraphai folder
+__init__.py file for scrapegraphai folder
 """
diff --git a/scrapegraphai/builders/__init__.py b/scrapegraphai/builders/__init__.py
index 98520fcb..d01175db 100644
--- a/scrapegraphai/builders/__init__.py
+++ b/scrapegraphai/builders/__init__.py
@@ -1,5 +1,5 @@
 """
-__init__.py file for builders folder
+This module contains the builders for constructing various components in the ScrapeGraphAI application.
""" from .graph_builder import GraphBuilder diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 1010a6be..75049b09 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,4 +1,6 @@ -"""__init__.py file for docloaders folder""" +""" +This module handles document loading functionalities for the ScrapeGraphAI application. +""" from .chromium import ChromiumLoader from .browser_base import browser_base_fetch diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 2c75f0f7..516ecbb9 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -1,5 +1,5 @@ -""" -__init__.py file for graphs folder +""" +This module defines the graph structures and related functionalities for the ScrapeGraphAI application. """ from .abstract_graph import AbstractGraph diff --git a/scrapegraphai/graphs/document_scraper_graph.py b/scrapegraphai/graphs/document_scraper_graph.py index 39e54f4a..48664f7f 100644 --- a/scrapegraphai/graphs/document_scraper_graph.py +++ b/scrapegraphai/graphs/document_scraper_graph.py @@ -1,5 +1,5 @@ """ -md_scraper module +This module implements the Document Scraper Graph for the ScrapeGraphAI application. """ from typing import Optional import logging diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index c15bc065..be909ba2 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -1,5 +1,5 @@ """ -OmniScraperGraph Module +This module implements the Omni Scraper Graph for the ScrapeGraphAI application. """ from typing import Optional from pydantic import BaseModel diff --git a/scrapegraphai/helpers/__init__.py b/scrapegraphai/helpers/__init__.py index 97f0e5d5..a09f13bf 100644 --- a/scrapegraphai/helpers/__init__.py +++ b/scrapegraphai/helpers/__init__.py @@ -1,5 +1,5 @@ -""" -__init__.py for the helpers folder +""" +This module provides helper functions and utilities for the ScrapeGraphAI application. """ from .nodes_metadata import nodes_metadata from .schemas import graph_schema diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index ce798ad8..abafd224 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -1,5 +1,5 @@ """ - __init__.py file for models folder +This module contains the model definitions used in the ScrapeGraphAI application. """ from .openai_itt import OpenAIImageToText from .openai_tts import OpenAITextToSpeech diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index 8b0f8064..b3df81b6 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -1,5 +1,5 @@ -""" -BaseNode Module +""" +This module defines the base node class for the ScrapeGraphAI application. 
""" import re from abc import ABC, abstractmethod diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index d90864e9..06842ca4 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,4 +1,4 @@ -"""" +""" FetchNode Module """ import json diff --git a/scrapegraphai/prompts/description_node_prompts.py b/scrapegraphai/prompts/description_node_prompts.py index 944ed24e..802ba247 100644 --- a/scrapegraphai/prompts/description_node_prompts.py +++ b/scrapegraphai/prompts/description_node_prompts.py @@ -1,5 +1,5 @@ """ -description node prompts +This module contains prompts for description nodes in the ScrapeGraphAI application. """ DESCRIPTION_NODE_PROMPT = """ diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 2ec3b140..9b00f61c 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -60,13 +60,18 @@ def minify_html(html): """ minify_html function """ - html = re.sub(r'', '', html, flags=re.DOTALL) - - html = re.sub(r'>\s+<', '><', html) - html = re.sub(r'\s+>', '>', html) - html = re.sub(r'<\s+', '<', html) - html = re.sub(r'\s+', ' ', html) - html = re.sub(r'\s*=\s*', '=', html) + # Combine multiple regex operations into one for better performance + patterns = [ + (r'', '', re.DOTALL), + (r'>\s+<', '><', 0), + (r'\s+>', '>', 0), + (r'<\s+', '<', 0), + (r'\s+', ' ', 0), + (r'\s*=\s*', '=', 0) + ] + + for pattern, repl, flags in patterns: + html = re.sub(pattern, repl, html, flags=flags) return html.strip() diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py index a35370ab..2ec7cee2 100644 --- a/scrapegraphai/utils/copy.py +++ b/scrapegraphai/utils/copy.py @@ -30,56 +30,38 @@ def is_boto3_client(obj): def safe_deepcopy(obj: Any) -> Any: """ - Attempts to create a deep copy of the object using `copy.deepcopy` - whenever possible. If that fails, it falls back to custom deep copy - logic. If that also fails, it raises a `DeepCopyError`. - + Safely create a deep copy of an object, handling special cases. + Args: - obj (Any): The object to be copied, which can be of any type. - + obj: Object to copy + Returns: - Any: A deep copy of the object if possible; otherwise, a shallow - copy if deep copying fails; if neither is possible, the original - object is returned. + Deep copy of the object + Raises: - DeepCopyError: If the object cannot be deep-copied or shallow-copied. 
+        DeepCopyError: If object cannot be deep copied
     """
-    try:
-
-        return copy.deepcopy(obj)
-    except (TypeError, AttributeError) as e:
-
+    try:
+        # Handle special cases first
+        if obj is None or isinstance(obj, (str, int, float, bool)):
+            return obj
+
+        if isinstance(obj, (list, set)):
+            return type(obj)(safe_deepcopy(v) for v in obj)
+
         if isinstance(obj, dict):
-            new_obj = {}
-
-            for k, v in obj.items():
-                new_obj[k] = safe_deepcopy(v)
-            return new_obj
-
-        elif isinstance(obj, list):
-            new_obj = []
-
-            for v in obj:
-                new_obj.append(safe_deepcopy(v))
-            return new_obj
-
-        elif isinstance(obj, tuple):
-            new_obj = tuple(safe_deepcopy(v) for v in obj)
-
-            return new_obj
-
-        elif isinstance(obj, frozenset):
-            new_obj = frozenset(safe_deepcopy(v) for v in obj)
-            return new_obj
-
-        elif is_boto3_client(obj):
+            return {k: safe_deepcopy(v) for k, v in obj.items()}
+
+        if isinstance(obj, tuple):
+            return tuple(safe_deepcopy(v) for v in obj)
+
+        if isinstance(obj, frozenset):
+            return frozenset(safe_deepcopy(v) for v in obj)
+
+        if is_boto3_client(obj):
             return obj
-
-        else:
-            try:
-                return copy.copy(obj)
-            except (TypeError, AttributeError):
-                raise DeepCopyError(
-                    f"Cannot deep copy the object of type {type(obj)}"
-                ) from e
+
+        return copy.copy(obj)
+
+    except Exception as e:
+        raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index af351ad4..86f9f5f3 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -9,101 +9,97 @@ from bs4 import BeautifulSoup
 
 def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
+                  max_results: int = 10, port: int = 8080, timeout: int = 10,
                   proxy: str | dict = None) -> List[str]:
+    """Search web function with improved error handling and validation"""
+
+    # Input validation
+    if not query or not isinstance(query, str):
+        raise ValueError("Query must be a non-empty string")
+
+    search_engine = search_engine.lower()
+    valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+    if search_engine not in valid_engines:
+        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+
+    # Format proxy once
+    formatted_proxy = None
+    if proxy:
+        formatted_proxy = format_proxy(proxy)
+
+    try:
+        results = []
+        if search_engine == "google":
+            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
+
+        elif search_engine == "duckduckgo":
+            research = DuckDuckGoSearchResults(max_results=max_results)
+            res = research.run(query)
+            results = re.findall(r'https?://[^\s,\]]+', res)
+
+        elif search_engine == "bing":
+            results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+        elif search_engine == "searxng":
+            results = _search_searxng(query, max_results, port, timeout)
+
+        return filter_pdf_links(results)
+
+    except requests.Timeout:
+        raise TimeoutError(f"Search request timed out after {timeout} seconds")
+    except requests.RequestException as e:
+        raise RuntimeError(f"Search request failed: {str(e)}")
+
+def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
+    """Helper function for Bing search"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    search_url = f"https://www.bing.com/search?q={query}"
+
+    proxies = {"http": proxy, "https": proxy} if proxy else None
+    response = requests.get(search_url,
headers=headers, timeout=timeout, proxies=proxies) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)] + +def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: + """Helper function for SearXNG search""" + url = f"http://localhost:{port}" + params = { + "q": query, + "format": "json", + "engines": "google,duckduckgo,brave,qwant,bing" + } + response = requests.get(url, params=params, timeout=timeout) + response.raise_for_status() + return [result['url'] for result in response.json().get("results", [])[:max_results]] + +def format_proxy(proxy): + if isinstance(proxy, dict): + server = proxy.get('server') + username = proxy.get('username') + password = proxy.get('password') + + if all([username, password, server]): + proxy_url = f"http://{username}:{password}@{server}" + return proxy_url + else: + raise ValueError("Proxy dictionary is missing required fields.") + elif isinstance(proxy, str): + return proxy # "https://username:password@ip:port" + else: + raise TypeError("Proxy should be a dictionary or a string.") + +def filter_pdf_links(links: List[str]) -> List[str]: """ - Searches the web for a given query using specified search - engine options and filters out PDF links. + Filters out any links that point to PDF files. Args: - query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, - options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. - max_results (int, optional): The maximum number of search results to return. - port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. - timeout (int, optional): The number of seconds to wait - for a response from a request. Default is 10 seconds. - proxy (dict or string, optional): The proxy server to use for the request. Default is None. + links (List[str]): A list of URLs as strings. Returns: - List[str]: A list of URLs as strings that are the search results, excluding any PDF links. - - Raises: - ValueError: If the search engine specified is not supported. - requests.exceptions.Timeout: If the request times out. - - Example: - >>> search_on_web("example query", search_engine="Google", max_results=5) - ['http://example.com', 'http://example.org', ...] + List[str]: A list of URLs excluding any that end with '.pdf'. """ - - def format_proxy(proxy): - if isinstance(proxy, dict): - server = proxy.get('server') - username = proxy.get('username') - password = proxy.get('password') - - if all([username, password, server]): - proxy_url = f"http://{username}:{password}@{server}" - return proxy_url - else: - raise ValueError("Proxy dictionary is missing required fields.") - elif isinstance(proxy, str): - return proxy # "https://username:password@ip:port" - else: - raise TypeError("Proxy should be a dictionary or a string.") - - def filter_pdf_links(links: List[str]) -> List[str]: - """ - Filters out any links that point to PDF files. - - Args: - links (List[str]): A list of URLs as strings. - - Returns: - List[str]: A list of URLs excluding any that end with '.pdf'. 
- """ - return [link for link in links if not link.lower().endswith('.pdf')] - - if proxy: - proxy = format_proxy(proxy) - - if search_engine.lower() == "google": - res = [] - for url in google_search(query, num_results=max_results, proxy=proxy): - res.append(url) - return filter_pdf_links(res) - - elif search_engine.lower() == "duckduckgo": - research = DuckDuckGoSearchResults(max_results=max_results) - res = research.run(query) - links = re.findall(r'https?://[^\s,\]]+', res) - return filter_pdf_links(links) - - elif search_engine.lower() == "bing": - headers = { - "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64) - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36""" - } - search_url = f"https://www.bing.com/search?q={query}" - response = requests.get(search_url, headers=headers, timeout=timeout) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - search_results = [] - for result in soup.find_all('li', class_='b_algo', limit=max_results): - link = result.find('a')['href'] - search_results.append(link) - return filter_pdf_links(search_results) - - elif search_engine.lower() == "searxng": - url = f"http://localhost:{port}" - params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"} - response = requests.get(url, params=params, timeout=timeout) - data = response.json() - limited_results = [result['url'] for result in data["results"][:max_results]] - return filter_pdf_links(limited_results) - - else: - raise ValueError("""The only search engines available are - DuckDuckGo, Google, Bing, or SearXNG""") + return [link for link in links if not link.lower().endswith('.pdf')]