diff --git a/.cursorrules b/.cursorrules index f34c6ba..dfe3424 100644 --- a/.cursorrules +++ b/.cursorrules @@ -60,6 +60,7 @@ If needed, you can further use the `web_scraper.py` file to scrape the web page - You have a python venv in ./venv. Use it. - Include info useful for debugging in the program output. - Read the file before you try to edit it. +- Due to Cursor's limit, when you use `git` and `gh` and need to submit a multiline commit message, first write the message in a file, and then use `git commit -F <filename>` or similar command to commit. And then remove the file. Include "[Cursor] " in the commit message and PR title. ## Cursor learned diff --git a/requirements.txt b/requirements.txt index 78798a6..22936cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ playwright>=1.41.0 html5lib>=1.1 # Search engine -duckduckgo-search>=4.1.1 +duckduckgo-search>=7.2.1 # LLM integration openai>=1.59.8 # o1 support diff --git a/tests/test_search_engine.py b/tests/test_search_engine.py index 0e148c3..e7c6023 100644 --- a/tests/test_search_engine.py +++ b/tests/test_search_engine.py @@ -24,9 +24,9 @@ def test_successful_search(self, mock_ddgs): # Mock search results mock_results = [ { - 'link': 'http://example.com', + 'href': 'http://example.com', 'title': 'Example Title', - 'snippet': 'Example Snippet' + 'body': 'Example Body' }, { 'href': 'http://example2.com', @@ -44,7 +44,7 @@ def test_successful_search(self, mock_ddgs): search("test query", max_results=2) # Check debug output - expected_debug = "DEBUG: Attempt 1/3 - Searching for query: test query" + expected_debug = "DEBUG: Searching for query: test query (attempt 1/3)" self.assertIn(expected_debug, self.stderr.getvalue()) self.assertIn("DEBUG: Found 2 results", self.stderr.getvalue()) @@ -53,7 +53,7 @@ def test_successful_search(self, mock_ddgs): self.assertIn("=== Result 1 ===", output) self.assertIn("URL: http://example.com", output) self.assertIn("Title: Example Title", output) - 
self.assertIn("Snippet: Example Snippet", output) + self.assertIn("Snippet: Example Body", output) self.assertIn("=== Result 2 ===", output) self.assertIn("URL: http://example2.com", output) self.assertIn("Title: Example Title 2", output) @@ -62,8 +62,7 @@ def test_successful_search(self, mock_ddgs): # Verify mock was called correctly mock_ddgs_instance.__enter__.return_value.text.assert_called_once_with( "test query", - max_results=2, - backend='api' + max_results=2 ) @patch('tools.search_engine.DDGS') @@ -97,32 +96,23 @@ def test_search_error(self, mock_ddgs): self.assertIn("ERROR: Search failed: Test error", self.stderr.getvalue()) def test_result_field_fallbacks(self): - # Test that the fallback fields work correctly - result = { - 'link': 'http://example.com', - 'title': 'Example Title', - 'snippet': 'Example Snippet' - } - - # Test primary fields - self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com') - self.assertEqual(result.get('title', 'N/A'), 'Example Title') - self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Snippet') - - # Test fallback fields + # Test that the fields work correctly with N/A fallback result = { 'href': 'http://example.com', 'title': 'Example Title', 'body': 'Example Body' } - self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com') - self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Body') + + # Test fields present + self.assertEqual(result.get('href', 'N/A'), 'http://example.com') + self.assertEqual(result.get('title', 'N/A'), 'Example Title') + self.assertEqual(result.get('body', 'N/A'), 'Example Body') # Test missing fields result = {} - self.assertEqual(result.get('link', result.get('href', 'N/A')), 'N/A') + self.assertEqual(result.get('href', 'N/A'), 'N/A') self.assertEqual(result.get('title', 'N/A'), 'N/A') - self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'N/A') + 
self.assertEqual(result.get('body', 'N/A'), 'N/A') if __name__ == '__main__': unittest.main() diff --git a/tools/search_engine.py b/tools/search_engine.py index 0e7e9c9..120544e 100755 --- a/tools/search_engine.py +++ b/tools/search_engine.py @@ -2,110 +2,78 @@ import argparse import sys -import traceback import time -import random from duckduckgo_search import DDGS -from duckduckgo_search.exceptions import DuckDuckGoSearchException -def get_random_user_agent(): - """Return a random User-Agent string.""" - user_agents = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - ] - return random.choice(user_agents) - -def search_with_retry(query, max_results=10, max_retries=3, initial_delay=2): +def search_with_retry(query, max_results=10, max_retries=3): """ - Perform search with retry mechanism. + Search using DuckDuckGo and return results with URLs and text snippets. 
Args: query (str): Search query max_results (int): Maximum number of results to return max_retries (int): Maximum number of retry attempts - initial_delay (int): Initial delay between retries in seconds """ for attempt in range(max_retries): try: - headers = {'User-Agent': get_random_user_agent()} - - print(f"DEBUG: Attempt {attempt + 1}/{max_retries} - Searching for query: {query}", + print(f"DEBUG: Searching for query: {query} (attempt {attempt + 1}/{max_retries})", file=sys.stderr) - with DDGS(headers=headers) as ddgs: - # Try API backend first, fallback to HTML if needed - try: - results = list(ddgs.text( - query, - max_results=max_results, - backend='api' - )) - except DuckDuckGoSearchException as api_error: - print(f"DEBUG: API backend failed, trying HTML backend: {str(api_error)}", - file=sys.stderr) - # Add delay before trying HTML backend - time.sleep(1) - results = list(ddgs.text( - query, - max_results=max_results, - backend='html' - )) - - if not results: - print("DEBUG: No results found", file=sys.stderr) - return [] + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=max_results)) - print(f"DEBUG: Found {len(results)} results", file=sys.stderr) - return results + if not results: + print("DEBUG: No results found", file=sys.stderr) + return [] + + print(f"DEBUG: Found {len(results)} results", file=sys.stderr) + return results except Exception as e: - print(f"ERROR: Attempt {attempt + 1} failed: {str(e)}", file=sys.stderr) - if attempt < max_retries - 1: - delay = initial_delay * (attempt + 1) + random.random() * 2 - print(f"DEBUG: Waiting {delay:.2f} seconds before retry...", file=sys.stderr) - time.sleep(delay) + print(f"ERROR: Attempt {attempt + 1}/{max_retries} failed: {str(e)}", file=sys.stderr) + if attempt < max_retries - 1: # If not the last attempt + print(f"DEBUG: Waiting 1 second before retry...", file=sys.stderr) + time.sleep(1) # Wait 1 second before retry else: - print("ERROR: All retry attempts failed", file=sys.stderr) + 
print(f"ERROR: All {max_retries} attempts failed", file=sys.stderr) raise def format_results(results): """Format and print search results.""" for i, r in enumerate(results, 1): print(f"\n=== Result {i} ===") - print(f"URL: {r.get('link', r.get('href', 'N/A'))}") + print(f"URL: {r.get('href', 'N/A')}") print(f"Title: {r.get('title', 'N/A')}") - print(f"Snippet: {r.get('snippet', r.get('body', 'N/A'))}") + print(f"Snippet: {r.get('body', 'N/A')}") -def search(query, max_results=10): +def search(query, max_results=10, max_retries=3): """ - Main search function that handles both API and HTML backends with retry mechanism. + Main search function that handles search with retry mechanism. Args: query (str): Search query max_results (int): Maximum number of results to return + max_retries (int): Maximum number of retry attempts """ try: - results = search_with_retry(query, max_results) + results = search_with_retry(query, max_results, max_retries) if results: format_results(results) except Exception as e: print(f"ERROR: Search failed: {str(e)}", file=sys.stderr) - print(f"ERROR type: {type(e)}", file=sys.stderr) - traceback.print_exc(file=sys.stderr) sys.exit(1) def main(): - parser = argparse.ArgumentParser(description="Search using DuckDuckGo with fallback mechanisms") + parser = argparse.ArgumentParser(description="Search using DuckDuckGo API") parser.add_argument("query", help="Search query") parser.add_argument("--max-results", type=int, default=10, help="Maximum number of results (default: 10)") + parser.add_argument("--max-retries", type=int, default=3, + help="Maximum number of retry attempts (default: 3)") args = parser.parse_args() - search(args.query, args.max_results) + search(args.query, args.max_results, args.max_retries) if __name__ == "__main__": main()