Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cursorrules
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ If needed, you can further use the `web_scraper.py` file to scrape the web page
- You have a python venv in ./venv. Use it.
- Include info useful for debugging in the program output.
- Read the file before you try to edit it.
- Due to Cursor's limitations, when you use `git` or `gh` and need to submit a multiline commit message, first write the message to a file, then commit with `git commit -F <filename>` (or a similar command), and finally remove the file. Include "[Cursor] " in the commit message and PR title.

## Cursor learned

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ playwright>=1.41.0
html5lib>=1.1

# Search engine
duckduckgo-search>=4.1.1
duckduckgo-search>=7.2.1

# LLM integration
openai>=1.59.8 # o1 support
Expand Down
36 changes: 13 additions & 23 deletions tests/test_search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def test_successful_search(self, mock_ddgs):
# Mock search results
mock_results = [
{
'link': 'http://example.com',
'href': 'http://example.com',
'title': 'Example Title',
'snippet': 'Example Snippet'
'body': 'Example Body'
},
{
'href': 'http://example2.com',
Expand All @@ -44,7 +44,7 @@ def test_successful_search(self, mock_ddgs):
search("test query", max_results=2)

# Check debug output
expected_debug = "DEBUG: Attempt 1/3 - Searching for query: test query"
expected_debug = "DEBUG: Searching for query: test query (attempt 1/3)"
self.assertIn(expected_debug, self.stderr.getvalue())
self.assertIn("DEBUG: Found 2 results", self.stderr.getvalue())

Expand All @@ -53,7 +53,7 @@ def test_successful_search(self, mock_ddgs):
self.assertIn("=== Result 1 ===", output)
self.assertIn("URL: http://example.com", output)
self.assertIn("Title: Example Title", output)
self.assertIn("Snippet: Example Snippet", output)
self.assertIn("Snippet: Example Body", output)
self.assertIn("=== Result 2 ===", output)
self.assertIn("URL: http://example2.com", output)
self.assertIn("Title: Example Title 2", output)
Expand All @@ -62,8 +62,7 @@ def test_successful_search(self, mock_ddgs):
# Verify mock was called correctly
mock_ddgs_instance.__enter__.return_value.text.assert_called_once_with(
"test query",
max_results=2,
backend='api'
max_results=2
)

@patch('tools.search_engine.DDGS')
Expand Down Expand Up @@ -97,32 +96,23 @@ def test_search_error(self, mock_ddgs):
self.assertIn("ERROR: Search failed: Test error", self.stderr.getvalue())

def test_result_field_fallbacks(self):
# Test that the fallback fields work correctly
result = {
'link': 'http://example.com',
'title': 'Example Title',
'snippet': 'Example Snippet'
}

# Test primary fields
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com')
self.assertEqual(result.get('title', 'N/A'), 'Example Title')
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Snippet')

# Test fallback fields
# Test that the fields work correctly with N/A fallback
result = {
'href': 'http://example.com',
'title': 'Example Title',
'body': 'Example Body'
}
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com')
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Body')

# Test fields present
self.assertEqual(result.get('href', 'N/A'), 'http://example.com')
self.assertEqual(result.get('title', 'N/A'), 'Example Title')
self.assertEqual(result.get('body', 'N/A'), 'Example Body')

# Test missing fields
result = {}
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'N/A')
self.assertEqual(result.get('href', 'N/A'), 'N/A')
self.assertEqual(result.get('title', 'N/A'), 'N/A')
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'N/A')
self.assertEqual(result.get('body', 'N/A'), 'N/A')

if __name__ == '__main__':
unittest.main()
84 changes: 26 additions & 58 deletions tools/search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,110 +2,78 @@

import argparse
import sys
import traceback
import time
import random
from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import DuckDuckGoSearchException

def get_random_user_agent():
"""Return a random User-Agent string."""
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]
return random.choice(user_agents)

def search_with_retry(query, max_results=10, max_retries=3, initial_delay=2):
def search_with_retry(query, max_results=10, max_retries=3):
"""
Perform search with retry mechanism.
Search using DuckDuckGo and return results with URLs and text snippets.

Args:
query (str): Search query
max_results (int): Maximum number of results to return
max_retries (int): Maximum number of retry attempts
initial_delay (int): Initial delay between retries in seconds
"""
for attempt in range(max_retries):
try:
headers = {'User-Agent': get_random_user_agent()}

print(f"DEBUG: Attempt {attempt + 1}/{max_retries} - Searching for query: {query}",
print(f"DEBUG: Searching for query: {query} (attempt {attempt + 1}/{max_retries})",
file=sys.stderr)

with DDGS(headers=headers) as ddgs:
# Try API backend first, fallback to HTML if needed
try:
results = list(ddgs.text(
query,
max_results=max_results,
backend='api'
))
except DuckDuckGoSearchException as api_error:
print(f"DEBUG: API backend failed, trying HTML backend: {str(api_error)}",
file=sys.stderr)
# Add delay before trying HTML backend
time.sleep(1)
results = list(ddgs.text(
query,
max_results=max_results,
backend='html'
))

if not results:
print("DEBUG: No results found", file=sys.stderr)
return []
with DDGS() as ddgs:
results = list(ddgs.text(query, max_results=max_results))

print(f"DEBUG: Found {len(results)} results", file=sys.stderr)
return results
if not results:
print("DEBUG: No results found", file=sys.stderr)
return []

print(f"DEBUG: Found {len(results)} results", file=sys.stderr)
return results

except Exception as e:
print(f"ERROR: Attempt {attempt + 1} failed: {str(e)}", file=sys.stderr)
if attempt < max_retries - 1:
delay = initial_delay * (attempt + 1) + random.random() * 2
print(f"DEBUG: Waiting {delay:.2f} seconds before retry...", file=sys.stderr)
time.sleep(delay)
print(f"ERROR: Attempt {attempt + 1}/{max_retries} failed: {str(e)}", file=sys.stderr)
if attempt < max_retries - 1: # If not the last attempt
print(f"DEBUG: Waiting 1 second before retry...", file=sys.stderr)
time.sleep(1) # Wait 1 second before retry
else:
print("ERROR: All retry attempts failed", file=sys.stderr)
print(f"ERROR: All {max_retries} attempts failed", file=sys.stderr)
raise

def format_results(results):
"""Format and print search results."""
for i, r in enumerate(results, 1):
print(f"\n=== Result {i} ===")
print(f"URL: {r.get('link', r.get('href', 'N/A'))}")
print(f"URL: {r.get('href', 'N/A')}")
print(f"Title: {r.get('title', 'N/A')}")
print(f"Snippet: {r.get('snippet', r.get('body', 'N/A'))}")
print(f"Snippet: {r.get('body', 'N/A')}")

def search(query, max_results=10):
def search(query, max_results=10, max_retries=3):
"""
Main search function that handles both API and HTML backends with retry mechanism.
Main search function that handles search with retry mechanism.

Args:
query (str): Search query
max_results (int): Maximum number of results to return
max_retries (int): Maximum number of retry attempts
"""
try:
results = search_with_retry(query, max_results)
results = search_with_retry(query, max_results, max_retries)
if results:
format_results(results)

except Exception as e:
print(f"ERROR: Search failed: {str(e)}", file=sys.stderr)
print(f"ERROR type: {type(e)}", file=sys.stderr)
traceback.print_exc(file=sys.stderr)
sys.exit(1)

def main():
parser = argparse.ArgumentParser(description="Search using DuckDuckGo with fallback mechanisms")
parser = argparse.ArgumentParser(description="Search using DuckDuckGo API")
parser.add_argument("query", help="Search query")
parser.add_argument("--max-results", type=int, default=10,
help="Maximum number of results (default: 10)")
parser.add_argument("--max-retries", type=int, default=3,
help="Maximum number of retry attempts (default: 3)")

args = parser.parse_args()
search(args.query, args.max_results)
search(args.query, args.max_results, args.max_retries)

if __name__ == "__main__":
main()