Skip to content

Commit 3d47100

Browse files
authored
Upgrade and improve DuckDuckGo search implementation (#28)
* Upgrade and improve DuckDuckGo search implementation
  - Upgrade duckduckgo-search to v7.2.1
  - Simplify implementation to use default DDGS configuration
  - Remove deprecated backend='api' parameter
  - Standardize result field names (href, body)
  - Update unit tests to match new implementation
  - Add comprehensive testing with diverse queries:
    - Different languages (Chinese, Japanese)
    - Technical queries
    - Special characters
    - Short and long queries
    - Emojis and Unicode

  All tests passing, improved reliability and search quality.
* Update the .cursorfiles again.
1 parent 86186fd commit 3d47100

File tree

4 files changed

+41
-82
lines changed

4 files changed

+41
-82
lines changed

.cursorrules

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ If needed, you can further use the `web_scraper.py` file to scrape the web page
6060
- You have a python venv in ./venv. Use it.
6161
- Include info useful for debugging in the program output.
6262
- Read the file before you try to edit it.
63+
- Due to Cursor's limitations, when you use `git` or `gh` and need to submit a multiline commit message, first write the message to a file, then use `git commit -F <filename>` or a similar command to commit, and then remove the file. Include "[Cursor] " in the commit message and PR title.
6364

6465
## Cursor learned
6566

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ playwright>=1.41.0
33
html5lib>=1.1
44

55
# Search engine
6-
duckduckgo-search>=4.1.1
6+
duckduckgo-search>=7.2.1
77

88
# LLM integration
99
openai>=1.59.8 # o1 support

tests/test_search_engine.py

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ def test_successful_search(self, mock_ddgs):
2424
# Mock search results
2525
mock_results = [
2626
{
27-
'link': 'http://example.com',
27+
'href': 'http://example.com',
2828
'title': 'Example Title',
29-
'snippet': 'Example Snippet'
29+
'body': 'Example Body'
3030
},
3131
{
3232
'href': 'http://example2.com',
@@ -44,7 +44,7 @@ def test_successful_search(self, mock_ddgs):
4444
search("test query", max_results=2)
4545

4646
# Check debug output
47-
expected_debug = "DEBUG: Attempt 1/3 - Searching for query: test query"
47+
expected_debug = "DEBUG: Searching for query: test query (attempt 1/3)"
4848
self.assertIn(expected_debug, self.stderr.getvalue())
4949
self.assertIn("DEBUG: Found 2 results", self.stderr.getvalue())
5050

@@ -53,7 +53,7 @@ def test_successful_search(self, mock_ddgs):
5353
self.assertIn("=== Result 1 ===", output)
5454
self.assertIn("URL: http://example.com", output)
5555
self.assertIn("Title: Example Title", output)
56-
self.assertIn("Snippet: Example Snippet", output)
56+
self.assertIn("Snippet: Example Body", output)
5757
self.assertIn("=== Result 2 ===", output)
5858
self.assertIn("URL: http://example2.com", output)
5959
self.assertIn("Title: Example Title 2", output)
@@ -62,8 +62,7 @@ def test_successful_search(self, mock_ddgs):
6262
# Verify mock was called correctly
6363
mock_ddgs_instance.__enter__.return_value.text.assert_called_once_with(
6464
"test query",
65-
max_results=2,
66-
backend='api'
65+
max_results=2
6766
)
6867

6968
@patch('tools.search_engine.DDGS')
@@ -97,32 +96,23 @@ def test_search_error(self, mock_ddgs):
9796
self.assertIn("ERROR: Search failed: Test error", self.stderr.getvalue())
9897

9998
def test_result_field_fallbacks(self):
100-
# Test that the fallback fields work correctly
101-
result = {
102-
'link': 'http://example.com',
103-
'title': 'Example Title',
104-
'snippet': 'Example Snippet'
105-
}
106-
107-
# Test primary fields
108-
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com')
109-
self.assertEqual(result.get('title', 'N/A'), 'Example Title')
110-
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Snippet')
111-
112-
# Test fallback fields
99+
# Test that the fields work correctly with N/A fallback
113100
result = {
114101
'href': 'http://example.com',
115102
'title': 'Example Title',
116103
'body': 'Example Body'
117104
}
118-
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'http://example.com')
119-
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'Example Body')
105+
106+
# Test fields present
107+
self.assertEqual(result.get('href', 'N/A'), 'http://example.com')
108+
self.assertEqual(result.get('title', 'N/A'), 'Example Title')
109+
self.assertEqual(result.get('body', 'N/A'), 'Example Body')
120110

121111
# Test missing fields
122112
result = {}
123-
self.assertEqual(result.get('link', result.get('href', 'N/A')), 'N/A')
113+
self.assertEqual(result.get('href', 'N/A'), 'N/A')
124114
self.assertEqual(result.get('title', 'N/A'), 'N/A')
125-
self.assertEqual(result.get('snippet', result.get('body', 'N/A')), 'N/A')
115+
self.assertEqual(result.get('body', 'N/A'), 'N/A')
126116

127117
if __name__ == '__main__':
128118
unittest.main()

tools/search_engine.py

Lines changed: 26 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,110 +2,78 @@
22

33
import argparse
44
import sys
5-
import traceback
65
import time
7-
import random
86
from duckduckgo_search import DDGS
9-
from duckduckgo_search.exceptions import DuckDuckGoSearchException
107

11-
def get_random_user_agent():
12-
"""Return a random User-Agent string."""
13-
user_agents = [
14-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
15-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
16-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
17-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
18-
]
19-
return random.choice(user_agents)
20-
21-
def search_with_retry(query, max_results=10, max_retries=3, initial_delay=2):
8+
def search_with_retry(query, max_results=10, max_retries=3):
229
"""
23-
Perform search with retry mechanism.
10+
Search using DuckDuckGo and return results with URLs and text snippets.
2411
2512
Args:
2613
query (str): Search query
2714
max_results (int): Maximum number of results to return
2815
max_retries (int): Maximum number of retry attempts
29-
initial_delay (int): Initial delay between retries in seconds
3016
"""
3117
for attempt in range(max_retries):
3218
try:
33-
headers = {'User-Agent': get_random_user_agent()}
34-
35-
print(f"DEBUG: Attempt {attempt + 1}/{max_retries} - Searching for query: {query}",
19+
print(f"DEBUG: Searching for query: {query} (attempt {attempt + 1}/{max_retries})",
3620
file=sys.stderr)
3721

38-
with DDGS(headers=headers) as ddgs:
39-
# Try API backend first, fallback to HTML if needed
40-
try:
41-
results = list(ddgs.text(
42-
query,
43-
max_results=max_results,
44-
backend='api'
45-
))
46-
except DuckDuckGoSearchException as api_error:
47-
print(f"DEBUG: API backend failed, trying HTML backend: {str(api_error)}",
48-
file=sys.stderr)
49-
# Add delay before trying HTML backend
50-
time.sleep(1)
51-
results = list(ddgs.text(
52-
query,
53-
max_results=max_results,
54-
backend='html'
55-
))
56-
57-
if not results:
58-
print("DEBUG: No results found", file=sys.stderr)
59-
return []
22+
with DDGS() as ddgs:
23+
results = list(ddgs.text(query, max_results=max_results))
6024

61-
print(f"DEBUG: Found {len(results)} results", file=sys.stderr)
62-
return results
25+
if not results:
26+
print("DEBUG: No results found", file=sys.stderr)
27+
return []
28+
29+
print(f"DEBUG: Found {len(results)} results", file=sys.stderr)
30+
return results
6331

6432
except Exception as e:
65-
print(f"ERROR: Attempt {attempt + 1} failed: {str(e)}", file=sys.stderr)
66-
if attempt < max_retries - 1:
67-
delay = initial_delay * (attempt + 1) + random.random() * 2
68-
print(f"DEBUG: Waiting {delay:.2f} seconds before retry...", file=sys.stderr)
69-
time.sleep(delay)
33+
print(f"ERROR: Attempt {attempt + 1}/{max_retries} failed: {str(e)}", file=sys.stderr)
34+
if attempt < max_retries - 1: # If not the last attempt
35+
print(f"DEBUG: Waiting 1 second before retry...", file=sys.stderr)
36+
time.sleep(1) # Wait 1 second before retry
7037
else:
71-
print("ERROR: All retry attempts failed", file=sys.stderr)
38+
print(f"ERROR: All {max_retries} attempts failed", file=sys.stderr)
7239
raise
7340

7441
def format_results(results):
7542
"""Format and print search results."""
7643
for i, r in enumerate(results, 1):
7744
print(f"\n=== Result {i} ===")
78-
print(f"URL: {r.get('link', r.get('href', 'N/A'))}")
45+
print(f"URL: {r.get('href', 'N/A')}")
7946
print(f"Title: {r.get('title', 'N/A')}")
80-
print(f"Snippet: {r.get('snippet', r.get('body', 'N/A'))}")
47+
print(f"Snippet: {r.get('body', 'N/A')}")
8148

82-
def search(query, max_results=10):
49+
def search(query, max_results=10, max_retries=3):
8350
"""
84-
Main search function that handles both API and HTML backends with retry mechanism.
51+
Main search function that handles search with retry mechanism.
8552
8653
Args:
8754
query (str): Search query
8855
max_results (int): Maximum number of results to return
56+
max_retries (int): Maximum number of retry attempts
8957
"""
9058
try:
91-
results = search_with_retry(query, max_results)
59+
results = search_with_retry(query, max_results, max_retries)
9260
if results:
9361
format_results(results)
9462

9563
except Exception as e:
9664
print(f"ERROR: Search failed: {str(e)}", file=sys.stderr)
97-
print(f"ERROR type: {type(e)}", file=sys.stderr)
98-
traceback.print_exc(file=sys.stderr)
9965
sys.exit(1)
10066

10167
def main():
102-
parser = argparse.ArgumentParser(description="Search using DuckDuckGo with fallback mechanisms")
68+
parser = argparse.ArgumentParser(description="Search using DuckDuckGo API")
10369
parser.add_argument("query", help="Search query")
10470
parser.add_argument("--max-results", type=int, default=10,
10571
help="Maximum number of results (default: 10)")
72+
parser.add_argument("--max-retries", type=int, default=3,
73+
help="Maximum number of retry attempts (default: 3)")
10674

10775
args = parser.parse_args()
108-
search(args.query, args.max_results)
76+
search(args.query, args.max_results, args.max_retries)
10977

11078
if __name__ == "__main__":
11179
main()

0 commit comments

Comments
 (0)