import asyncio
import urllib.parse

import httpx
from bs4 import BeautifulSoup
from ddgs import DDGS
from starlette.types import Receive, Scope, Send
2829
# Module-level logger, namespaced to this module per stdlib logging convention.
logger = logging.getLogger(__name__)
3031
3132
3233def _get_proxy_url () -> Optional [str ]:
33- """Build proxy URL from PROXY_USERNAME and PROXY_PASSWORD env vars."""
34+ """Build proxy URL from environment variables.
35+
36+ Env vars:
37+ PROXY_USERNAME, PROXY_PASSWORD – required
38+ PROXY_HOST – default: p.webshare.io
39+ PROXY_PORT – default: 1080
40+ PROXY_SCHEME – default: socks5
41+ """
3442 username = os .environ .get ("PROXY_USERNAME" )
3543 password = os .environ .get ("PROXY_PASSWORD" )
36- if username and password :
37- return f"http://{ username } :{ password } @p.webshare.io:80/"
38- return None
44+ if not (username and password ):
45+ return None
46+ host = os .environ .get ("PROXY_HOST" , "p.webshare.io" )
47+ scheme = os .environ .get ("PROXY_SCHEME" , "http" )
48+ port = os .environ .get ("PROXY_PORT" , "1080" if "socks" in scheme else "80" )
49+ return f"{ scheme } ://{ username } :{ password } @{ host } :{ port } "
3950
4051
4152# DuckDuckGo does not require authentication, but we follow the auth extraction
@@ -87,10 +98,8 @@ class SearchResult:
8798
8899
class DuckDuckGoSearcher:
    """DuckDuckGo text search backed by the duckduckgo-search (ddgs) library.

    The library's primp browser impersonation avoids the CAPTCHAs that raw
    httpx requests against html.duckduckgo.com trigger.
    """
94103
95104 def format_results_for_llm (self , results : List [SearchResult ]) -> str :
96105 if not results :
@@ -111,71 +120,27 @@ async def search(
111120 self , query : str , ctx : Context , max_results : int = 10
112121 ) -> List [SearchResult ]:
113122 try :
114- data = {
115- "q" : query ,
116- "b" : "" ,
117- "kl" : "" ,
118- }
119-
120123 await ctx .info (f"Searching DuckDuckGo for: { query } " )
121124
122125 proxy = _get_proxy_url ()
123- async with httpx .AsyncClient (proxy = proxy ) as client :
124- response = await client .post (
125- self .BASE_URL , data = data , headers = self .HEADERS , timeout = 30.0
126- )
127- response .raise_for_status ()
128-
129- soup = BeautifulSoup (response .text , "html.parser" )
130- if not soup :
131- await ctx .error ("Failed to parse HTML response" )
132- return []
133-
134- results = []
135- for result in soup .select (".result" ):
136- title_elem = result .select_one (".result__title" )
137- if not title_elem :
138- continue
139-
140- link_elem = title_elem .find ("a" )
141- if not link_elem :
142- continue
143-
144- title = link_elem .get_text (strip = True )
145- link = link_elem .get ("href" , "" )
146-
147- if "y.js" in link :
148- continue
149-
150- if link .startswith ("//duckduckgo.com/l/?uddg=" ):
151- link = urllib .parse .unquote (link .split ("uddg=" )[1 ].split ("&" )[0 ])
152-
153- snippet_elem = result .select_one (".result__snippet" )
154- snippet = snippet_elem .get_text (strip = True ) if snippet_elem else ""
155-
156- results .append (
157- SearchResult (
158- title = title ,
159- link = link ,
160- snippet = snippet ,
161- position = len (results ) + 1 ,
162- )
126+ ddgs = DDGS (proxy = proxy )
127+ raw_results = ddgs .text (query , max_results = max_results , backend = "duckduckgo" )
128+
129+ results = [
130+ SearchResult (
131+ title = r .get ("title" , "" ),
132+ link = r .get ("href" , "" ),
133+ snippet = r .get ("body" , "" ),
134+ position = i + 1 ,
163135 )
164-
165- if len (results ) >= max_results :
166- break
136+ for i , r in enumerate (raw_results )
137+ ]
167138
168139 await ctx .info (f"Successfully found { len (results )} results" )
169140 return results
170141
171- except httpx .TimeoutError :
172- await ctx .error ("Search request timed out" )
173- return []
174- except httpx .HTTPError as e :
175- await ctx .error (f"HTTP error occurred: { str (e )} " )
176- return []
177142 except Exception as e :
178- await ctx .error (f"Unexpected error during search : { str ( e ) } " )
143+ await ctx .error (f"Search error: { e } " )
179144 traceback .print_exc (file = sys .stderr )
180145 return []
181146
0 commit comments