11import json
22import azure .functions as func
33import logging
4- from readability . readability import Document
4+ from readability import Document
55import requests
66from bs4 import BeautifulSoup
77import random
88import time
99from urllib .parse import urlparse
10+ import os
11+ from dotenv import load_dotenv
12+
13+ # Load environment variables from .env file
14+ load_dotenv ()
15+
16+ # In-memory API key (fallback if environment variable is not set)
17+ SERPER_API_KEY = os .environ .get ("SERPER_API_KEY" , "" )
1018
1119app = func .FunctionApp ()
1220
1927
def fetch_content(url, headers):
    """Fetch *url* and return a BeautifulSoup of the readability-extracted main content.

    Args:
        url: Absolute URL to fetch.
        headers: Dict of HTTP headers (caller supplies a User-Agent).

    Returns:
        BeautifulSoup parsed from the Document summary (partial HTML).

    Raises:
        requests.RequestException: on network failure, timeout, or HTTP error status.
    """
    # timeout: requests has NO default timeout; without one a dead host can
    # hang the function invocation indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail fast on 4xx/5xx so the caller's retry/backoff sees the error
    # instead of us "extracting an article" from an error page.
    response.raise_for_status()
    # response.text decodes using the declared charset with requests'
    # fallback handling; manually decoding response.content can raise
    # UnicodeDecodeError when a server mis-declares its encoding.
    doc = Document(response.text)
    summary_html = doc.summary(html_partial=True)
    return BeautifulSoup(summary_html, 'html.parser')
2635
def clean_text(text):
    """Normalize scraped text to single-spaced, stripped form.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and strips leading/trailing whitespace.

    Args:
        text: The raw scraped text; may be None or empty.

    Returns:
        The normalized string, or "" for falsy input.
    """
    if not text:
        return ""
    # str.split() with no argument splits on ANY whitespace run, so a single
    # join pass yields the same result as the original two-pass
    # (newline-normalize then space-normalize) implementation — the first
    # pass was dead work because this step re-collapsed its newlines anyway.
    return ' '.join(text.split())
45+
2746def try_fetch_with_backoff (url , headers , attempts = 3 , backoff_factor = 2 ):
2847 for attempt in range (attempts ):
2948 try :
@@ -52,16 +71,11 @@ def scrape(req: func.HttpRequest) -> func.HttpResponse:
5271 try :
5372 soup = try_fetch_with_backoff (url , headers )
5473 except Exception as e :
55- logging .error (f"Initial requests failed: { str (e )} " )
56- proxy_url = f"https://12ft.io/{ url } "
57- logging .info (f"Retrying with proxy: { proxy_url } " )
58- try :
59- soup = try_fetch_with_backoff (proxy_url , headers )
60- except Exception as e :
61- logging .error (f"Proxy requests failed: { str (e )} " )
62- return func .HttpResponse (f"Error: Failed to scrape the URL - { str (e )} " , status_code = 500 )
74+ logging .error (f"Requests failed: { str (e )} " )
75+ return func .HttpResponse (f"Error: Failed to scrape the URL - { str (e )} " , status_code = 500 )
6376
64- content = soup .get_text (separator = '\n ' ).strip ()
77+ raw_content = soup .get_text (separator = ' ' ).strip ()
78+ content = clean_text (raw_content )
6579
6680 return func .HttpResponse (content , mimetype = "text/plain" )
6781 except Exception as e :
@@ -83,31 +97,141 @@ def scrape_with_images(req: func.HttpRequest) -> func.HttpResponse:
8397 try :
8498 soup = try_fetch_with_backoff (url , headers )
8599 except Exception as e :
86- logging .error (f"Initial requests failed: { str (e )} " )
87- proxy_url = f"https://12ft.io/{ url } "
88- logging .info (f"Retrying with proxy: { proxy_url } " )
89- try :
90- soup = try_fetch_with_backoff (proxy_url , headers )
91- except Exception as e :
92- logging .error (f"Proxy requests failed: { str (e )} " )
93- return func .HttpResponse (f"Error: Failed to scrape the URL - { str (e )} " , status_code = 500 )
100+ logging .error (f"Requests failed: { str (e )} " )
101+ return func .HttpResponse (f"Error: Failed to scrape the URL - { str (e )} " , status_code = 500 )
94102
95- content = ''
103+ text_parts = []
104+ images = []
105+
106+ # Extract text and images separately
96107 for element in soup .descendants :
97- if isinstance (element , str ):
98- content += element .strip () + ' \n '
108+ if isinstance (element , str ) and element . strip () :
109+ text_parts . append ( element .strip ())
99110 elif element .name == 'img' :
100111 img_url = element .get ('src' )
101112 if img_url and img_url .startswith (('http://' , 'https://' )):
102- content += f'\n { img_url } \n '
103-
104- content = content .strip ()
113+ images .append (img_url )
114+
115+ # Clean and join text
116+ text_content = clean_text (' ' .join (text_parts ))
117+
118+ # Add images after the text
119+ content = text_content
120+ for img_url in images :
121+ content += f'\n \n { img_url } '
105122
106123 response_data = {
107124 "content" : content
108125 }
109126
110- return func .HttpResponse (json .dumps (response_data ), mimetype = "application/json" )
127+ return func .HttpResponse (
128+ json .dumps (response_data , ensure_ascii = False ),
129+ mimetype = "application/json"
130+ )
111131 except Exception as e :
112132 logging .error (f"Error: { str (e )} " )
113133 return func .HttpResponse (f"Error: Failed to scrape the URL - { str (e )} " , status_code = 500 )
134+
@app.route(route="set_api_key", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
def set_api_key(req: func.HttpRequest) -> func.HttpResponse:
    """Store a Serper API key in process memory for later /search calls.

    Expects a JSON body of the form: {"api_key": "..."}.

    Returns:
        200 on success, 400 for a missing key or malformed JSON,
        500 for unexpected failures.

    NOTE(security): this endpoint is ANONYMOUS — anyone who can reach the
    function can overwrite the key, and the key lives only in this worker's
    memory (lost on restart, not shared across instances). Prefer setting
    SERPER_API_KEY via app settings / environment in production.
    """
    global SERPER_API_KEY
    try:
        try:
            data = req.get_json()
        except ValueError:
            # Malformed JSON is a client error, not a server failure.
            return func.HttpResponse("Error: Invalid JSON body", status_code=400)

        api_key = data.get('api_key')
        if not api_key:
            return func.HttpResponse("Error: Missing API key", status_code=400)

        SERPER_API_KEY = api_key
        return func.HttpResponse("API key set successfully", status_code=200)
    except Exception as e:
        logging.error(f"Error setting API key: {str(e)}")
        return func.HttpResponse(f"Error: {str(e)}", status_code=500)
152+
@app.route(route="search", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
def search(req: func.HttpRequest) -> func.HttpResponse:
    """Search Google via Serper.dev and return the top 3 results with scraped page text.

    Expects a JSON body: {"query": "...", "api_key": "..." (optional,
    overrides the in-memory/env SERPER_API_KEY)}.

    Returns:
        200 with {"query", "result_count", "results": [{"title", "url",
        "snippet", "content"}, ...]} as JSON; 400 for missing query/key or
        malformed JSON; 404 when Serper returns no organic results; 502 when
        the search provider itself fails; 500 for unexpected errors.
    """
    try:
        try:
            data = req.get_json()
        except ValueError:
            # Malformed JSON is a client error, not a server failure.
            return func.HttpResponse("Error: Invalid JSON body", status_code=400)

        query = data.get('query')
        # Per-request override, falling back to the configured key.
        # (Read-only access — no `global` declaration needed.)
        api_key = data.get('api_key') or SERPER_API_KEY

        if not query:
            return func.HttpResponse("Error: Missing search query", status_code=400)

        if not api_key:
            return func.HttpResponse("Error: Serper API key not configured. Please use the /api/set_api_key endpoint first.", status_code=400)

        logging.info(f'search [{query}]')

        # Call Serper.dev API to get search results.
        serper_url = "https://google.serper.dev/search"
        serper_headers = {
            'X-API-KEY': api_key,
            'Content-Type': 'application/json'
        }
        payload = {
            'q': query,
            'gl': 'us',
            'hl': 'en'
        }

        # timeout: requests has no default; a hung provider call would
        # otherwise stall the function invocation indefinitely.
        response = requests.post(serper_url, headers=serper_headers, json=payload, timeout=30)
        # Check the status BEFORE .json(): an HTML/plain-text error page
        # would raise JSONDecodeError and surface as an opaque 500.
        if not response.ok:
            logging.error(f"Serper API returned {response.status_code}: {response.text}")
            return func.HttpResponse(
                f"Error: Search provider returned status {response.status_code}",
                status_code=502
            )
        search_results = response.json()

        # Extract top 3 organic results.
        if 'organic' not in search_results or not search_results['organic']:
            return func.HttpResponse("No search results found", status_code=404)

        top_results = search_results['organic'][:3]
        results_with_content = []

        user_agent = random.choice(USER_AGENTS)
        scrape_headers = {'User-Agent': user_agent}

        # Scrape content for each result; a per-result failure is reported
        # inline rather than failing the whole request.
        for result in top_results:
            url = result.get('link')
            if not url:
                continue

            entry = {
                'title': result.get('title'),
                'url': url,
                'snippet': result.get('snippet')
            }
            try:
                # Use the existing scrape functionality.
                soup = try_fetch_with_backoff(url, scrape_headers)
                # Space separator avoids literal newline runs in the text.
                raw_content = soup.get_text(separator=' ').strip()
                entry['content'] = clean_text(raw_content)
            except Exception as e:
                logging.error(f"Failed to scrape {url}: {str(e)}")
                entry['content'] = f"Error: Failed to scrape content - {str(e)}"
            results_with_content.append(entry)

        # Include the query and result count in the response.
        return func.HttpResponse(
            json.dumps({
                "query": query,
                "result_count": len(results_with_content),
                "results": results_with_content
            }, ensure_ascii=False),
            mimetype="application/json"
        )

    except Exception as e:
        logging.error(f"Search error: {str(e)}")
        return func.HttpResponse(f"Error: {str(e)}", status_code=500)
0 commit comments