 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import time
+from typing import Union

 # Load environment variables
 load_dotenv()
@@ -104,14 +105,36 @@ def find_relevant_sections(soup: BeautifulSoup, request_id: str) -> Dict[str, str]:

     best_text = []
     for idx in matching_indices:
-        start_idx = max(0, idx - 3)
-        end_idx = min(len(sections), idx + 4)
+        start_idx = max(0, idx - 1)
+        end_idx = min(len(sections), idx + 2)
         context = sections[start_idx:end_idx]
         best_text.extend([s.strip() for s in context])
-
     if max_similarity > 0.01 and best_text:
-        relevant_sections[group_key] = "\n".join(best_text)
-
+        # For price, validate it contains actual price information
+        if group_key == 'price':
+            # Ask Gemini to confirm the extracted text actually contains a price
+            api_key: str = key_manager.get_next_key()
+            gemini_model: genai.GenerativeModel = initialize_gemini(api_key)
+            prompt: str = f"""
+            Check whether the following text contains a correct price.
+            Return only "true" or "false" (true if correct, false if incorrect).
+            {best_text}
+            """
+            try:
+                response = gemini_model.generate_content(contents=prompt)
+                # Normalize the reply: the model may add whitespace or capitals
+                if response.text.strip().lower() != "false":
+                    print("Price is correct")
+                    relevant_sections[group_key] = "\n".join(best_text)
+                else:
+                    relevant_sections[group_key] = "null"
+                    print("Price is not correct")
+            except Exception as e:
+                print(f"Error validating price: {e}")
+                relevant_sections[group_key] = "null"
+        else:
+            relevant_sections[group_key] = "\n".join(best_text)
+    else:
+        relevant_sections[group_key] = "null"
     return relevant_sections

 # Function to extract all visible text from a webpage
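A cheaper pre-filter along the lines of the "numeric values and currency symbols" idea could avoid a Gemini round-trip for obviously price-free text. A minimal sketch, assuming this regex, symbol set, and helper name (none of which are part of this commit):

```python
import re

# Hypothetical pre-filter: a currency marker adjacent to a numeric value.
# Pattern and symbol set are assumptions, not part of this commit.
PRICE_PATTERN = re.compile(
    r"(?:[$€£¥]|USD|CAD|EUR|GBP)\s*\d[\d,.]*"    # currency before the number
    r"|\d[\d,.]*\s*(?:[$€£¥]|USD|CAD|EUR|GBP)",  # or currency after it
    re.IGNORECASE,
)

def looks_like_price(text: str) -> bool:
    """Cheap heuristic check; only text that passes would go to Gemini."""
    return bool(PRICE_PATTERN.search(text))
```

Text failing this check could be set to "null" directly, reserving the model call for ambiguous cases.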
@@ -141,7 +164,122 @@ def extract_page_content(url: str, country_code: str, latitude: float = None, longitude: float = None) -> Union[str, BeautifulSoup]:

     except Exception as e:
         return f"Error extracting content from {url}: {e}"
+
+# Function to extract structured product details from an Amazon product page
+def extract_amazon_page_content(url: str, country_code: str, latitude: float = None, longitude: float = None) -> str:
+    """Extracts structured product details (title, price, rating, reviews, ...) from an Amazon product page."""
+    try:
+        headers: dict = {
+            "User-Agent": os.getenv("USER_AGENT"),
+            "Accept-Language": "en-US,en;q=0.9",
+            "geo-location": country_code,
+            "Accept-Location": country_code
+        }
+
+        # Add geolocation headers for Canada
+        if country_code == "CA":
+            headers.update({
+                "CF-IPCountry": "CA",
+                "X-Forwarded-For": "24.48.0.1",  # Canadian IP address
+                "geo-position": f"{latitude};{longitude}",  # caller-supplied coordinates
+                "geo-coordinates": f"{latitude}, {longitude}",
+                "X-Geo-Position": f"{latitude}, {longitude}",
+                "X-Geo-Location": "CA"
+            })
+
+        response: requests.Response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()  # Raise HTTPError for bad responses
+
+        # Parse the HTML using BeautifulSoup
+        soup: BeautifulSoup = BeautifulSoup(response.content, "html.parser")
+
+        text = ""
+
+        # Extract product title
+        product_title = soup.find('span', {'id': 'productTitle'})
+        if product_title:
+            product_title = product_title.text.strip()
+            text += "product_title: " + product_title
+            text += "\n\n"
+
+        # Extract product description
+        description_list = soup.find('ul', {'class': 'a-unordered-list a-vertical a-spacing-mini'})
+        if description_list:
+            description_items = description_list.find_all('span', {'class': 'a-list-item'})
+            description_text = ' '.join([item.text.strip() for item in description_items])
+            text += "description: " + description_text
+            text += "\n\n"
+
+        # Extract rating
+        rating_element = soup.find('span', {'data-hook': 'rating-out-of-text', 'class': 'a-size-medium a-color-base'})
+        if rating_element:
+            rating = rating_element.text.strip()
+            text += "rating: " + rating
+            text += "\n\n"
+
+        # Extract availability
+        availability_element = soup.find('div', {'id': 'availability'})
+        if availability_element:
+            availability = availability_element.find('span', {'class': 'a-size-medium a-color-success'})
+            if availability:
+                availability_text = availability.text.strip()
+                text += "availability: " + availability_text
+                text += "\n\n"
+        # Extract price and currency
+        price_whole = soup.find('span', {'class': 'a-price-whole'})
+        price_decimal = soup.find('span', {'class': 'a-price-decimal'})
+        price_fraction = soup.find('span', {'class': 'a-price-fraction'})
+        currency_symbol = soup.find('span', {'class': 'a-price-symbol'})
+
+        if price_whole and currency_symbol:
+            price = price_whole.text.strip()
+            if price_decimal and not price.endswith("."):
+                price += price_decimal.text.strip()
+            if price_fraction:  # guard: the fraction span may be absent
+                price += price_fraction.text.strip()
+            currency = currency_symbol.text.strip()
+            text += "price: " + price + "\n"
+            text += "currency: " + currency
+            text += "\n\n"
+
+        # Extract product image
+        image_element = soup.find('img', {'id': 'landingImage'})
+        if image_element:
+            image_url = image_element.get('src')
+            if image_url:
+                text += "image: " + image_url
+                text += "\n\n"
+
+        # Extract reviews
+        reviews = []
+        review_elements = soup.find_all('div', {'data-hook': 'review-collapsed'})
+        for review in review_elements:
+            review_text = review.find('span')
+            if review_text:
+                reviews.append(review_text.text.strip())
+
+        if reviews:
+            text += "reviews: " + str(reviews)
+            text += "\n\n"
+
+        # Extract all content from elements containing a-box-group in their class
+        box_groups = soup.find_all(lambda tag: tag.get('class') and 'a-box-group' in tag.get('class'))
+        if box_groups:
+            box_contents = []
+            for box in box_groups:
+                # stripped_strings already yields whitespace-trimmed fragments
+                box_contents.extend(box.stripped_strings)
+
+            if box_contents:
+                text += "box_contents: " + ", ".join(box_contents)
+                text += "\n\n"
+
+        return text
+
+    except Exception as e:
+        return f"Error extracting content from {url}: {e}"
+
 # Process each link and save responses in separate files
 def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
     """Processes each link and saves structured responses in separate files."""
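A minimal invocation of the new extractor, for reference; the product URL is a placeholder, and the coordinates are copied from the example calls at the bottom of the file:

```python
# Sketch only: placeholder URL; USER_AGENT must be set in the environment.
content = extract_amazon_page_content(
    "https://www.amazon.com/dp/XXXXXXXXXX",  # placeholder product URL
    "CA",
    latitude=56.4383657,
    longitude=-114.8492314,
)
print(content[:500])  # preview of the extracted "field: value" blocks
```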
@@ -165,19 +303,23 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
     for link in links:
         print(f"Processing: {link}")
         try:
-            # Extract page content
-            page_content: Union[str, BeautifulSoup] = extract_page_content(link, country_code, location[0], location[1])
+            if custom_domains == ["https://www.amazon.com"]:
+                page_content: str = extract_amazon_page_content(link, country_code, location[0], location[1])
+                relevant_content = page_content
+            else:
+                page_content: Union[str, BeautifulSoup] = extract_page_content(link, country_code, location[0], location[1])
+                # Find relevant sections using vector similarity search
+                relevant_sections = find_relevant_sections(page_content, request_id)
+                if relevant_sections.get('price') == "null":
+                    print(f"Price is not available for {link}")
+                    continue
+                # Combine relevant sections into a single string
+                relevant_content = "\n".join([f"{term}: {content}" for term, content in relevant_sections.items()])

             # Get the next API key and initialize Gemini model
             api_key: str = key_manager.get_next_key()
             gemini_model: genai.GenerativeModel = initialize_gemini(api_key)

-            # Find relevant sections using vector similarity search
-            relevant_sections = find_relevant_sections(page_content, request_id)
-
-            # Combine relevant sections into a single string
-            relevant_content = "\n".join([f"{term}: {content}" for term, content in relevant_sections.items()])
-
             # Send the content to Gemini for structuring
             prompt: str = f"""
             You are a product data extraction specialist. Analyze the provided HTML content and structure the product details into a clean JSON format.
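Note that the Amazon branch above only fires when `custom_domains` is exactly `["https://www.amazon.com"]`. If other Amazon storefronts ever need the same path, a host-based check is one option; this helper is an assumption, not part of the commit:

```python
from typing import List, Optional
from urllib.parse import urlparse

def uses_amazon_extractor(domains: Optional[List[str]]) -> bool:
    """Hypothetical looser check: true if any configured domain is hosted
    on an Amazon storefront (www.amazon.com, www.amazon.ca, ...)."""
    if not domains:
        return False
    return any(urlparse(d).netloc.startswith("www.amazon.") for d in domains)
```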
@@ -200,8 +342,8 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
             product_rating (usually a 0-to-5 score; find the rating and add it)
             availability (false if mentioned as out of stock, otherwise always true)
             shipping (false if mentioned as not shipping to {country_code}, otherwise always true)
-            delivery_date (add the delivery date or how long it takes to deliver)
-            delivery_cost (or shipping cost)
+            delivery_date (add the delivery date or how long it takes to deliver, only for {country_code})
+            delivery_cost (or shipping cost, only for {country_code})
             warranty (true or false on availability)
             image (add the image URL)
             latest_reviews (the reviews appear near the bottom; analyze and add them)
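Roughly, the record the prompt asks for would look like the dict below; every value is an invented placeholder for illustration, and the real schema is whatever the model returns:

```python
# Illustrative shape only; all values are placeholders, not real data.
example_product = {
    "product_rating": 4.5,
    "availability": True,
    "shipping": True,
    "delivery_date": "2-3 business days",
    "delivery_cost": "CAD 9.99",
    "warranty": False,
    "image": "https://example.com/product.jpg",
    "latest_reviews": ["placeholder review text"],
}
```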
@@ -220,11 +362,11 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:

             product_counter += 1

-            if product_counter == 100:
+            if product_counter == 32:
                 break

         except Exception as e:
-            print(f"Error processing {link}: {e}")
+            print(f"Error processing {link}: {str(e)}")

     print("Data extraction agent completed")
     print("------------------------------------------------------------------------------------------------")
@@ -235,8 +377,8 @@ def sanitize_filename(url: str) -> str:
     return "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in url)

 # Example usage
-start_time = time.time()
+# start_time = time.time()
 # process_links("CA", None, [56.4383657, -114.8492314], "1234567898")
 # process_links("CA", ["https://www.amazon.com"], [56.4383657, -114.8492314], "1234567899")
-end_time = time.time()
-print(f"Time taken: {end_time - start_time:.2f} seconds")
+# end_time = time.time()
+# print(f"Time taken: {end_time - start_time:.2f} seconds")
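If the timing harness is re-enabled, `time.perf_counter()` is the more precise clock for elapsed time; a minimal sketch reusing the example arguments above:

```python
import time

start = time.perf_counter()
process_links("CA", ["https://www.amazon.com"], [56.4383657, -114.8492314], "1234567899")
print(f"Time taken: {time.perf_counter() - start:.2f} seconds")
```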