
Commit d93e014

committed
Amazon-only agent finished
1 parent af35928 commit d93e014


64 files changed (+225 -134 lines)

AI_Agents/Data_Extract_Agent.py

Lines changed: 162 additions & 20 deletions
@@ -10,6 +10,7 @@
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import time
+from typing import Union

 # Load environment variables
 load_dotenv()
@@ -104,14 +105,36 @@ def find_relevant_sections(soup: BeautifulSoup, request_id: str) -> Dict[str, str]:

         best_text = []
         for idx in matching_indices:
-            start_idx = max(0, idx - 3)
-            end_idx = min(len(sections), idx + 4)
+            start_idx = max(0, idx - 1)
+            end_idx = min(len(sections), idx + 2)
             context = sections[start_idx:end_idx]
             best_text.extend([s.strip() for s in context])
-
         if max_similarity > 0.01 and best_text:
-            relevant_sections[group_key] = "\n".join(best_text)
-
+            # For price, ask Gemini to validate that the text contains real price information
+            if group_key == 'price':
+                api_key: str = key_manager.get_next_key()
+                gemini_model: genai.GenerativeModel = initialize_gemini(api_key)
+                prompt: str = f"""
+                Check whether the following text contains a correct price.
+                Return true or false only (true for correct and false for incorrect).
+                {best_text}
+                """
+                try:
+                    response = gemini_model.generate_content(contents=prompt)
+                    if response.text != "false":
+                        print("Price is correct")
+                        relevant_sections[group_key] = "\n".join(best_text)
+                    else:
+                        relevant_sections[group_key] = "null"
+                        print("Price is not correct")
+                except Exception as e:
+                    print(f"Error validating price: {e}")
+                    relevant_sections[group_key] = "null"
+            else:
+                relevant_sections[group_key] = "\n".join(best_text)
+        else:
+            relevant_sections[group_key] = "null"
     return relevant_sections

 # Function to extract all visible text from a webpage
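A note on the validation added above: `response.text != "false"` counts any reply other than the exact lowercase string "false" (including "False", "false\n", or a full explanatory sentence) as a pass. A minimal sketch of a stricter check, assuming the same `gemini_model` object as in the hunk; `is_price_confirmed` is a hypothetical helper, not part of this commit:

def is_price_confirmed(gemini_model, best_text) -> bool:
    """Hypothetical helper: read the model's true/false verdict strictly."""
    prompt = (
        "Does the following text contain a correct product price? "
        "Answer with the single word true or false.\n"
        f"{best_text}"
    )
    try:
        response = gemini_model.generate_content(contents=prompt)
        # Normalize case and whitespace so "False" or "false\n" also count as rejection.
        return response.text.strip().lower().startswith("true")
    except Exception:
        # Treat validation errors as "no valid price" rather than as a pass.
        return False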
@@ -141,7 +164,122 @@ def extract_page_content(url: str, country_code: str, latitude: float = None, longitude: float = None) -> Union[str, BeautifulSoup]:

     except Exception as e:
         return f"Error extracting content from {url}: {e}"
+
+# Function to extract structured product details from an Amazon product page
+def extract_amazon_page_content(url: str, country_code: str, latitude: float = None, longitude: float = None) -> Union[str, BeautifulSoup]:
+    """Extracts structured product details from an Amazon product page."""
+    try:
+        headers: dict = {
+            "User-Agent": os.getenv("USER_AGENT"),
+            "Accept-Language": "en-US,en;q=0.9",
+            "geo-location": country_code,
+            "Accept-Location": country_code

+        }
+
+        # Add geolocation headers for Canada
+        if country_code == "CA":
+            headers.update({
+                "CF-IPCountry": "CA",
+                "X-Forwarded-For": "24.48.0.1",  # Canadian IP address
+                "geo-position": f"{latitude};{longitude}",  # caller-supplied coordinates
+                "geo-coordinates": f"{latitude}, {longitude}",
+                "X-Geo-Position": f"{latitude}, {longitude}",
+                "X-Geo-Location": "CA"
+            })
+
+        response: requests.Response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()  # Raise HTTPError for bad responses
+
+        # Parse the HTML using BeautifulSoup
+        soup: BeautifulSoup = BeautifulSoup(response.content, "html.parser")
+
+        text = ""
+
+        # Extract product title
+        product_title = soup.find('span', {'id': 'productTitle'})
+        if product_title:
+            product_title = product_title.text.strip()
+            text += "product_title: " + product_title
+            text += "\n\n"
+
+        # Extract product description
+        description_list = soup.find('ul', {'class': 'a-unordered-list a-vertical a-spacing-mini'})
+        if description_list:
+            description_items = description_list.find_all('span', {'class': 'a-list-item'})
+            description_text = ' '.join([item.text.strip() for item in description_items])
+            text += "description: " + description_text
+            text += "\n\n"
+
+        # Extract rating
+        rating_element = soup.find('span', {'data-hook': 'rating-out-of-text', 'class': 'a-size-medium a-color-base'})
+        if rating_element:
+            rating = rating_element.text.strip()
+            text += "rating: " + rating
+            text += "\n\n"
+
+        # Extract availability
+        availability_element = soup.find('div', {'id': 'availability'})
+        if availability_element:
+            availability = availability_element.find('span', {'class': 'a-size-medium a-color-success'})
+            if availability:
+                availability_text = availability.text.strip()
+                text += "availability: " + availability_text
+                text += "\n\n"
+
+        # Extract price and currency
+        price_whole = soup.find('span', {'class': 'a-price-whole'})
+        price_decimal = soup.find('span', {'class': 'a-price-decimal'})
+        price_fraction = soup.find('span', {'class': 'a-price-fraction'})
+        currency_symbol = soup.find('span', {'class': 'a-price-symbol'})
+
+        if price_whole and currency_symbol:
+            price = price_whole.text.strip()
+            if price_decimal and price[-1] != ".":
+                price += price_decimal.text.strip()
+            if price_fraction:
+                price += price_fraction.text.strip()
+            currency = currency_symbol.text.strip()
+            text += "price: " + price + "\n"
+            text += "currency: " + currency
+            text += "\n\n"
+
+        # Extract product image
+        image_element = soup.find('img', {'id': 'landingImage'})
+        if image_element:
+            image_url = image_element.get('src')
+            if image_url:
+                text += "image: " + image_url
+                text += "\n\n"
+
+        # Extract reviews
+        reviews = []
+        review_elements = soup.find_all('div', {'data-hook': 'review-collapsed'})
+        for review in review_elements:
+            review_text = review.find('span')
+            if review_text:
+                reviews.append(review_text.text.strip())
+
+        if reviews:
+            text += "reviews: " + str(reviews)
+            text += "\n\n"
+
+        # Extract all content from elements containing a-box-group in their class
+        box_groups = soup.find_all(lambda tag: tag.get('class') and 'a-box-group' in tag.get('class'))
+        if box_groups:
+            box_contents = []
+            for box in box_groups:
+                # Collect every text fragment inside the box
+                lines = [line.strip() for line in box.stripped_strings]
+                box_contents.extend(lines)
+
+            if box_contents:
+                text += "box_contents: " + ", ".join(box_contents)
+                text += "\n\n"
+
+        return text
+
+    except Exception as e:
+        return f"Error extracting content from {url}: {e}"
+
 # Process each link and save responses in separate files
 def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
     """Processes each link and saves structured responses in separate files."""
@@ -165,19 +303,23 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
     for link in links:
         print(f"Processing: {link}")
         try:
-            # Extract page content
-            page_content: Union[str, BeautifulSoup] = extract_page_content(link, country_code, location[0], location[1])
+            if custom_domains == ["https://www.amazon.com"]:
+                page_content: Union[str, BeautifulSoup] = extract_amazon_page_content(link, country_code, location[0], location[1])
+                relevant_content = page_content
+            else:
+                page_content: Union[str, BeautifulSoup] = extract_page_content(link, country_code, location[0], location[1])
+                # Find relevant sections using vector similarity search
+                relevant_sections = find_relevant_sections(page_content, request_id)
+                if relevant_sections.get('price') == "null":
+                    print(f"Price is not available for {link}")
+                    continue
+                # Combine relevant sections into a single string
+                relevant_content = "\n".join([f"{term}: {content}" for term, content in relevant_sections.items()])

             # Get the next API key and initialize Gemini model
             api_key: str = key_manager.get_next_key()
             gemini_model: genai.GenerativeModel = initialize_gemini(api_key)

-            # Find relevant sections using vector similarity search
-            relevant_sections = find_relevant_sections(page_content, request_id)
-
-            # Combine relevant sections into a single string
-            relevant_content = "\n".join([f"{term}: {content}" for term, content in relevant_sections.items()])
-
             # Send the content to Gemini for structuring
             prompt: str = f"""
             You are a product data extraction specialist. Analyze the provided HTML content and structure the product details into a clean JSON format.
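One fragility in the routing above: `custom_domains == ["https://www.amazon.com"]` matches only that exact one-element list, so a trailing slash or a second domain silently falls back to the generic extractor. A sketch of a hostname-based check instead (an assumption about intent, not what the commit does):

from urllib.parse import urlparse

def is_amazon_link(link: str) -> bool:
    """Hypothetical alternative: route each link by its hostname."""
    host = urlparse(link).netloc.lower()
    return host == "www.amazon.com" or host.endswith(".amazon.com")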
@@ -200,8 +342,8 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:
             product_rating (Mostly this is available, usually 0 to 5; find the rating and add it)
             availability (make false if mentioned as out of stock, otherwise always true)
             shipping (if mentioned as not shipping to {country_code}, mention false, otherwise always true)
-            delivery_date (add the delivery date or how long it takes to deliver)
-            delivery_cost (or shipping cost)
+            delivery_date (add the delivery date or how long it takes to deliver, only for {country_code})
+            delivery_cost (or shipping cost, only for {country_code})
             warranty (true or false on availability)
             image (add the image url)
             latest_reviews (the reviews are available in the bottom parts; analyze and add them)
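For illustration only, the fields visible in this hunk imply output shaped roughly like the following; every value is an invented placeholder, not data from the commit:

example_product = {
    "product_rating": 4.5,
    "availability": True,
    "shipping": True,
    "delivery_date": "3-5 business days",  # scoped to the requested country_code
    "delivery_cost": "CDN$ 9.99",
    "warranty": False,
    "image": "https://example.com/product.jpg",
    "latest_reviews": ["placeholder review text"],
}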
@@ -220,11 +362,11 @@ def process_links(country_code: str, custom_domains: List[str], location: List[float], request_id: str) -> None:

             product_counter += 1

-            if product_counter == 100:
+            if product_counter == 32:
                 break

         except Exception as e:
-            print(f"Error processing {link}: {e}")
+            print(f"Error processing {link}: {str(e)}")

     print("Data extraction agent completed")
     print("------------------------------------------------------------------------------------------------")
@@ -235,8 +377,8 @@ def sanitize_filename(url: str) -> str:
     return "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in url)

 # Example usage
-start_time = time.time()
+# start_time = time.time()
 # process_links("CA", None, [56.4383657, -114.8492314], "1234567898")
 # process_links("CA", ["https://www.amazon.com"], [56.4383657, -114.8492314], "1234567899")
-end_time = time.time()
-print(f"Time taken: {end_time - start_time:.2f} seconds")
+# end_time = time.time()
+# print(f"Time taken: {end_time - start_time:.2f} seconds")
