-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_data.py
More file actions
131 lines (110 loc) · 4.68 KB
/
fetch_data.py
File metadata and controls
131 lines (110 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import requests
import pandas as pd
import time
from tqdm import tqdm
from random import uniform
# Function to fetch JSON data from a URL with retries
def get_json(url, retries=3, delay=2):
    """Fetch *url* and return its decoded JSON body, or None on failure.

    Makes up to *retries* attempts, sleeping *delay* seconds between
    attempts (but not after the last one — the original slept even when
    it was about to give up). Non-200 responses and request/JSON errors
    are logged to stdout and retried.
    """
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            if r.status_code == 200:
                return r.json()
            print(f"Status {r.status_code} for {url}")
        except requests.RequestException as e:
            # Narrowed from a bare `except Exception`: only network/HTTP
            # failures should trigger a retry, not programming errors.
            print(f"Error fetching {url}: {e}")
        except ValueError as e:
            # Response.json() raises a ValueError subclass on malformed JSON.
            print(f"Error fetching {url}: {e}")
        if attempt < retries - 1:
            # Back off between attempts; skip the pointless final sleep.
            time.sleep(delay)
    return None
def _parse_product(p):
    """Flatten one raw product dict from the API into a flat record.

    Uses `or {}` / `or []` rather than `.get(key, default)` so that keys
    that are present but explicitly null (common in JSON APIs) don't
    crash with AttributeError; also guards the first color's missing
    "title" key (the original `colors[0]["title"]` could raise KeyError).
    """
    variant = p.get("default_variant", {})
    # default_variant is sometimes a list of variants; take the first.
    if isinstance(variant, list) and len(variant) > 0:
        variant = variant[0]
    elif not isinstance(variant, dict):
        variant = {}
    price_info = variant.get("price") or {}
    rating_info = p.get("rating") or {}
    brand_info = p.get("brand") or {}
    category_info = p.get("category") or {}
    review_info = (p.get("review") or {}).get("recommendation") or {}
    colors = p.get("colors") or []
    first_color = colors[0].get("title") if colors else None
    return {
        "id": p.get("id"),
        "title_fa": p.get("title_fa"),
        "title_en": p.get("title_en"),
        "brand": brand_info.get("title_fa"),
        "category": category_info.get("title_fa"),
        "status": p.get("status"),
        "selling_price": price_info.get("selling_price"),
        "rrp_price": price_info.get("rrp_price"),
        "rating": rating_info.get("rate"),
        "rating_count": rating_info.get("count"),
        "recommendation_percentage": review_info.get("recommended_percentage"),
        "color": first_color,
        "url": p.get("url", {}).get("uri"),
    }


# Function to fetch Samsung mobile products across multiple pages
def get_samsung_products(pages=45):
    """Fetch Samsung mobile listings from the Digikala search API.

    Iterates pages 1..*pages*, flattens each product via _parse_product,
    writes the result to samsung_products.csv (utf-8-sig so Excel reads
    the Persian text correctly) and returns it as a DataFrame.
    """
    products = []
    for page in range(1, pages + 1):
        url = f"https://api.digikala.com/v1/categories/mobile-phone/brands/samsung/search/?page={page}&_rch=db340a7f7c4f"
        data = get_json(url)
        if not data or "data" not in data or "products" not in data["data"]:
            print(f"No data on page {page}")
            continue
        # Extract products from the response
        products.extend(_parse_product(p) for p in data["data"]["products"])
        # Random delay between pages to avoid hammering the API
        # (the original comment claimed this ran after all pages; it is per page).
        time.sleep(uniform(0.5, 1.2))
    # Convert list to DataFrame and save to CSV
    df_products = pd.DataFrame(products)
    df_products.to_csv("samsung_products.csv", index=False, encoding="utf-8-sig")
    print(f"Saved {len(df_products)} products → samsung_products.csv")
    return df_products
# Function to fetch comments for a single product
def get_comments_for_product(product_id, max_pages=45):
    """Collect up to *max_pages* pages of comments for one product.

    Stops early when the API returns no payload, an empty comment list,
    or reports that the last page has been reached. Returns a list of
    flat comment dicts tagged with the owning product_id.
    """
    collected = []
    page = 1
    while page <= max_pages:
        url = f"https://api.digikala.com/v1/product/{product_id}/comments/?page={page}"
        payload = get_json(url)
        if not payload or "data" not in payload or "comments" not in payload["data"]:
            break
        batch = payload["data"]["comments"]
        if not batch:
            break
        collected.extend(
            {
                "product_id": product_id,
                "comment_id": c.get("id"),
                "title": c.get("title"),
                "body": c.get("body"),
                "rate": c.get("rate"),
                "date": c.get("created_at"),
                "user_name": c.get("user_name"),
            }
            for c in batch
        )
        # NOTE(review): paging is read from the top-level "metadata" key;
        # other endpoints in this file nest everything under "data" — confirm
        # this path against an actual response.
        total_pages = payload.get("metadata", {}).get("paging", {}).get("total_pages", 1)
        if page >= total_pages:
            break
        # Random delay between pages
        time.sleep(uniform(0.5, 1.2))
        page += 1
    return collected
# Function to fetch comments for all products
def get_all_comments(products_df):
    """Fetch comments for every product id in *products_df*.

    Writes all comments to samsung_comments.csv (utf-8-sig) and returns
    them as a DataFrame.
    """
    rows = []
    # Progress bar over the (long) per-product fetch loop.
    for product_id in tqdm(products_df["id"], desc="Fetching comments", ncols=100):
        rows += get_comments_for_product(product_id, max_pages=45)
        # Polite random pause between products.
        time.sleep(uniform(1.0, 2.0))
    # Persist everything in one go.
    df_comments = pd.DataFrame(rows)
    df_comments.to_csv("samsung_comments.csv", index=False, encoding="utf-8-sig")
    print(f"Saved {len(df_comments)} comments → samsung_comments.csv")
    return df_comments
# Script entry point: fetch the product catalogue first, then the
# comments for every product found.
if __name__ == "__main__":
    print("Fetching Samsung mobile products and comments from Digikala...")
    products = get_samsung_products(pages=45)
    get_all_comments(products)
    print("Done! Data saved successfully.")