Commit 1d91189

Refactor Fashionphile scraper
1 parent d781ee1 · commit 1d91189

1 file changed

fashionphile-scraper/fashionphile.py

Lines changed: 132 additions & 40 deletions
@@ -10,8 +10,8 @@
 from typing import Dict, List
 from pathlib import Path
 from loguru import logger as log
-from urllib.parse import parse_qs, urlencode, urlparse
-from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
+import re
+from scrapfly import ScrapeConfig, ScrapflyClient
 
 SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
 
@@ -20,61 +20,153 @@
     "asp": True,
     # set the proxy country to US
     "country": "US",
+    "render_js": True,
 }
 
 output = Path(__file__).parent / "results"
 output.mkdir(exist_ok=True)
 
 
-def find_hidden_data(result: ScrapeApiResponse) -> dict:
-    """extract hidden NEXT_DATA from page html"""
-    data = result.selector.css("script#__NEXT_DATA__::text").get()
-    data = json.loads(data)
-    return data
+def convert_to_json_urls(urls):
+    converted = []
+    for url in urls:
+        # Replace '/p/' with '/products/' and add '.json' at the end
+        new_url = url.replace("/p/", "/products/") + ".json"
+        converted.append(new_url)
+    return converted
 
 
-async def scrape_products(urls: List[str]) -> dict:
-    """scrape fashionphile product pages for product data"""
+async def scrape_products(urls: List[str]) -> List[Dict]:
+    """
+    Scrape product data from Fashionphile product pages using the product API.
+    """
+    urls = convert_to_json_urls(urls)
     to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
     products = []
     async for response in SCRAPFLY.concurrent_scrape(to_scrape):
-        data = find_hidden_data(response)
-        product = data["props"]["pageProps"]["initialState"]["productPageReducer"]["productData"]
-        products.append(product)
-    log.success(f"scraped {len(products)} product listings from product pages")
+        # Extract just the product data from the JSON content
+        content = response.result['result']['content']
+        product_data = json.loads(content)['product']
+        products.append(product_data)
+    log.success(f"scraped {len(products)} product listings from product pages")
     return products
 
 
-def update_url_parameter(url, **params):
-    """update url query parameter of an url with new values"""
-    current_params = parse_qs(urlparse(url).query)
-    updated_query_params = urlencode({**current_params, **params}, doseq=True)
-    return f"{url.split('?')[0]}?{updated_query_params}"
+def parse_price(price_text: str) -> int:
+    if not price_text:
+        return 0
+    # Remove $ and commas, convert to int
+    return int(re.sub(r'[$,]', '', price_text.strip()))
+
+
+def extract_product_from_card(card_selector) -> Dict:
+    """Extract product data from a product card HTML element"""
+
+    # Get product ID from data attribute
+    product_id = card_selector.css('::attr(data-product-id)').get('')
+
+    # Get brand name
+    brand_name = card_selector.css('.fp-card__vendor::text').get('').strip()
+
+    # Get product name
+    product_name = card_selector.css('.fp-card__link__product-name::text').get('').strip()
+
+    # Get condition
+    condition = card_selector.css('.fp-condition::text').get('').strip()
+
+    # Get prices
+    regular_price_text = card_selector.css('.price-item--regular::text').get('').strip()
+    sale_price_text = card_selector.css('.price-item--sale.price-item--last::text').get('').strip()
+
+    # If no sale price, use regular price as final price
+    if sale_price_text:
+        price_text = sale_price_text
+    elif regular_price_text:
+        price_text = regular_price_text
+    else:
+        # Fallback: try to find any price
+        price_text = card_selector.css('.price-item::text').get('$0').strip()
+
+    price = parse_price(price_text)
+
+    # Calculate discounted price
+    if regular_price_text and sale_price_text:
+        regular = parse_price(regular_price_text)
+        discounted_price = regular - price
+    else:
+        discounted_price = 0
+
+
+    # Build result matching search_schema
+    result = {
+        "brand_name": brand_name,
+        "product_name": product_name,
+        "condition": condition,
+        "discounted_price": discounted_price,
+        "price": price,
+        "id": int(product_id) if product_id else 0
+    }
+
+    return result
 
 
 async def scrape_search(url: str, max_pages: int = 10) -> List[Dict]:
-    log.info(f"scraping search page {url}")
-    # scrape first page
+    # Scrape first page
    result_first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
-    data_first_page = find_hidden_data(result_first_page)
-    data_first_page = data_first_page["props"]["pageProps"]["serverState"]["initialResults"][
-        "prod_ecom_products_date_desc"
-    ]["results"][0]
-    results = data_first_page["hits"]
-
-    # find total page count
-    total_pages = data_first_page["nbPages"]
+    selector = result_first_page.selector
+
+    # Find all product cards
+    product_cards = selector.css('.fp-algolia-product-card')
+    log.info(f"found {len(product_cards)} products on first page")
+
+    # Extract data from each card
+    results = []
+    for card in product_cards:
+        try:
+            product_data = extract_product_from_card(card)
+            results.append(product_data)
+        except Exception as e:
+            log.warning(f"failed to extract product: {e}")
+            continue
+
+    # Find total pages from pagination
+    pagination_href = selector.css('.ais-Pagination-item--lastPage a::attr(href)').get('')
+    if pagination_href:
+        match = re.search(r'page=(\d+)', pagination_href)
+        if match:
+            total_pages = int(match.group(1))
+        else:
+            total_pages = 1
+    else:
+        total_pages = 1
+
     if max_pages and max_pages < total_pages:
         total_pages = max_pages
-
-    # scrape remaining pages
-    log.info(f"scraping search pagination ({total_pages-1} more pages)")
-    to_scrape = [
-        ScrapeConfig(update_url_parameter(url, page=page), **BASE_CONFIG) for page in range(2, total_pages + 1)
-    ]
-    async for result in SCRAPFLY.concurrent_scrape(to_scrape):
-        data = find_hidden_data(result)
-        data = data["props"]["pageProps"]["serverState"]["initialResults"]["prod_ecom_products_date_desc"]["results"][0]
-        results.extend(data["hits"])
-    log.success(f"scraped {len(results)} product listings from search pages")
-    return results
+
+    log.info(f"total pages: {total_pages}")
+
+    # Scrape remaining pages
+    if total_pages > 1:
+        log.info(f"scraping pagination ({total_pages-1} more pages)")
+
+        # Build URLs for remaining pages
+        base_url = url.split('?')[0]
+        to_scrape = []
+        for page in range(2, total_pages + 1):
+            page_url = f"{base_url}?page={page}"
+            to_scrape.append(ScrapeConfig(page_url, **BASE_CONFIG))
+
+        # Scrape concurrently
+        async for result in SCRAPFLY.concurrent_scrape(to_scrape):
+            product_cards = result.selector.css('.fp-algolia-product-card')
+
+            for card in product_cards:
+                try:
+                    product_data = extract_product_from_card(card)
+                    results.append(product_data)
+                except Exception as e:
+                    log.warning(f"failed to extract product: {e}")
+                    continue
+
+    log.info(f"scraped {len(results)} product listings from search pages")
+    return results
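
For reference, below is a minimal driver sketch for the refactored scrape_search and scrape_products functions. It is not part of this commit: the entry-point file name, the search URL, and the product URL are all assumptions; it presumes SCRAPFLY_KEY is set and that the file sits next to fashionphile.py so the imports resolve.

# run_example.py -- illustrative sketch only, not part of this commit.
# Assumes SCRAPFLY_KEY is set and this file sits next to fashionphile.py.
# Both URLs below are placeholders, not verified Fashionphile links.
import asyncio
import json

from fashionphile import output, scrape_products, scrape_search


async def run():
    # Search scrape: first two pages of a (placeholder) search URL.
    search_results = await scrape_search(
        "https://www.fashionphile.com/shop/categories/bags", max_pages=2
    )
    (output / "search.json").write_text(json.dumps(search_results, indent=2))

    # Product scrape: /p/ URLs are rewritten to /products/<handle>.json internally.
    product_results = await scrape_products(
        ["https://www.fashionphile.com/p/example-handle-123456"]
    )
    (output / "products.json").write_text(json.dumps(product_results, indent=2))


if __name__ == "__main__":
    asyncio.run(run())

Results land in the module's results/ directory, which fashionphile.py creates on import.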
