from typing import Dict, List
from pathlib import Path
from loguru import logger as log
-from urllib.parse import parse_qs, urlencode, urlparse
-from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
+import re
+from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    "asp": True,
    # set the proxy country to US
    "country": "US",
+    # enable JavaScript rendering
+    "render_js": True,
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


-def find_hidden_data(result: ScrapeApiResponse) -> dict:
-    """extract hidden NEXT_DATA from page html"""
-    data = result.selector.css("script#__NEXT_DATA__::text").get()
-    data = json.loads(data)
-    return data
+def convert_to_json_urls(urls: List[str]) -> List[str]:
+    """convert product page URLs to their JSON product API equivalents"""
+    converted = []
+    for url in urls:
+        # Replace '/p/' with '/products/' and add '.json' at the end
+        new_url = url.replace("/p/", "/products/") + ".json"
+        converted.append(new_url)
+    return converted


-async def scrape_products(urls: List[str]) -> dict:
-    """scrape fashionphile product pages for product data"""
+async def scrape_products(urls: List[str]) -> List[Dict]:
+    """
+    Scrape product data from Fashionphile product pages using the product API.
+    """
+    urls = convert_to_json_urls(urls)
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    products = []
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
-        data = find_hidden_data(response)
-        product = data["props"]["pageProps"]["initialState"]["productPageReducer"]["productData"]
-        products.append(product)
-    log.success(f"scraped {len(products)} product listings from product pages")
+        # Extract just the product data from the JSON content
+        content = response.result["result"]["content"]
+        product_data = json.loads(content)["product"]
+        products.append(product_data)
+    log.success(f"scraped {len(products)} product listings from product pages")
    return products


-def update_url_parameter(url, **params):
-    """update url query parameter of an url with new values"""
-    current_params = parse_qs(urlparse(url).query)
-    updated_query_params = urlencode({**current_params, **params}, doseq=True)
-    return f"{url.split('?')[0]}?{updated_query_params}"
+def parse_price(price_text: str) -> int:
+    """parse a price string such as "$1,234" into an integer dollar amount"""
+    if not price_text:
+        return 0
+    # Remove $ and commas, convert to int
+    return int(re.sub(r"[$,]", "", price_text.strip()))
+
+
+def extract_product_from_card(card_selector) -> Dict:
+    """Extract product data from a product card HTML element"""
+
+    # Get product ID from data attribute
+    product_id = card_selector.css("::attr(data-product-id)").get("")
+
+    # Get brand name
+    brand_name = card_selector.css(".fp-card__vendor::text").get("").strip()
+
+    # Get product name
+    product_name = card_selector.css(".fp-card__link__product-name::text").get("").strip()
+
+    # Get condition
+    condition = card_selector.css(".fp-condition::text").get("").strip()
+
+    # Get prices
+    regular_price_text = card_selector.css(".price-item--regular::text").get("").strip()
+    sale_price_text = card_selector.css(".price-item--sale.price-item--last::text").get("").strip()
+
+    # Prefer the sale price; if there is none, use the regular price as the final price
+    if sale_price_text:
+        price_text = sale_price_text
+    elif regular_price_text:
+        price_text = regular_price_text
+    else:
+        # Fallback: try to find any price
+        price_text = card_selector.css(".price-item::text").get("$0").strip()
+
+    price = parse_price(price_text)
+
+    # Calculate the discount amount (regular price minus sale price)
+    if regular_price_text and sale_price_text:
+        regular = parse_price(regular_price_text)
+        discounted_price = regular - price
+    else:
+        discounted_price = 0
+
+    # Build result matching search_schema
+    result = {
+        "brand_name": brand_name,
+        "product_name": product_name,
+        "condition": condition,
+        "discounted_price": discounted_price,
+        "price": price,
+        "id": int(product_id) if product_id else 0,
+    }
+
+    return result


async def scrape_search(url: str, max_pages: int = 10) -> List[Dict]:
-    log.info(f"scraping search page {url}")
-    # scrape first page
+    # Scrape first page
    result_first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
-    data_first_page = find_hidden_data(result_first_page)
-    data_first_page = data_first_page["props"]["pageProps"]["serverState"]["initialResults"][
-        "prod_ecom_products_date_desc"
-    ]["results"][0]
-    results = data_first_page["hits"]
-
-    # find total page count
-    total_pages = data_first_page["nbPages"]
+    selector = result_first_page.selector
+
+    # Find all product cards
+    product_cards = selector.css(".fp-algolia-product-card")
+    log.info(f"found {len(product_cards)} products on first page")
+
+    # Extract data from each card
+    results = []
+    for card in product_cards:
+        try:
+            product_data = extract_product_from_card(card)
+            results.append(product_data)
+        except Exception as e:
+            log.warning(f"failed to extract product: {e}")
+            continue
+
+    # Find the total page count from the pagination links
+    pagination_href = selector.css(".ais-Pagination-item--lastPage a::attr(href)").get("")
+    if pagination_href:
+        match = re.search(r"page=(\d+)", pagination_href)
+        if match:
+            total_pages = int(match.group(1))
+        else:
+            total_pages = 1
+    else:
+        total_pages = 1
+
    if max_pages and max_pages < total_pages:
        total_pages = max_pages
-
-    # scrape remaining pages
-    log.info(f"scraping search pagination ({total_pages - 1} more pages)")
-    to_scrape = [
-        ScrapeConfig(update_url_parameter(url, page=page), **BASE_CONFIG) for page in range(2, total_pages + 1)
-    ]
-    async for result in SCRAPFLY.concurrent_scrape(to_scrape):
-        data = find_hidden_data(result)
-        data = data["props"]["pageProps"]["serverState"]["initialResults"]["prod_ecom_products_date_desc"]["results"][0]
-        results.extend(data["hits"])
-    log.success(f"scraped {len(results)} product listings from search pages")
-    return results
+
+    log.info(f"total pages: {total_pages}")
+
+    # Scrape remaining pages
+    if total_pages > 1:
+        log.info(f"scraping pagination ({total_pages - 1} more pages)")
+
+        # Build URLs for the remaining pages
+        base_url = url.split("?")[0]
+        to_scrape = []
+        for page in range(2, total_pages + 1):
+            page_url = f"{base_url}?page={page}"
+            to_scrape.append(ScrapeConfig(page_url, **BASE_CONFIG))
+
+        # Scrape the remaining pages concurrently
+        async for result in SCRAPFLY.concurrent_scrape(to_scrape):
+            product_cards = result.selector.css(".fp-algolia-product-card")
+
+            for card in product_cards:
+                try:
+                    product_data = extract_product_from_card(card)
+                    results.append(product_data)
+                except Exception as e:
+                    log.warning(f"failed to extract product: {e}")
+                    continue
+
+    log.info(f"scraped {len(results)} product listings from search pages")
+    return results
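
A minimal usage sketch of the two scrapers above (not part of the commit itself; the search URL, page limit, product URL, and output filenames are illustrative assumptions):

# example runner -- the URLs below are placeholders, not confirmed Fashionphile URLs
import asyncio
import json


async def run():
    # scrape a search page and save the listings extracted from the product cards
    search_results = await scrape_search("https://www.fashionphile.com/shop?sort=date-desc", max_pages=2)
    output.joinpath("search.json").write_text(json.dumps(search_results, indent=2, ensure_ascii=False))

    # scrape product pages by their /p/ URLs (converted internally to /products/<handle>.json)
    product_urls = [
        "https://www.fashionphile.com/p/example-product-123456",  # hypothetical URL for illustration
    ]
    products = await scrape_products(product_urls)
    output.joinpath("products.json").write_text(json.dumps(products, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(run())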