|
13 | 13 | import logging |
14 | 14 | from pathlib import Path |
15 | 15 | from typing import Dict, List, Optional, Tuple |
| 16 | +from urllib.parse import urlparse |
16 | 17 |
|
17 | 18 | import requests |
18 | 19 | from bs4 import BeautifulSoup |
@@ -102,19 +103,41 @@ def scrape_build_prices(self, url: str) -> Dict[str, Tuple[str, str]]: |
102 | 103 | retailer_href = retailer_link.get('href', '') |
103 | 104 | retailer_text = retailer_link.get_text(strip=True) |
104 | 105 |
|
105 | | - # Extract retailer name from URL |
106 | | - if 'amazon' in retailer_href.lower(): |
107 | | - retailer = 'Amazon Canada' |
108 | | - elif 'newegg' in retailer_href.lower(): |
109 | | - retailer = 'Newegg Canada' |
110 | | - elif 'bestbuy' in retailer_href.lower(): |
111 | | - retailer = 'Best Buy Canada' |
112 | | - elif 'vuugo' in retailer_href.lower(): |
113 | | - retailer = 'Vuugo' |
114 | | - elif 'canadacomputers' in retailer_href.lower() or 'cc.com' in retailer_href.lower(): |
115 | | - retailer = 'Canada Computers' |
116 | | - elif retailer_text and retailer_text.lower() not in ['buy', 'add']: |
117 | | - retailer = retailer_text |
| 106 | + # Parse the URL and match on its exact host name instead of a substring of the link |
| 107 | + try: |
| 108 | + parsed_url = urlparse(retailer_href) |
| 109 | + domain = parsed_url.netloc.lower() |
| 110 | + |
| 111 | + # Whitelist of known trusted retailer domains |
| 112 | + # This is for display purposes only, not security-sensitive |
| 113 | + trusted_retailers = { |
| 114 | + 'www.amazon.ca': 'Amazon Canada', |
| 115 | + 'amazon.ca': 'Amazon Canada', |
| 116 | + 'www.amazon.com': 'Amazon Canada', |
| 117 | + 'amazon.com': 'Amazon Canada', |
| 118 | + 'www.newegg.ca': 'Newegg Canada', |
| 119 | + 'newegg.ca': 'Newegg Canada', |
| 120 | + 'www.newegg.com': 'Newegg Canada', |
| 121 | + 'newegg.com': 'Newegg Canada', |
| 122 | + 'www.bestbuy.ca': 'Best Buy Canada', |
| 123 | + 'bestbuy.ca': 'Best Buy Canada', |
| 124 | + 'www.bestbuy.com': 'Best Buy Canada', |
| 125 | + 'bestbuy.com': 'Best Buy Canada', |
| 126 | + 'www.vuugo.com': 'Vuugo', |
| 127 | + 'vuugo.com': 'Vuugo', |
| 128 | + 'www.canadacomputers.com': 'Canada Computers', |
| 129 | + 'canadacomputers.com': 'Canada Computers', |
| 130 | + } |
| 131 | + |
| 132 | + retailer = trusted_retailers.get(domain, 'Unknown') |
| 133 | + |
| 134 | + # If domain not in whitelist, try to use the link text |
| 135 | + if retailer == 'Unknown' and retailer_text and retailer_text.lower() not in ['buy', 'add']: |
| 136 | + retailer = retailer_text |
| 137 | + except Exception: |
| 138 | + # If URL parsing fails, try to use the text |
| 139 | + if retailer_text and retailer_text.lower() not in ['buy', 'add']: |
| 140 | + retailer = retailer_text |
118 | 141 |
|
119 | 142 | # Clean up price text (remove "Add", "From", etc.) |
120 | 143 | price_match = re.search(r'\$[\d,]+\.?\d*', price_text) |
|
0 commit comments