Skip to content

Commit d9658b2

Browse files
Fix URL substring sanitization security issue
Co-authored-by: willtheorangeguy <18339050+willtheorangeguy@users.noreply.github.com>
1 parent 7086252 commit d9658b2

File tree

1 file changed

+36
-13
lines changed

1 file changed

+36
-13
lines changed

scraper.py

Lines changed: 36 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import logging
1414
from pathlib import Path
1515
from typing import Dict, List, Optional, Tuple
16+
from urllib.parse import urlparse
1617

1718
import requests
1819
from bs4 import BeautifulSoup
@@ -102,19 +103,41 @@ def scrape_build_prices(self, url: str) -> Dict[str, Tuple[str, str]]:
102103
retailer_href = retailer_link.get('href', '')
103104
retailer_text = retailer_link.get_text(strip=True)
104105

105-
# Extract retailer name from URL
106-
if 'amazon' in retailer_href.lower():
107-
retailer = 'Amazon Canada'
108-
elif 'newegg' in retailer_href.lower():
109-
retailer = 'Newegg Canada'
110-
elif 'bestbuy' in retailer_href.lower():
111-
retailer = 'Best Buy Canada'
112-
elif 'vuugo' in retailer_href.lower():
113-
retailer = 'Vuugo'
114-
elif 'canadacomputers' in retailer_href.lower() or 'cc.com' in retailer_href.lower():
115-
retailer = 'Canada Computers'
116-
elif retailer_text and retailer_text.lower() not in ['buy', 'add']:
117-
retailer = retailer_text
106+
# Parse the URL and match on its exact host instead of a substring of the href
107+
try:
108+
parsed_url = urlparse(retailer_href)
109+
domain = parsed_url.netloc.lower()
110+
111+
# Whitelist of known trusted retailer domains
112+
# This is for display purposes only, not security-sensitive
113+
trusted_retailers = {
114+
'www.amazon.ca': 'Amazon Canada',
115+
'amazon.ca': 'Amazon Canada',
116+
'www.amazon.com': 'Amazon Canada',
117+
'amazon.com': 'Amazon Canada',
118+
'www.newegg.ca': 'Newegg Canada',
119+
'newegg.ca': 'Newegg Canada',
120+
'www.newegg.com': 'Newegg Canada',
121+
'newegg.com': 'Newegg Canada',
122+
'www.bestbuy.ca': 'Best Buy Canada',
123+
'bestbuy.ca': 'Best Buy Canada',
124+
'www.bestbuy.com': 'Best Buy Canada',
125+
'bestbuy.com': 'Best Buy Canada',
126+
'www.vuugo.com': 'Vuugo',
127+
'vuugo.com': 'Vuugo',
128+
'www.canadacomputers.com': 'Canada Computers',
129+
'canadacomputers.com': 'Canada Computers',
130+
}
131+
132+
retailer = trusted_retailers.get(domain, 'Unknown')
133+
134+
# If domain not in whitelist, try to use the link text
135+
if retailer == 'Unknown' and retailer_text and retailer_text.lower() not in ['buy', 'add']:
136+
retailer = retailer_text
137+
except Exception:
138+
# If URL parsing fails, try to use the text
139+
if retailer_text and retailer_text.lower() not in ['buy', 'add']:
140+
retailer = retailer_text
118141

119142
# Clean up price text (remove "Add", "From", etc.)
120143
price_match = re.search(r'\$[\d,]+\.?\d*', price_text)

0 commit comments

Comments
 (0)