Skip to content

Commit 87335bb

Browse files
committed
URL now supports relative paths
1 parent 38f6545 commit 87335bb

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

scraper/src/config/config_loader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ class ConfigLoader:
6363

6464
nb_hits_max = 6000000
6565

66+
relative_url = False
67+
6668
def __init__(self, config):
6769
data = self._load_config(config)
6870

scraper/src/strategies/default_strategy.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,19 @@ def get_records_from_dom(self, current_page_url=None):
6666
if self.dom is None:
6767
sys.exit('DefaultStrategy.dom is not defined')
6868

69+
# Convert absolute URL to relative path
70+
original_url = current_page_url
71+
if self.config.relative_url and original_url:
72+
from urllib.parse import urlparse
73+
parsed = urlparse(original_url)
74+
# Construct a relative path (including path, parameters, and query parameters)
75+
relative_url = parsed.path
76+
if parsed.params: # Handle URL parameters (less common)
77+
relative_url += ';' + parsed.params
78+
if parsed.query: # Handle query parameters
79+
relative_url += '?' + parsed.query
80+
current_page_url = relative_url
81+
6982
# Reset it to be able to have a clean instance when testing
7083
self.global_content = {}
7184

0 commit comments

Comments
 (0)