Skip to content

Commit 2e9a00a

Browse files
authored
fix(data_collector): fix us_index collector.py Http Error 403 Forbidden; Remove FutureWarning (#2047)
* Fix 403 Forbidden error and remove FutureWarning: use fake_useragent to supply request headers; fix lint format error; add timeout to fix pylint error
1 parent d631b44 commit 2e9a00a

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

scripts/data_collector/us_index/collector.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
from pathlib import Path
88
from concurrent.futures import ThreadPoolExecutor
99
from typing import List
10+
from io import StringIO
1011

1112
import fire
1213
import requests
1314
import pandas as pd
1415
from tqdm import tqdm
1516
from loguru import logger
17+
from fake_useragent import UserAgent
1618

1719

1820
CUR_DIR = Path(__file__).resolve().parent
@@ -51,6 +53,7 @@ def __init__(
5153
)
5254

5355
self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
56+
self._ua = UserAgent()
5457

5558
@property
5659
@abc.abstractmethod
@@ -112,7 +115,8 @@ def calendar_list(self) -> List[pd.Timestamp]:
112115
return _calendar_list
113116

114117
def _request_new_companies(self) -> requests.Response:
115-
resp = requests.get(self._target_url, timeout=None)
118+
headers = {"User-Agent": self._ua.random}
119+
resp = requests.get(self._target_url, timeout=None, headers=headers)
116120
if resp.status_code != 200:
117121
raise ValueError(f"request error: {self._target_url}")
118122

@@ -128,7 +132,7 @@ def set_default_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
128132
def get_new_companies(self):
129133
logger.info(f"get new companies {self.index_name} ......")
130134
_data = deco_retry(retry=self._request_retry, retry_sleep=self._retry_sleep)(self._request_new_companies)()
131-
df_list = pd.read_html(_data.text)
135+
df_list = pd.read_html(StringIO(_data.text))
132136
for _df in df_list:
133137
_df = self.filter_df(_df)
134138
if (_df is not None) and (not _df.empty):
@@ -226,7 +230,11 @@ def bench_start_date(self) -> pd.Timestamp:
226230
def get_changes(self) -> pd.DataFrame:
227231
logger.info(f"get sp500 history changes......")
228232
# NOTE: may update the index of the table
229-
changes_df = pd.read_html(self.WIKISP500_CHANGES_URL)[-1]
233+
# Add headers to avoid 403 Forbidden error from Wikipedia
234+
headers = {"User-Agent": self._ua.random}
235+
response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers, timeout=None)
236+
response.raise_for_status()
237+
changes_df = pd.read_html(StringIO(response.text))[-1]
230238
changes_df = changes_df.iloc[:, [0, 1, 3]]
231239
changes_df.columns = [self.DATE_FIELD_NAME, self.ADD, self.REMOVE]
232240
changes_df[self.DATE_FIELD_NAME] = pd.to_datetime(changes_df[self.DATE_FIELD_NAME])

scripts/data_collector/us_index/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ requests
33
pandas
44
lxml
55
loguru
6+
fake-useragent

0 commit comments

Comments (0)