77from pathlib import Path
88from concurrent .futures import ThreadPoolExecutor
99from typing import List
10+ from io import StringIO
1011
1112import fire
1213import requests
1314import pandas as pd
1415from tqdm import tqdm
1516from loguru import logger
17+ from fake_useragent import UserAgent
1618
1719
1820CUR_DIR = Path (__file__ ).resolve ().parent
@@ -51,6 +53,7 @@ def __init__(
5153 )
5254
5355 self ._target_url = f"{ WIKI_URL } /{ WIKI_INDEX_NAME_MAP [self .index_name .upper ()]} "
56+ self ._ua = UserAgent ()
5457
5558 @property
5659 @abc .abstractmethod
@@ -112,7 +115,8 @@ def calendar_list(self) -> List[pd.Timestamp]:
112115 return _calendar_list
113116
114117 def _request_new_companies (self ) -> requests .Response :
115- resp = requests .get (self ._target_url , timeout = None )
118+ headers = {"User-Agent" : self ._ua .random }
119+ resp = requests .get (self ._target_url , timeout = None , headers = headers )
116120 if resp .status_code != 200 :
117121 raise ValueError (f"request error: { self ._target_url } " )
118122
@@ -128,7 +132,7 @@ def set_default_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
128132 def get_new_companies (self ):
129133 logger .info (f"get new companies { self .index_name } ......" )
130134 _data = deco_retry (retry = self ._request_retry , retry_sleep = self ._retry_sleep )(self ._request_new_companies )()
131- df_list = pd .read_html (_data .text )
135+ df_list = pd .read_html (StringIO ( _data .text ) )
132136 for _df in df_list :
133137 _df = self .filter_df (_df )
134138 if (_df is not None ) and (not _df .empty ):
@@ -226,7 +230,11 @@ def bench_start_date(self) -> pd.Timestamp:
226230 def get_changes (self ) -> pd .DataFrame :
227231 logger .info (f"get sp500 history changes......" )
228232 # NOTE: may update the index of the table
229- changes_df = pd .read_html (self .WIKISP500_CHANGES_URL )[- 1 ]
233+ # Add headers to avoid 403 Forbidden error from Wikipedia
234+ headers = {"User-Agent" : self ._ua .random }
235+ response = requests .get (self .WIKISP500_CHANGES_URL , headers = headers , timeout = None )
236+ response .raise_for_status ()
237+ changes_df = pd .read_html (StringIO (response .text ))[- 1 ]
230238 changes_df = changes_df .iloc [:, [0 , 1 , 3 ]]
231239 changes_df .columns = [self .DATE_FIELD_NAME , self .ADD , self .REMOVE ]
232240 changes_df [self .DATE_FIELD_NAME ] = pd .to_datetime (changes_df [self .DATE_FIELD_NAME ])
0 commit comments