Skip to content

Commit 6116fe9

Browse files
committed
use fake_useragent
1 parent 9c5f004 commit 6116fe9

File tree

2 files changed

5 additions and 2 deletions

2 files changed

5 additions and 2 deletions

scripts/data_collector/us_index/collector.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
import pandas as pd
1515
from tqdm import tqdm
1616
from loguru import logger
17+
from fake_useragent import UserAgent
1718

1819

1920
CUR_DIR = Path(__file__).resolve().parent
@@ -52,6 +53,7 @@ def __init__(
5253
)
5354

5455
self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
56+
self._ua = UserAgent()
5557

5658
@property
5759
@abc.abstractmethod
@@ -114,7 +116,7 @@ def calendar_list(self) -> List[pd.Timestamp]:
114116

115117
def _request_new_companies(self) -> requests.Response:
116118
headers = {
117-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
119+
'User-Agent': self._ua.random
118120
}
119121
resp = requests.get(self._target_url, timeout=None, headers=headers)
120122
if resp.status_code != 200:
@@ -232,7 +234,7 @@ def get_changes(self) -> pd.DataFrame:
232234
# NOTE: may update the index of the table
233235
# Add headers to avoid 403 Forbidden error from Wikipedia
234236
headers = {
235-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
237+
'User-Agent': self._ua.random
236238
}
237239
response = requests.get(self.WIKISP500_CHANGES_URL, headers=headers)
238240
response.raise_for_status()

scripts/data_collector/us_index/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,3 +3,4 @@ requests
33
pandas
44
lxml
55
loguru
6+
fake-useragent

0 commit comments

Comments
 (0)