Skip to content

Commit 9f4ebe7

Browse files
committed
refactor: centralize get_requests_session() in shared.get_requests_session()
1 parent 0d44547 commit 9f4ebe7

File tree

6 files changed

+44
-76
lines changed

6 files changed

+44
-76
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,6 @@
2222
from pygments import highlight
2323
from pygments.formatters import TerminalFormatter
2424
from pygments.lexers import PythonTracebackLexer
25-
from requests.adapters import HTTPAdapter
26-
from urllib3.util.retry import Retry
2725

2826
# Add parent directory so shared can be imported
2927
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -335,17 +333,7 @@ def initialize_all_data_files(args):
335333
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
336334

337335

338-
def get_requests_session():
339-
"""Create request session with retry logic"""
340-
retry_strategy = Retry(
341-
total=5,
342-
backoff_factor=10,
343-
status_forcelist=shared.STATUS_FORCELIST,
344-
)
345-
session = requests.Session()
346-
session.headers.update({"User-Agent": shared.USER_AGENT})
347-
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
348-
return session
336+
session = shared.get_requests_session()
349337

350338

351339
def normalize_license_text(raw_text):
@@ -533,7 +521,7 @@ def query_arxiv(args):
533521
"""
534522

535523
LOGGER.info("Beginning to fetch results from ArXiv API")
536-
session = get_requests_session()
524+
session = shared.get_requests_session()
537525

538526
results_per_iteration = 50
539527

scripts/1-fetch/europeana_fetch.py

Lines changed: 2 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,6 @@
2323
from pygments import highlight
2424
from pygments.formatters import TerminalFormatter
2525
from pygments.lexers import PythonTracebackLexer
26-
from requests.adapters import HTTPAdapter, Retry
2726

2827
# Add parent directory for shared imports
2928
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -103,17 +102,7 @@ def parse_arguments():
103102
return args
104103

105104

106-
def get_requests_session():
107-
"""Create a requests session with retry."""
108-
max_retries = Retry(
109-
total=5, backoff_factor=10, status_forcelist=shared.STATUS_FORCELIST
110-
)
111-
session = requests.Session()
112-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
113-
session.headers.update(
114-
{"accept": "application/json", "User-Agent": shared.USER_AGENT}
115-
)
116-
return session
105+
session = shared.get_requests_session(accept_header="application/json")
117106

118107

119108
def simplify_legal_tool(legal_tool):
@@ -433,7 +422,7 @@ def main():
433422
"EUROPEANA_API_KEY not found in environment variables", 1
434423
)
435424

436-
session = get_requests_session()
425+
session = shared.get_requests_session()
437426

438427
# Fetch facet lists once, including counts
439428
providers_full = get_facet_list(session, "DATA_PROVIDER")

scripts/1-fetch/github_fetch.py

Lines changed: 6 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,6 @@
1717
from pygments import highlight
1818
from pygments.formatters import TerminalFormatter
1919
from pygments.lexers import PythonTracebackLexer
20-
from requests.adapters import HTTPAdapter
21-
from urllib3.util.retry import Retry
2220

2321
# Add parent directory so shared can be imported
2422
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -80,23 +78,11 @@ def check_for_completion():
8078
pass # File may not be found without --enable-save, etc.
8179

8280

83-
def get_requests_session():
84-
max_retries = Retry(
85-
total=5,
86-
backoff_factor=10,
87-
status_forcelist=shared.STATUS_FORCELIST,
88-
)
89-
session = requests.Session()
90-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
91-
headers = {
92-
"accept": "application/vnd.github+json",
93-
"User-Agent": shared.USER_AGENT,
94-
}
95-
if GH_TOKEN:
96-
headers["authorization"] = f"Bearer {GH_TOKEN}"
97-
session.headers.update(headers)
98-
99-
return session
81+
session = shared.get_requests_session(
82+
headers={"accept": "application/vnd.github+json"},
83+
auth_token=GH_TOKEN,
84+
auth_prefix="Bearer",
85+
)
10086

10187

10288
def write_data(args, tool_data):
@@ -162,7 +148,7 @@ def main():
162148
args = parse_arguments()
163149
shared.paths_log(LOGGER, PATHS)
164150
check_for_completion()
165-
session = get_requests_session()
151+
session = shared.get_requests_session()
166152
tool_data = query_github(args, session)
167153
args = write_data(args, tool_data)
168154
args = shared.git_add_and_commit(

scripts/1-fetch/openverse_fetch.py

Lines changed: 2 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,6 @@
2525
from pygments import highlight
2626
from pygments.formatters import TerminalFormatter
2727
from pygments.lexers import PythonTracebackLexer
28-
from requests.adapters import HTTPAdapter
29-
from urllib3.util.retry import Retry
3028

3129
# Add parent directory so shared can be imported
3230
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -83,18 +81,7 @@ def parse_arguments():
8381
return args
8482

8583

86-
def get_requests_session():
87-
max_retries = Retry(
88-
total=5,
89-
backoff_factor=10,
90-
status_forcelist=shared.STATUS_FORCELIST,
91-
)
92-
session = requests.Session()
93-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
94-
session.headers.update(
95-
{"accept": "application/json", "User-Agent": shared.USER_AGENT}
96-
)
97-
return session
84+
session = shared.get_requests_session(accept_header="application/json")
9885

9986

10087
def get_all_sources_and_licenses(session, media_type):
@@ -225,7 +212,7 @@ def write_data(args, data):
225212

226213
def main():
227214
args = parse_arguments()
228-
session = get_requests_session()
215+
session = shared.get_requests_session()
229216
LOGGER.info("Starting Openverse Fetch Script...")
230217
records = query_openverse(session)
231218
write_data(args, records)

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -13,12 +13,9 @@
1313
from operator import itemgetter
1414

1515
# Third-party
16-
import requests
1716
from pygments import highlight
1817
from pygments.formatters import TerminalFormatter
1918
from pygments.lexers import PythonTracebackLexer
20-
from requests.adapters import HTTPAdapter
21-
from urllib3.util.retry import Retry
2219

2320
# Add parent directory so shared can be imported
2421
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -66,16 +63,7 @@ def parse_arguments():
6663
return args
6764

6865

69-
def get_requests_session():
70-
max_retries = Retry(
71-
total=5,
72-
backoff_factor=10,
73-
status_forcelist=shared.STATUS_FORCELIST,
74-
)
75-
session = requests.Session()
76-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
77-
session.headers.update({"User-Agent": shared.USER_AGENT})
78-
return session
66+
session = shared.get_requests_session()
7967

8068

8169
def write_data(args, tool_data):
@@ -173,7 +161,7 @@ def main():
173161
args = parse_arguments()
174162
shared.paths_log(LOGGER, PATHS)
175163
shared.git_fetch_and_merge(args, PATHS["repo"])
176-
tool_data = query_wikipedia_languages(get_requests_session())
164+
tool_data = query_wikipedia_languages(shared.get_requests_session())
177165
args = write_data(args, tool_data)
178166
args = shared.git_add_and_commit(
179167
args,

scripts/shared.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,10 @@
55
from datetime import datetime, timezone
66

77
# Third-party
8+
import requests
89
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
910
from pandas import PeriodIndex
11+
from requests.adapters import HTTPAdapter, Retry
1012

1113
# Constants
1214
STATUS_FORCELIST = [
@@ -31,6 +33,34 @@ def __init__(self, message, exit_code=None):
3133
super().__init__(self.message)
3234

3335

36+
def get_requests_session(
    accept_header: str | None = None,
    auth_token: str | None = None,
    mount_https: bool = True,
    headers: dict[str, str] | None = None,
    auth_prefix: str = "Bearer",
) -> requests.Session:
    """Create a reusable requests session with retry logic.

    Args:
        accept_header: Value for the ``accept`` request header. Omitted when
            falsy.
        auth_token: Credential placed in the ``authorization`` request
            header. Omitted when falsy.
        mount_https: When true, mount an ``HTTPAdapter`` on ``https://``
            that retries up to 5 times (backoff factor 10) on the status
            codes in ``STATUS_FORCELIST``. When false, the session keeps the
            default adapters that ``requests.Session()`` installs.
        headers: Extra request headers. Applied last, so they override the
            default ``User-Agent`` as well as ``accept_header`` and
            ``auth_token`` when the same header name is supplied.
        auth_prefix: Scheme placed before ``auth_token`` in the
            ``authorization`` header. An empty prefix sends the bare token.

    Returns:
        A configured ``requests.Session``.
    """
    session = requests.Session()

    # Mount the retry adapter exactly once, and only when requested, so that
    # mount_https=False actually leaves the default HTTPS adapter in place.
    if mount_https:
        retry_strategy = Retry(
            total=5,
            backoff_factor=10,
            status_forcelist=STATUS_FORCELIST,
        )
        session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    session_headers = {"User-Agent": USER_AGENT}
    if accept_header:
        session_headers["accept"] = accept_header
    if auth_token:
        session_headers["authorization"] = (
            f"{auth_prefix} {auth_token}" if auth_prefix else auth_token
        )
    if headers:
        # Copy into our own dict; the caller's mapping is never mutated.
        session_headers.update(headers)

    session.headers.update(session_headers)
    return session
62+
63+
3464
def git_fetch_and_merge(args, repo_path, branch=None):
3565
if not args.enable_git:
3666
return

0 commit comments

Comments
 (0)