Skip to content

Commit c57ac3d

Browse files
committed
Merge branch 'main' into feat/warc-by-cdx
2 parents 155db05 + b1fea6c commit c57ac3d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+1324
-164
lines changed

.github/workflows/ci-slow.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
- python-version: '3.12'
1717
os: macos-latest
1818
EXTRA: true
19-
- python-version: '3.7'
19+
- python-version: '3.8'
2020
os: windows-latest
2121
EXTRA: true
2222
- python-version: '3.12'

.github/workflows/ci.yaml

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,38 @@
11
name: CI
22

3-
on: workflow_dispatch
4-
# Disabled for this feature
5-
# on:
6-
# # runtime is erratic and up to an hour
7-
# push:
8-
# branches:
9-
# - main
10-
# pull_request:
11-
# branches:
12-
# - main
3+
on:
4+
push:
5+
branches:
6+
- main
7+
pull_request:
8+
branches:
9+
- main
1310

1411
jobs:
1512
unit-tests:
1613
runs-on: ${{ matrix.os }}
1714
strategy:
18-
fail-fast: false
19-
max-parallel: 1 # avoids ever triggering a rate limit
15+
fail-fast: true
2016
matrix:
21-
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
17+
python-version: [
18+
'3.8', '3.9', '3.10', '3.11', '3.12', '3.13'
19+
]
2220
os: [ubuntu-latest]
2321
EXTRA: [false] # used to force includes to get included
2422
include:
25-
- python-version: '3.12'
26-
os: ubuntu-latest
23+
- python-version: '3.8'
24+
os: ubuntu-22.04 # oldest version on github actions
2725
EXTRA: true
26+
- python-version: '3.13'
27+
os: ubuntu-latest
2828
env:
2929
LOGLEVEL=DEBUG
30-
- python-version: '3.7'
31-
os: ubuntu-20.04 # oldest version on github actions
30+
EXTRA: true
31+
- python-version: '3.13'
32+
os: macos-latest
33+
EXTRA: true
34+
- python-version: '3.13'
35+
os: windows-latest
3236
EXTRA: true
3337

3438
steps:
@@ -40,7 +44,12 @@ jobs:
4044
with:
4145
python-version: ${{ matrix.python-version }}
4246

47+
- name: Get Runner IP
48+
run: |
49+
echo "Runner IP: $(curl -s https://ipinfo.io/ip)"
50+
4351
- name: Install setuptools on python 3.12+
52+
# apparently this is a quirk of GitHub Actions runners?
4453
if: ${{ matrix.python-version >= '3.12' }}
4554
run: |
4655
pip install setuptools

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ __pycache__
44
cdx_toolkit.egg-info
55
.coverage
66
.eggs/
7+
tmp/

cdx_toolkit/myrequests.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,16 @@ def update_next_fetch(hostname, next_fetch):
5454
retry_info[hostname]['next_fetch'] = next_fetch
5555

5656

57-
def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
57+
def myrequests_get(
58+
url,
59+
params=None,
60+
headers=None,
61+
cdx=False,
62+
allow404=False,
63+
raise_error_after_n_errors: int = 100,
64+
raise_warning_after_n_errors: int = 10,
65+
retry_max_sec: int = 60,
66+
):
5867
t = time.time()
5968

6069
hostname = urlparse(url).hostname
@@ -84,7 +93,6 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
8493

8594
retry = True
8695
retry_sec = 2 * minimum_interval
87-
retry_max_sec = 60
8896
retries = 0
8997
connect_errors = 0
9098
while retry:
@@ -125,14 +133,17 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
125133
connect_errors += 1
126134
string = '{} failures for url {} {!r}: {}'.format(connect_errors, url, params, str(e))
127135

128-
if 'Name or service not known' in string:
136+
# Check for DNS errors with different operating systems
137+
if (('Name or service not known' in string) # linux
138+
or ('nodename nor servname provided, or not known' in string) # macos
139+
or ('getaddrinfo failed' in string)): # windows
129140
if dns_fatal(url):
130141
raise ValueError('invalid hostname in url '+url) from None
131142

132-
if connect_errors > 100:
143+
if connect_errors > raise_error_after_n_errors:
133144
LOGGER.error(string)
134145
raise ValueError(string)
135-
if connect_errors > 10:
146+
if connect_errors > raise_warning_after_n_errors:
136147
LOGGER.warning(string)
137148
LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec)+str(e))
138149
time.sleep(retry_max_sec) # notice the extra-long sleep

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ pytest-cov==2.12.1
1313
pytest-sugar==0.9.4
1414
coveralls==3.1.0
1515
botocore>=1.39.11
16+
responses==0.25.8
1617

1718
# packaging
1819
twine==3.4.1

setup.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# remember: keep requires synchronized with requirements.txt
1111
requires = ['requests', 'warcio', 'fsspec[s3]', 'aioboto3', 'surt', 'tqdm']
1212

13-
test_requirements = ['pytest', 'pytest-cov', 'boto3']
13+
test_requirements = ['pytest', 'pytest-cov', 'responses']
1414

1515
package_requirements = ['twine', 'setuptools', 'setuptools-scm']
1616

@@ -35,7 +35,7 @@
3535
author_email='lindahl@pbm.com',
3636
url='https://github.com/cocrawler/cdx_toolkit',
3737
packages=packages,
38-
python_requires=">=3.7",
38+
python_requires=">=3.8",
3939
extras_require=extras_require,
4040
setup_requires=['setuptools-scm'],
4141
install_requires=requires,
@@ -58,12 +58,13 @@
5858
'Programming Language :: Python',
5959
#'Programming Language :: Python :: 3.5', # setuptools-scm problem
6060
#'Programming Language :: Python :: 3.6', # not offered in github actions
61-
'Programming Language :: Python :: 3.7',
61+
# 'Programming Language :: Python :: 3.7',
6262
'Programming Language :: Python :: 3.8',
6363
'Programming Language :: Python :: 3.9',
6464
'Programming Language :: Python :: 3.10',
6565
'Programming Language :: Python :: 3.11',
6666
'Programming Language :: Python :: 3.12',
67+
'Programming Language :: Python :: 3.13',
6768
'Programming Language :: Python :: 3 :: Only',
6869
],
6970
)

0 commit comments

Comments
 (0)