Skip to content

Commit 8b1bd4e

Browse files
authored
Python 3.14 (#7836)
* add 3.14 * update ci * go home tf * torchcodec * numba * fix ci * no lz4 in python 3.14 * fix tests * again * again * again
1 parent 9e5b0e6 commit 8b1bd4e

File tree

10 files changed

+83
-44
lines changed

10 files changed

+83
-44
lines changed

.github/conda/meta.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ requirements:
2525
- dataclasses
2626
- multiprocess
2727
- fsspec
28-
- huggingface_hub >=0.24.0,<1.0.0
28+
- huggingface_hub >=0.25.0,<2.0.0
2929
- packaging
3030
run:
3131
- python
@@ -41,7 +41,7 @@ requirements:
4141
- dataclasses
4242
- multiprocess
4343
- fsspec
44-
- huggingface_hub >=0.24.0,<1.0.0
44+
- huggingface_hub >=0.25.0,<2.0.0
4545
- packaging
4646

4747
test:

.github/workflows/ci.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ jobs:
8282
run: |
8383
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
8484
85-
test_py312:
85+
test_py314:
8686
needs: check_code_quality
8787
strategy:
8888
matrix:
@@ -100,18 +100,18 @@ jobs:
100100
run: |
101101
sudo apt update
102102
sudo apt install -y ffmpeg
103-
- name: Set up Python 3.12
103+
- name: Set up Python 3.14
104104
uses: actions/setup-python@v5
105105
with:
106-
python-version: "3.12"
106+
python-version: "3.14"
107107
- name: Setup conda env (windows)
108108
if: ${{ matrix.os == 'windows-latest' }}
109109
uses: conda-incubator/setup-miniconda@v2
110110
with:
111111
auto-update-conda: true
112112
miniconda-version: "latest"
113113
activate-environment: test
114-
python-version: "3.12"
114+
python-version: "3.14"
115115
- name: Setup FFmpeg (windows)
116116
if: ${{ matrix.os == 'windows-latest' }}
117117
run: conda install "ffmpeg=7.0.1" -c conda-forge
@@ -127,7 +127,7 @@ jobs:
127127
run: |
128128
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
129129
130-
test_py312_future:
130+
test_py314_future:
131131
needs: check_code_quality
132132
strategy:
133133
matrix:
@@ -145,18 +145,18 @@ jobs:
145145
run: |
146146
sudo apt update
147147
sudo apt install -y ffmpeg
148-
- name: Set up Python 3.12
148+
- name: Set up Python 3.14
149149
uses: actions/setup-python@v5
150150
with:
151-
python-version: "3.12"
151+
python-version: "3.14"
152152
- name: Setup conda env (windows)
153153
if: ${{ matrix.os == 'windows-latest' }}
154154
uses: conda-incubator/setup-miniconda@v2
155155
with:
156156
auto-update-conda: true
157157
miniconda-version: "latest"
158158
activate-environment: test
159-
python-version: "3.12"
159+
python-version: "3.14"
160160
- name: Setup FFmpeg (windows)
161161
if: ${{ matrix.os == 'windows-latest' }}
162162
run: conda install "ffmpeg=7.0.1" -c conda-forge

setup.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@
124124
# for fast hashing
125125
"xxhash",
126126
# for better multiprocessing
127-
"multiprocess<0.70.17", # to align with dill<0.3.9 (see above)
127+
"multiprocess<0.70.19", # to align with dill<0.3.9 (see above)
128128
# to save datasets locally or on any filesystem
129129
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
130130
"fsspec[http]>=2023.1.0,<=2025.9.0",
@@ -153,12 +153,12 @@
153153

154154
TESTS_REQUIRE = [
155155
# fix pip install issues for windows
156-
"numba>=0.56.4", # to get recent versions of llvmlite for windows ci
156+
"numba>=0.56.4; python_version < '3.14'", # to get recent versions of llvmlite for windows ci, not available on 3.14
157157
# test dependencies
158158
"absl-py",
159159
"decorator",
160160
"joblib<1.3.0", # joblibspark doesn't support recent joblib versions
161-
"joblibspark",
161+
"joblibspark; python_version < '3.14'", # python 3.14 gives AttributeError: module 'ast' has no attribute 'Num'
162162
"pytest",
163163
"pytest-datadir",
164164
"pytest-xdist",
@@ -169,23 +169,23 @@
169169
"h5py",
170170
"jax>=0.3.14; sys_platform != 'win32'",
171171
"jaxlib>=0.3.14; sys_platform != 'win32'",
172-
"lz4",
172+
"lz4; python_version < '3.14'", # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame'
173173
"moto[server]",
174174
"pyspark>=3.4", # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0
175175
"py7zr",
176176
"rarfile>=4.0",
177177
"sqlalchemy",
178178
"protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12
179179
"tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'", # numpy-2 is not supported for Python < 3.10
180-
"tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'", # Pins numpy < 2
180+
"tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32' and python_version < '3.14'", # Pins numpy < 2
181181
"tiktoken",
182182
"torch>=2.8.0",
183183
"torchdata",
184184
"transformers>=4.42.0", # Pins numpy < 2
185185
"zstandard",
186186
"polars[timezone]>=0.20.0",
187187
"Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced
188-
"torchcodec>=0.7.0", # minium version to get windows support
188+
"torchcodec>=0.7.0; python_version < '3.14'", # minimum version to get windows support, torchcodec doesn't have wheels for 3.14 yet
189189
"nibabel>=5.3.1",
190190
]
191191

@@ -262,6 +262,9 @@
262262
"Programming Language :: Python :: 3.9",
263263
"Programming Language :: Python :: 3.10",
264264
"Programming Language :: Python :: 3.11",
265+
"Programming Language :: Python :: 3.12",
266+
"Programming Language :: Python :: 3.13",
267+
"Programming Language :: Python :: 3.14",
265268
"Topic :: Scientific/Engineering :: Artificial Intelligence",
266269
],
267270
keywords="datasets machine learning datasets",

tests/features/test_audio.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,7 @@ def test_dataset_with_audio_feature_loaded_from_cache():
713713
assert isinstance(ds, Dataset)
714714

715715

716+
@require_torchcodec
716717
def test_dataset_with_audio_feature_undecoded(shared_datadir):
717718
audio_path = str(shared_datadir / "test_audio_44100.wav")
718719
data = {"audio": [audio_path]}
@@ -730,6 +731,7 @@ def test_dataset_with_audio_feature_undecoded(shared_datadir):
730731
assert column[0] == {"path": audio_path, "bytes": None}
731732

732733

734+
@require_torchcodec
733735
def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
734736
audio_path = str(shared_datadir / "test_audio_44100.wav")
735737
data = {"audio": [audio_path]}
@@ -761,6 +763,7 @@ def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
761763
assert column[0] == {"path": audio_path, "bytes": None}
762764

763765

766+
@require_torchcodec
764767
def test_dataset_with_audio_feature_map_undecoded(shared_datadir):
765768
audio_path = str(shared_datadir / "test_audio_44100.wav")
766769
data = {"audio": [audio_path]}

tests/test_extract.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import zipfile
32

43
import pytest
54

@@ -199,5 +198,5 @@ def test_is_zipfile_false_positive(tmpdir):
199198
)
200199
with not_a_zip_file.open("wb") as f:
201200
f.write(data)
202-
assert zipfile.is_zipfile(str(not_a_zip_file)) # is a false positive for `zipfile`
201+
# zipfile.is_zipfile(str(not_a_zip_file)) could be a false positive for `zipfile`
203202
assert not ZipExtractor.is_extractable(not_a_zip_file) # but we're right

tests/test_fingerprint.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
require_spacy,
2727
require_tiktoken,
2828
require_torch,
29+
require_torch_compile,
2930
require_transformers,
3031
)
3132

@@ -347,7 +348,7 @@ def test_hash_spacy_model(self):
347348
self.assertNotEqual(hash1, hash2)
348349

349350
@require_not_windows
350-
@require_torch
351+
@require_torch_compile
351352
def test_hash_torch_compiled_function(self):
352353
import torch
353354

@@ -360,7 +361,7 @@ def f(x):
360361
self.assertEqual(hash1, hash2)
361362

362363
@require_not_windows
363-
@require_torch
364+
@require_torch_compile
364365
def test_hash_torch_compiled_module(self):
365366
m = TorchModule()
366367
next(iter(m.parameters())).data.fill_(1.0)

tests/test_iterable_dataset.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,18 +1553,21 @@ def test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_pa
15531553
assert len(result) == 10
15541554

15551555

1556+
def gen_with_worker_info(shard):
1557+
from torch.utils.data import get_worker_info
1558+
1559+
worker_info = get_worker_info()
1560+
for i in range(100):
1561+
yield {"value": i, "worker_id": worker_info.id}
1562+
1563+
15561564
@require_torch
15571565
def test_iterable_dataset_shuffle_with_multiple_workers_different_rng():
15581566
# GH 7567
1559-
from torch.utils.data import DataLoader, get_worker_info
1560-
1561-
def gen(shard):
1562-
worker_info = get_worker_info()
1563-
for i in range(100):
1564-
yield {"value": i, "worker_id": worker_info.id}
1567+
from torch.utils.data import DataLoader
15651568

15661569
num_workers = 20
1567-
ds = IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers))})
1570+
ds = IterableDataset.from_generator(gen_with_worker_info, gen_kwargs={"shard": list(range(num_workers))})
15681571
ds = ds.shuffle(buffer_size=100, seed=1234)
15691572
dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers)
15701573

@@ -1575,18 +1578,19 @@ def gen(shard):
15751578
assert len(set(values)) != 1, "Make sure not all values are identical"
15761579

15771580

1581+
def gen_with_value(shard, value):
1582+
for i in range(100):
1583+
yield {"value": value}
1584+
1585+
15781586
@require_torch
15791587
def test_iterable_dataset_interleave_dataset_with_multiple_workers():
15801588
# GH 7567
15811589
from torch.utils.data import DataLoader
15821590

1583-
def gen(shard, value):
1584-
for i in range(100):
1585-
yield {"value": value}
1586-
15871591
num_workers = 20
15881592
ds = [
1589-
IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i})
1593+
IterableDataset.from_generator(gen_with_value, gen_kwargs={"shard": list(range(num_workers)), "value": i})
15901594
for i in range(10)
15911595
]
15921596
ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)
@@ -1598,18 +1602,19 @@ def gen(shard, value):
15981602
assert len(set(values)) != 1, "Make sure not all values are identical"
15991603

16001604

1605+
def gen_with_id(shard, value):
1606+
for i in range(50):
1607+
yield {"value": value, "id": i}
1608+
1609+
16011610
@require_torch
16021611
def test_iterable_dataset_interleave_dataset_deterministic_across_iterations():
16031612
# GH 7567
16041613
from torch.utils.data import DataLoader
16051614

1606-
def gen(shard, value):
1607-
for i in range(50):
1608-
yield {"value": value, "id": i}
1609-
16101615
num_workers = 10
16111616
ds = [
1612-
IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i})
1617+
IterableDataset.from_generator(gen_with_id, gen_kwargs={"shard": list(range(num_workers)), "value": i})
16131618
for i in range(5)
16141619
]
16151620
ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)

tests/test_py_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import pickle
23
import time
34
from dataclasses import dataclass
45
from multiprocessing import Pool
@@ -81,7 +82,7 @@ def test_map_nested(self):
8182
{k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True, num_proc=num_proc).items()},
8283
{k: v.tolist() for k, v in expected_map_nested_sn1_int.items()},
8384
)
84-
with self.assertRaises(AttributeError): # can't pickle a local lambda
85+
with self.assertRaises((AttributeError, pickle.PicklingError)): # can't pickle a local lambda
8586
map_nested(lambda x: x + 1, sn1, num_proc=num_proc)
8687

8788
def test_zip_dict(self):

tests/test_streaming_download_manager.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import os
3+
from pathlib import Path
34

45
import pytest
56

@@ -26,10 +27,16 @@
2627
Bulbasaur, grass"""
2728

2829

29-
@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"])
30-
def test_streaming_dl_manager_download_dummy_path(urlpath):
30+
def test_streaming_dl_manager_download_dummy_path():
31+
path = str(Path(__file__).resolve())
3132
dl_manager = StreamingDownloadManager()
32-
assert dl_manager.download(urlpath) == urlpath
33+
assert dl_manager.download(path) == path
34+
35+
36+
def test_streaming_dl_manager_download_dummy_url():
37+
url = "https://f.oo/bar.txt"
38+
dl_manager = StreamingDownloadManager()
39+
assert dl_manager.download(url) == url
3340

3441

3542
@pytest.mark.parametrize(
@@ -54,10 +61,16 @@ def test_streaming_dl_manager_download(text_path):
5461
assert f.read() == expected_file.read()
5562

5663

57-
@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"])
58-
def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
64+
def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path():
65+
path = str(Path(__file__).resolve())
66+
dl_manager = StreamingDownloadManager()
67+
assert dl_manager.download_and_extract(path) == path
68+
69+
70+
def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_url():
71+
url = "https://f.oo/bar.txt"
5972
dl_manager = StreamingDownloadManager()
60-
assert dl_manager.download_and_extract(urlpath) == urlpath
73+
assert dl_manager.download_and_extract(url) == url
6174

6275

6376
def test_streaming_dl_manager_extract(text_gz_path, text_path):

tests/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,20 @@ def require_torch(test_case):
125125
return test_case
126126

127127

128+
def require_torch_compile(test_case):
129+
"""
130+
Decorator marking a test that requires PyTorch with a working torch.compile (skipped on Python >= 3.14).
131+
132+
These tests are skipped when PyTorch isn't installed.
133+
134+
"""
135+
if not config.TORCH_AVAILABLE:
136+
test_case = unittest.skip("test requires PyTorch")(test_case)
137+
if config.PY_VERSION >= version.parse("3.14"):
138+
test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case)
139+
return test_case
140+
141+
128142
def require_polars(test_case):
129143
"""
130144
Decorator marking a test that requires Polars.

0 commit comments

Comments
 (0)