Skip to content

Commit c5a4b62

Browse files
ada-ggf25lhoestq
andauthored
fix(fingerprint): treat TMPDIR as strict API and fail (Issue #7877) (#7891)
* fix(fingerprint): improve TMPDIR environment variable handling in _TempCacheDir Enhanced the _TempCacheDir.__init__ method to properly respect and handle the TMPDIR environment variable when creating temporary cache directories. Changes: - Add TMPDIR environment variable detection and validation - Normalise paths to handle path resolution issues - Auto-create TMPDIR directory if it doesn't exist to prevent silent fallback to default temporary directory - Validate that TMPDIR is actually a directory before use - Explicitly pass directory to mkdtemp to ensure TMPDIR is respected even if tempfile.gettempdir() was already called and cached - Add appropriate logging for directory creation and fallback scenarios This ensures that when TMPDIR is set, the temporary cache files are created in the specified directory rather than silently falling back to the system default temporary directory. * test(fingerprint): add comprehensive tests for TMPDIR handling in _TempCacheDir Add test coverage for the improved TMPDIR environment variable handling in the _TempCacheDir class. These tests verify the various scenarios for TMPDIR usage and error handling. Changes: - Refactor test_fingerprint_in_multiprocessing to use Pool.map for cleaner test implementation - Add test_temp_cache_dir_with_tmpdir_nonexistent to verify TMPDIR auto-creation when directory doesn't exist - Add test_temp_cache_dir_with_tmpdir_existing to verify correct behaviour when TMPDIR exists and is valid - Add test_temp_cache_dir_without_tmpdir to verify fallback to default temporary directory when TMPDIR is not set - Add test_temp_cache_dir_tmpdir_creation_failure to verify graceful error handling and fallback when TMPDIR creation fails These tests ensure that the TMPDIR improvements work correctly across all scenarios and edge cases, including proper logging and fallback behaviour. * test(fingerprint): tighten TMPDIR error-path tests for _TempCacheDir Refine TMPDIR-related failure tests for _TempCacheDir to assert explicit error conditions instead of fallback behaviour. Changes: - Update test_temp_cache_dir_tmpdir_creation_failure to use _TempCacheDir directly and assert that an OSError is raised with a clear TMPDIR context when directory creation fails - Introduce test_temp_cache_dir_tmpdir_not_directory to verify that pointing TMPDIR at a non-directory raises an OSError with an informative error message These tests better match the intended contract of _TempCacheDir by ensuring invalid TMPDIR configurations fail loudly with descriptive messages rather than silently falling back. * fix(fingerprint): make TMPDIR misconfiguration in _TempCacheDir fail loudly Tighten TMPDIR handling in _TempCacheDir so that invalid configurations raise clear errors instead of silently falling back to the default temporary directory. Changes: - When TMPDIR points to a non-existent directory, raise an OSError with explicit guidance to create it manually or unset TMPDIR - When TMPDIR points to a non-directory path, raise an OSError with guidance to point TMPDIR to a writable directory or unset it - Remove previous warning-and-fallback behaviour to avoid masking configuration issues This ensures that TMPDIR misconfigurations are surfaced early and clearly, aligning runtime behaviour with the stricter expectations codified in the new tests. * Update src/datasets/fingerprint.py * Update fingerprint.py * Fix formatting of TMPDIR retrieval line --------- Co-authored-by: Quentin Lhoest <[email protected]>
1 parent 0ec4d87 commit c5a4b62

File tree

2 files changed

+159
-6
lines changed

2 files changed

+159
-6
lines changed

src/datasets/fingerprint.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,33 @@ class _TempCacheDir:
4949
"""
5050

5151
def __init__(self):
52-
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
52+
# Check if TMPDIR is set and handle the case where it doesn't exist
53+
tmpdir = os.environ.get("TMPDIR") or os.environ.get("TEMP") or os.environ.get("TMP")
54+
# Normalize the path to handle any path resolution issues
55+
if tmpdir:
56+
tmpdir = os.path.normpath(tmpdir)
57+
if not os.path.exists(tmpdir):
58+
# Auto-create the directory if it doesn't exist
59+
# This prevents tempfile from silently falling back to /tmp
60+
try:
61+
os.makedirs(tmpdir, exist_ok=True)
62+
logger.info(f"Created TMPDIR directory: {tmpdir}")
63+
except OSError as e:
64+
raise OSError(
65+
f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. "
66+
"Please create it manually or unset TMPDIR to fall back to the default temporary directory."
67+
) from e
68+
# If tmpdir exists, verify it's actually a directory and writable
69+
elif not os.path.isdir(tmpdir):
70+
raise OSError(
71+
f"TMPDIR is set to '{tmpdir}' but it is not a directory. "
72+
"Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory."
73+
)
74+
75+
# Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected
76+
# This works even if tempfile.gettempdir() was already called and cached
77+
# Pass dir=None if tmpdir is None to use default temp directory
78+
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)
5379
self._finalizer = weakref.finalize(self, self._cleanup)
5480

5581
def _cleanup(self):

tests/test_fingerprint.py

Lines changed: 132 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -407,12 +407,139 @@ def test_fingerprint_in_multiprocessing():
407407
data = {"a": [0, 1, 2]}
408408
dataset = DatasetChild(InMemoryTable.from_pydict(data))
409409
expected_fingerprint = dataset.func1()._fingerprint
410-
assert expected_fingerprint == dataset.func1()._fingerprint
411-
assert expected_fingerprint != dataset.func2()._fingerprint
410+
with Pool(2) as pool:
411+
fingerprints = pool.map(
412+
lambda _: DatasetChild(InMemoryTable.from_pydict(data)).func1()._fingerprint, range(10)
413+
)
414+
assert all(f == expected_fingerprint for f in fingerprints)
412415

413-
with Pool(2) as p:
414-
assert expected_fingerprint == p.apply_async(dataset.func1).get()._fingerprint
415-
assert expected_fingerprint != p.apply_async(dataset.func2).get()._fingerprint
416+
417+
def test_temp_cache_dir_with_tmpdir_nonexistent(tmp_path, caplog):
418+
"""Test that _TempCacheDir creates TMPDIR if it doesn't exist."""
419+
import os
420+
421+
# Set TMPDIR to a non-existent directory
422+
tmpdir_path = tmp_path / "custom_tmpdir"
423+
assert not tmpdir_path.exists(), "TMPDIR should not exist initially"
424+
425+
# Save original TMPDIR and set new one
426+
original_tmpdir = os.environ.get("TMPDIR")
427+
try:
428+
os.environ["TMPDIR"] = str(tmpdir_path)
429+
430+
# Clear any existing temp cache dir to force recreation
431+
import datasets.fingerprint
432+
433+
datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None
434+
435+
# Import and test _TempCacheDir directly
436+
from datasets.fingerprint import _TempCacheDir
437+
438+
with caplog.at_level("INFO", logger="datasets.fingerprint"):
439+
temp_cache = _TempCacheDir()
440+
cache_dir = temp_cache.name
441+
442+
# The key test: verify the cache directory is within the TMPDIR we set
443+
# This proves that TMPDIR was respected and the directory was created
444+
tmpdir_path_str = str(tmpdir_path)
445+
assert cache_dir.startswith(tmpdir_path_str), (
446+
f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path_str}. TMPDIR env var: {os.environ.get('TMPDIR')}"
447+
)
448+
# Verify the directory was created
449+
assert tmpdir_path.exists(), (
450+
f"TMPDIR directory {tmpdir_path} should have been created. Cache dir is: {cache_dir}"
451+
)
452+
# Verify logging
453+
assert f"Created TMPDIR directory: {tmpdir_path_str}" in caplog.text
454+
455+
# Cleanup
456+
temp_cache.cleanup()
457+
finally:
458+
# Restore original TMPDIR
459+
if original_tmpdir is not None:
460+
os.environ["TMPDIR"] = original_tmpdir
461+
elif "TMPDIR" in os.environ:
462+
del os.environ["TMPDIR"]
463+
464+
465+
def test_temp_cache_dir_with_tmpdir_existing(tmp_path, monkeypatch):
466+
"""Test that _TempCacheDir works correctly when TMPDIR exists."""
467+
from datasets.fingerprint import get_temporary_cache_files_directory
468+
469+
# Set TMPDIR to an existing directory
470+
tmpdir_path = tmp_path / "existing_tmpdir"
471+
tmpdir_path.mkdir()
472+
monkeypatch.setenv("TMPDIR", str(tmpdir_path))
473+
474+
# Clear any existing temp cache dir
475+
import datasets.fingerprint
476+
477+
datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None
478+
479+
cache_dir = get_temporary_cache_files_directory()
480+
481+
# Verify the cache directory is within the TMPDIR
482+
assert cache_dir.startswith(str(tmpdir_path)), f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path}"
483+
484+
485+
def test_temp_cache_dir_without_tmpdir(monkeypatch):
486+
"""Test that _TempCacheDir works correctly when TMPDIR is not set."""
487+
from datasets.fingerprint import get_temporary_cache_files_directory
488+
489+
# Remove TMPDIR if it exists
490+
monkeypatch.delenv("TMPDIR", raising=False)
491+
492+
# Clear any existing temp cache dir
493+
import datasets.fingerprint
494+
495+
datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None
496+
497+
cache_dir = get_temporary_cache_files_directory()
498+
499+
# Verify it uses the default temp directory
500+
from tempfile import gettempdir
501+
502+
default_temp = gettempdir()
503+
assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should be in default temp {default_temp}"
504+
505+
506+
def test_temp_cache_dir_tmpdir_creation_failure(tmp_path, monkeypatch, caplog):
507+
"""Test that _TempCacheDir raises if TMPDIR cannot be created."""
508+
from unittest.mock import patch
509+
510+
from datasets.fingerprint import _TempCacheDir
511+
512+
# Set TMPDIR to a path that will fail to create (e.g., invalid permissions)
513+
# Use a path that's likely to fail on creation
514+
tmpdir_path = tmp_path / "nonexistent" / "nested" / "path"
515+
monkeypatch.setenv("TMPDIR", str(tmpdir_path))
516+
517+
# Mock os.makedirs to raise an error
518+
with patch("datasets.fingerprint.os.makedirs", side_effect=OSError("Permission denied")):
519+
with pytest.raises(OSError) as excinfo:
520+
_TempCacheDir()
521+
522+
# Verify the error message gives clear context about TMPDIR
523+
msg = str(excinfo.value)
524+
assert "TMPDIR is set to" in msg
525+
assert "could not be created" in msg
526+
527+
528+
def test_temp_cache_dir_tmpdir_not_directory(tmp_path, monkeypatch):
529+
"""Test that _TempCacheDir raises if TMPDIR points to a non-directory."""
530+
from datasets.fingerprint import _TempCacheDir
531+
532+
# Create a regular file and point TMPDIR to it
533+
file_path = tmp_path / "not_a_dir"
534+
file_path.write_text("not a directory")
535+
monkeypatch.setenv("TMPDIR", str(file_path))
536+
537+
with pytest.raises(OSError) as excinfo:
538+
_TempCacheDir()
539+
540+
msg = str(excinfo.value)
541+
assert "TMPDIR is set to" in msg
542+
assert "is not a directory" in msg
416543

417544

418545
def test_fingerprint_when_transform_version_changes():

0 commit comments

Comments
 (0)