diff --git a/docs/integrity-verification.md b/docs/integrity-verification.md new file mode 100644 index 00000000..ecfc3695 --- /dev/null +++ b/docs/integrity-verification.md @@ -0,0 +1,125 @@ +# File Integrity Verification + +KToolBox now includes robust file integrity verification features to prevent corrupted downloads and provide better reliability. + +## Overview + +The integrity verification system addresses common issues with incomplete downloads, including: +- Files that appear to download successfully but are actually corrupted (e.g., half-black images) +- `PermissionError(13)` issues during file operations +- Downloads that complete without proper size verification + +## Features + +### 1. File Size Verification + +**Default: Enabled** + +KToolBox automatically verifies that downloaded files match the expected size from HTTP headers: + +- Checks `Content-Range` header first (for resumable downloads) +- Falls back to `Content-Length` header if `Content-Range` is not available +- Fails the download and cleans up temporary files if sizes don't match + +```bash +# Enable/disable via environment variable +KTOOLBOX_DOWNLOADER__VERIFY_FILE_INTEGRITY=true # Default: true +``` + +### 2. SHA-256 Checksum Verification + +**Default: Disabled** + +Optional checksum calculation provides additional integrity verification: + +- Calculates SHA-256 hash during download +- Logs the checksum for each completed file +- Enables detection of data corruption beyond size mismatches + +```bash +# Enable checksum verification +KTOOLBOX_DOWNLOADER__CHECKSUM_VERIFICATION=true # Default: false +``` + +### 3. Improved Error Handling + +The downloader now includes: + +- **PermissionError Retry**: Automatic retry up to 3 times with exponential backoff when file rename operations fail +- **Atomic Operations**: Better cleanup of temporary files when operations fail +- **Enhanced Logging**: More detailed error messages with context + +## Configuration + +### Via Environment Variables + +```bash +# File integrity verification (recommended: keep enabled) +KTOOLBOX_DOWNLOADER__VERIFY_FILE_INTEGRITY=true + +# Checksum verification (optional, adds logging overhead) +KTOOLBOX_DOWNLOADER__CHECKSUM_VERIFICATION=false +``` + +### Via Configuration File + +```python +# In your configuration +downloader: + verify_file_integrity: true # Enable size verification + checksum_verification: false # Enable SHA-256 checksums +``` + +## When to Use + +### File Integrity Verification +- **Always recommended** - should remain enabled (default) +- Prevents corrupted files from being saved +- Minimal performance impact +- Essential for reliability + +### Checksum Verification +- **Optional** - enable when you need extra verification +- Useful for archival purposes or when data integrity is critical +- Adds computational overhead during downloads +- Provides SHA-256 hash logging for each file + +## Troubleshooting + +### Common Issues + +**Issue**: Downloads fail with "File integrity check failed: size mismatch" +- **Cause**: Server provided incorrect `Content-Length` or download was interrupted +- **Solution**: This is working correctly - the system prevented a corrupted file from being saved + +**Issue**: PermissionError during downloads +- **Cause**: File is being accessed by another process (antivirus, file explorer, etc.) +- **Solution**: The system now automatically retries with exponential backoff + +**Issue**: Performance impact with checksum verification +- **Cause**: SHA-256 calculation adds CPU overhead +- **Solution**: Disable checksum verification if not needed: `KTOOLBOX_DOWNLOADER__CHECKSUM_VERIFICATION=false` + +### Logs + +With integrity verification enabled, you'll see logs like: + +``` +INFO - Download completed with checksum - file: image.jpg, sha256: a7b86d4635b0a3e10df870a604a1a811c01a7943c34e5bfb3759d9827473507b +``` + +With size mismatch detection: + +``` +ERROR - File integrity check failed: size mismatch - expected_size: 1024, actual_size: 512, filename: corrupted.jpg +``` + +## Migration + +These features are designed to be backward compatible: + +- **Existing downloads**: No configuration changes required +- **Default behavior**: File integrity verification is automatically enabled +- **Optional features**: Checksum verification remains disabled by default + +The integrity verification system helps ensure reliable downloads while maintaining compatibility with existing configurations and workflows. \ No newline at end of file diff --git a/ktoolbox/configuration.py b/ktoolbox/configuration.py index 2b06f97e..c0ee9643 100644 --- a/ktoolbox/configuration.py +++ b/ktoolbox/configuration.py @@ -68,6 +68,8 @@ class DownloaderConfiguration(BaseModel): For example: ``https://example.com/{}`` will be ``https://example.com/https://n1.kemono.su/data/66/83/xxxxx.jpg``; \ ``https://example.com/?url={}`` will be ``https://example.com/?url=https://n1.kemono.su/data/66/83/xxxxx.jpg`` :ivar keep_metadata: Keep the file metadata when downloading files (e.g. last modified time, etc.) + :ivar verify_file_integrity: Verify downloaded file size matches expected Content-Length + :ivar checksum_verification: Enable SHA-256 checksum verification for downloaded files """ scheme: Literal["http", "https"] = "https" timeout: float = 30.0 @@ -83,6 +85,8 @@ class DownloaderConfiguration(BaseModel): bucket_path: Path = Path("./.ktoolbox/bucket_storage") reverse_proxy: str = "{}" keep_metadata: bool = True + verify_file_integrity: bool = True + checksum_verification: bool = False @model_validator(mode="after") def check_bucket_path(self) -> "DownloaderConfiguration": diff --git a/ktoolbox/downloader/downloader.py b/ktoolbox/downloader/downloader.py index a801a036..7c19899c 100644 --- a/ktoolbox/downloader/downloader.py +++ b/ktoolbox/downloader/downloader.py @@ -1,4 +1,5 @@ import asyncio +import hashlib import os from asyncio import CancelledError, Lock from functools import cached_property @@ -245,7 +246,7 @@ async def run( filename=save_filepath ) ) - elif res.status_code != httpx.codes.PARTIAL_CONTENT: + elif res.status_code not in (httpx.codes.PARTIAL_CONTENT, httpx.codes.OK): self._url = self._initial_url return DownloaderRet( code=RetCodeEnum.GeneralFailure, @@ -278,6 +279,13 @@ async def run( # Download total_size = int(range_str.split("/")[-1]) if (range_str := res.headers.get("Content-Range")) else None + # Also check Content-Length if Content-Range is not available + if total_size is None: + total_size = int(res.headers.get("Content-Length", 0)) or None + + downloaded_size = temp_size + sha256_hash = hashlib.sha256() if config.downloader.checksum_verification else None + async with aiofiles.open(str(temp_filepath), "ab", self._buffer_size) as f: chunk_iterator = res.aiter_bytes(self._chunk_size) t = tqdm_class( @@ -292,14 +300,96 @@ async def run( if self._stop: raise CancelledError await f.write(chunk) + downloaded_size += len(chunk) + if sha256_hash: + sha256_hash.update(chunk) t.update(len(chunk)) # Update progress bar - # Download finished + # Verify file integrity if enabled + if config.downloader.verify_file_integrity and total_size is not None: + actual_size = temp_filepath.stat().st_size + if actual_size != total_size: + # Remove incomplete file + if temp_filepath.exists(): + temp_filepath.unlink() + return DownloaderRet( + code=RetCodeEnum.GeneralFailure, + message=generate_msg( + "File integrity check failed: size mismatch", + expected_size=total_size, + actual_size=actual_size, + filename=save_filepath + ) + ) + + # Download finished - ensure atomic file operations + final_filepath = self._path / self._save_filename + + # Handle bucket storage if enabled if config.downloader.use_bucket: bucket_file_path.parent.mkdir(parents=True, exist_ok=True) - os.link(temp_filepath, bucket_file_path) - final_filepath = self._path / self._save_filename - temp_filepath.rename(final_filepath) + try: + os.link(temp_filepath, bucket_file_path) + except OSError as e: + logger.warning( + generate_msg( + "Failed to create bucket link", + file=self._save_filename, + exception=e + ) + ) + + # Atomic rename with retry for PermissionError + max_rename_attempts = 3 + for attempt in range(max_rename_attempts): + try: + temp_filepath.rename(final_filepath) + break + except PermissionError as e: + if attempt < max_rename_attempts - 1: + logger.warning( + generate_msg( + f"PermissionError during rename, retrying ({attempt + 1}/{max_rename_attempts})", + file=self._save_filename, + exception=e + ) + ) + await asyncio.sleep(0.1 * (attempt + 1)) # Exponential backoff + else: + # Final attempt failed - remove temp file and return error + if temp_filepath.exists(): + temp_filepath.unlink() + return DownloaderRet( + code=RetCodeEnum.GeneralFailure, + message=generate_msg( + "Failed to rename file after multiple attempts", + file=self._save_filename, + exception=e + ) + ) + except OSError as e: + # Other OS errors - remove temp file and return error + if temp_filepath.exists(): + temp_filepath.unlink() + return DownloaderRet( + code=RetCodeEnum.GeneralFailure, + message=generate_msg( + "Failed to rename file", + file=self._save_filename, + exception=e + ) + ) + + # Log checksum information if verification was enabled + if config.downloader.checksum_verification and sha256_hash: + checksum = sha256_hash.hexdigest() + logger.info( + generate_msg( + "Download completed with checksum", + file=self._save_filename, + sha256=checksum + ) + ) # Set file time from headers if config.downloader.keep_metadata: diff --git a/tests/ktoolbox/test_downloader_integrity.py b/tests/ktoolbox/test_downloader_integrity.py new file mode 100644 index 00000000..ef3e166a --- /dev/null +++ b/tests/ktoolbox/test_downloader_integrity.py @@ -0,0 +1,79 @@ +import pytest +from ktoolbox.configuration import config + + +class TestDownloaderConfiguration: + """Test new downloader configuration options""" + + def test_integrity_check_config_default(self): + """Test that verify_file_integrity is enabled by default""" + assert config.downloader.verify_file_integrity is True + + def test_checksum_verification_config_default(self): + """Test that checksum_verification is disabled by default""" + assert config.downloader.checksum_verification is False + + def test_config_can_be_modified(self): + """Test that configuration can be modified""" + # Save original values + original_verify = config.downloader.verify_file_integrity + original_checksum = config.downloader.checksum_verification + + try: + # Modify values + config.downloader.verify_file_integrity = False + config.downloader.checksum_verification = True + + # Check they were modified + assert config.downloader.verify_file_integrity is False + assert config.downloader.checksum_verification is True + + finally: + # Restore original values + config.downloader.verify_file_integrity = original_verify + config.downloader.checksum_verification = original_checksum + + +class TestDownloaderLogic: + """Test downloader logic improvements""" + + def test_content_length_fallback_logic(self): + """Test that Content-Length is used when Content-Range is not available""" + # This tests the logic we added to check Content-Length as fallback + + # Simulate the logic from downloader.py + range_str = None # No Content-Range header + headers = {"Content-Length": "12345"} + + # Our updated logic + total_size = int(range_str.split("/")[-1]) if range_str else None + if total_size is None: + total_size = int(headers.get("Content-Length", 0)) or None + + assert total_size == 12345 + + def test_content_range_takes_priority(self): + """Test that Content-Range takes priority over Content-Length""" + # This tests that existing logic still works correctly + + range_str = "bytes 1000-12345/12346" + headers = {"Content-Length": "11346", "Content-Range": range_str} + + # Our updated logic + total_size = int(range_str.split("/")[-1]) if range_str else None + if total_size is None: + total_size = int(headers.get("Content-Length", 0)) or None + + assert total_size == 12346 # Should use Content-Range, not Content-Length + + def test_no_size_headers(self): + """Test behavior when neither Content-Range nor Content-Length are available""" + range_str = None + headers = {} + + # Our updated logic + total_size = int(range_str.split("/")[-1]) if range_str else None + if total_size is None: + total_size = int(headers.get("Content-Length", 0)) or None + + assert total_size is None \ No newline at end of file