
Commit 155db05

more tests

1 parent d8a627f

File tree: 4 files changed, +225 −88 lines changed


cdx_toolkit/filter_cdx/__init__.py

Lines changed: 60 additions & 53 deletions
@@ -4,11 +4,12 @@
 import sys
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from functools import partial
+from typing import List, Tuple
 
 import fsspec
 from surt import surt
 
-from cdx_toolkit.filter_cdx.matcher import TupleMatcher, TrieMatcher
+from cdx_toolkit.filter_cdx.matcher import Matcher, TupleMatcher, TrieMatcher
 
 
 logger = logging.getLogger(__name__)
@@ -62,71 +63,77 @@ def run_filter_cdx(args, cmdline: str):
         'trie': TrieMatcher,
         'tuple': TupleMatcher,
     }
+    limit = 0 if args.limit is None else args.limit
+    logger.info(f'Loaded {len(include_surt_prefixes):,} filter entries using {args.matching_approach} approach')
 
-    matcher = matcher_classes[args.matching_approach](include_surt_prefixes)
+    # Process files in parallel
+    total_lines_n, total_included_n, total_errors_n = filter_cdx(
+        matcher=matcher_classes[args.matching_approach](include_surt_prefixes),
+        input_paths=input_paths,
+        output_paths=output_paths,
+        limit=limit,
+        n_parallel=max(1, args.parallel),
+    )
 
-    logger.info(f'Loaded {len(include_surt_prefixes):,} filter entries using {args.matching_approach} approach')
+    logger.info(
+        f'Filter statistics: {total_included_n} / {total_lines_n} lines ({total_included_n / total_lines_n:.4f})'
+    )
+    logger.info(
+        f'Errors: {total_errors_n}'
+    )
 
-    # Process files in parallel or sequentially
-    n_parallel = args.parallel
-    limit = 0 if args.limit is None else args.limit
-    total_lines_n = 0
-    total_included_n = 0
-    total_errors_n = 0
-
-    if n_parallel > 1:
-        # Parallel processing
-        logger.info('Parallel processes: %i', n_parallel)
-        with ProcessPoolExecutor(max_workers=n_parallel) as executor:
-            # Create partial function with common arguments
-            process_file_partial = partial(_process_single_file, matcher=matcher, limit=limit)
-
-            # Submit all jobs
-            future_to_paths = {
-                executor.submit(process_file_partial, input_path, output_path): (input_path, output_path)
-                for input_path, output_path in zip(input_paths, output_paths)
-            }
-
-            # Collect results
-            for future in as_completed(future_to_paths):
-                input_path, output_path = future_to_paths[future]
-                try:
-                    lines_n, included_n = future.result()
-                    logger.info(
-                        f'File statistics for {input_path}: included_n={included_n}; lines_n={lines_n}; ratio={included_n / lines_n:.4f}'
-                    )
-                    total_lines_n += lines_n
-                    total_included_n += included_n
-
-                except Exception as exc:
-                    logger.error(f'File {input_path} generated an exception: {exc}')
-                    total_errors_n += 1
-    else:
-        # Sequential processing
-        logger.info('Sequential processing')
-        for input_path, output_path in zip(input_paths, output_paths):
+    if limit > 0 and total_included_n >= limit:
+        logger.info(f'Limit reached at {limit}')
+
+    # End timing and log execution time
+    end_time = time.time()
+    execution_time = end_time - start_time
+
+    logger.info(f'Script execution time: {execution_time:.3f} seconds')
+
+
+def filter_cdx(
+    matcher: Matcher,
+    input_paths: List[str],
+    output_paths: List[str],
+    n_parallel: int = 1,
+    limit: int = 0,
+    total_lines_n: int = 0,
+    total_included_n: int = 0,
+    total_errors_n: int = 0,
+) -> Tuple[int, int, int]:
+    """Filter CDX files from input paths using a matcher to output paths."""
+
+    # Parallel processing
+    logger.info('Filtering with %i processes in parallel (limit: %i)', n_parallel, limit)
+
+    with ProcessPoolExecutor(max_workers=n_parallel) as executor:
+        # Create partial function with common arguments
+        process_file_partial = partial(_process_single_file, matcher=matcher, limit=limit)
+
+        # Submit all jobs
+        future_to_paths = {
+            executor.submit(process_file_partial, input_path, output_path): (input_path, output_path)
+            for input_path, output_path in zip(input_paths, output_paths)
+        }
+
+        # Collect results
+        for future in as_completed(future_to_paths):
+            input_path, output_path = future_to_paths[future]
             try:
-                lines_n, included_n = _process_single_file(input_path, output_path, matcher, limit)
+                lines_n, included_n = future.result()
                 logger.info(
-                    f'File statistics for {input_path}: included_n={included_n}; lines_n={lines_n}; ratio={included_n / lines_n:.4f}'
+                    f'File statistics: included {included_n} / {lines_n} lines: {input_path}'
                 )
+
                 total_lines_n += lines_n
                 total_included_n += included_n
 
             except Exception as exc:
                 logger.error(f'File {input_path} generated an exception: {exc}')
                 total_errors_n += 1
-    logger.info(
-        f'Total statistics: included_n={total_included_n}; lines_n={total_lines_n}; ratio={total_included_n / total_lines_n:.4f}'
-    )
-    if total_errors_n > 0:
-        logger.error('Processing errors: %i', total_errors_n)
-
-    # End timing and log execution time
-    end_time = time.time()
-    execution_time = end_time - start_time
 
-    logger.info(f'Script execution time: {execution_time:.3f} seconds')
+    return total_lines_n, total_included_n, total_errors_n
 
 
 def resolve_paths(input_base_path: str, input_glob: str, output_base_path: str):
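
With this refactor, the extracted filter_cdx function can be driven directly from Python rather than only through the CLI. A minimal sketch under stated assumptions: the input and output paths below are hypothetical, and TupleMatcher is constructed with a prefixes list as in the tests further down this page.

    from cdx_toolkit.filter_cdx import filter_cdx
    from cdx_toolkit.filter_cdx.matcher import TupleMatcher

    # Hypothetical paths; filter_cdx maps input to output files pairwise.
    input_paths = ['cdx-00187.gz', 'cdx-00188.gz']
    output_paths = ['out/cdx-00187.gz', 'out/cdx-00188.gz']

    total_lines_n, total_included_n, total_errors_n = filter_cdx(
        matcher=TupleMatcher(prefixes=['fr,']),  # keep .fr hosts (SURT prefix)
        input_paths=input_paths,
        output_paths=output_paths,
        n_parallel=2,  # ProcessPoolExecutor worker count
        limit=0,       # 0 disables the per-file limit
    )
    print(f'included {total_included_n} / {total_lines_n} lines, {total_errors_n} errors')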

cdx_toolkit/warcer_by_cdx/aioboto3_warcer.py

Lines changed: 23 additions & 27 deletions
@@ -177,33 +177,29 @@ async def get_range_jobs_from_index_paths(
     logger.info('Range index limit: %i', limit)
     count = 0
 
-    if not index_paths:
-        logger.error('No index paths provided!')
-
-    else:
-        # Iterate over index files
-        for index_path in index_paths:
-            # Fetch range queries from index
-            try:
-                for warc_url, offset, length in iter_cdx_index_from_path(
-                    index_path, warc_download_prefix=warc_download_prefix
-                ):
-                    # Convert the CDX record back to a RangeJob
-                    bucket, key = parse_s3_uri(warc_url)
-                    job = RangeJob(bucket=bucket, key=key, offset=offset, length=length)
-                    await key_queue.put(job)
-                    count += 1
-
-                    if limit > 0 and count >= limit:
-                        logger.warning('Index limit reached at %i', count)
-                        break
-
-            except Exception as e:
-                logger.error('Failed to read CDX index from %s: %s', index_path, e)
-
-            if limit > 0 and count >= limit:
-                logger.warning('Limit reached at %i', count)
-                break
+    # Iterate over index files
+    for index_path in index_paths:
+        # Fetch range queries from index
+        try:
+            for warc_url, offset, length in iter_cdx_index_from_path(
+                index_path, warc_download_prefix=warc_download_prefix
+            ):
+                # Convert the CDX record back to a RangeJob
+                bucket, key = parse_s3_uri(warc_url)
+                job = RangeJob(bucket=bucket, key=key, offset=offset, length=length)
+                await key_queue.put(job)
+                count += 1
+
+                if limit > 0 and count >= limit:
+                    logger.warning('Index limit reached at %i', count)
+                    break
+
+        except Exception as e:
+            logger.error('Failed to read CDX index from %s: %s', index_path, e)
+
+        if limit > 0 and count >= limit:
+            logger.warning('Limit reached at %i', count)
+            break
 
     # signal fetchers to stop
     for _ in range(num_fetchers):
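
This refactor drops the special-case branch for empty index_paths: iterating an empty list is a no-op, and the stop signals are still sent afterwards, so fetchers terminate cleanly either way. For context, a minimal sketch of a matching fetcher loop, assuming a None sentinel is what the producer enqueues as the stop signal (the actual sentinel value is not shown in this diff):

    import asyncio

    async def fetcher(key_queue: asyncio.Queue) -> None:
        # Drain RangeJobs until the producer's stop signal arrives.
        while True:
            job = await key_queue.get()
            if job is None:  # assumed sentinel; the real value may differ
                break
            # ... fetch bytes job.offset .. job.offset + job.length
            # from s3://{job.bucket}/{job.key} and write a WARC record ...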
Lines changed: 73 additions & 0 deletions (new file)
@@ -0,0 +1,73 @@
+import asyncio
+from unittest.mock import patch, AsyncMock
+
+from cdx_toolkit.warcer_by_cdx.aioboto3_warcer import filter_warc_by_cdx_via_aioboto3, get_range_jobs_from_index_paths
+
+
+def test_filter_warc_by_cdx_via_aioboto3_keyboard_interrupt(caplog):
+    """Test filter_warc_by_cdx_via_aioboto3 KeyboardInterrupt exception handling."""
+
+    # Mock the async function to raise KeyboardInterrupt
+    async def mock_async_function(*args, **kwargs):
+        raise KeyboardInterrupt('User interrupted')
+
+    with patch(
+        'cdx_toolkit.warcer_by_cdx.aioboto3_warcer.filter_warc_by_cdx_via_aioboto3_async',
+        side_effect=mock_async_function,
+    ):
+        # Call the function with minimal required parameters
+        result = filter_warc_by_cdx_via_aioboto3(
+            index_paths=['test_index.cdx'], prefix_path='s3://test-bucket/test-prefix', writer_info={'software': 'test'}
+        )
+
+    # Verify that KeyboardInterrupt was handled correctly
+    assert result == -1, 'Should return -1 when KeyboardInterrupt is caught'
+
+    # Check that the warning message was logged
+    assert 'Interrupted by user.' in caplog.text
+
+    # Verify the log level is warning
+    warning_records = [record for record in caplog.records if record.levelname == 'WARNING']
+    assert len(warning_records) == 1
+    assert warning_records[0].message == 'Interrupted by user.'
+
+
+def test_get_range_jobs_from_index_paths_exception_handling_with_logging(caplog):
+    """Test get_range_jobs_from_index_paths logs errors when iter_cdx_index_from_path raises."""
+
+    async def run_test():
+        # Create a mock queue
+        key_queue = AsyncMock(spec=asyncio.Queue)
+
+        # Test parameters
+        index_paths = ['failing_index.cdx']
+        warc_download_prefix = 'http://test-prefix'
+        num_fetchers = 1
+
+        # Mock iter_cdx_index_from_path to always raise an exception
+        def mock_iter_cdx_index_from_path(index_path, warc_download_prefix):
+            raise ValueError('Simulated CDX parsing error')
+
+        with patch(
+            'cdx_toolkit.warcer_by_cdx.aioboto3_warcer.iter_cdx_index_from_path',
+            side_effect=mock_iter_cdx_index_from_path,
+        ):
+            # Run the function
+            await get_range_jobs_from_index_paths(
+                key_queue=key_queue,
+                index_paths=index_paths,
+                warc_download_prefix=warc_download_prefix,
+                num_fetchers=num_fetchers,
+                limit=0,
+            )
+
+        # Verify the error was logged
+        assert 'Failed to read CDX index from failing_index.cdx' in caplog.text
+        assert 'Simulated CDX parsing error' in caplog.text
+
+        # Verify that only the STOP signal was sent (no jobs due to the exception)
+        assert key_queue.put.call_count == 1  # only the stop signal
+
+    # Run the test
+    asyncio.run(run_test())
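
Both tests stay synchronous: the coroutine is driven with asyncio.run inside the test body, so no pytest-asyncio plugin is required. The first test also pins down the synchronous wrapper's interrupt contract; a minimal sketch of wrapper logic consistent with what the test asserts (not the actual implementation, which lives in aioboto3_warcer.py and is unchanged by this commit):

    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    def run_guarded(coro) -> int:
        # Contract asserted above: on Ctrl-C, log one WARNING and return -1.
        try:
            return asyncio.run(coro)
        except KeyboardInterrupt:
            logger.warning('Interrupted by user.')
            return -1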

tests/warc_by_cdx/test_filter_cdx.py

Lines changed: 69 additions & 8 deletions
@@ -1,7 +1,10 @@
 import pytest
 
+from unittest.mock import patch
+
 from cdx_toolkit.cli import main
-from cdx_toolkit.filter_cdx import resolve_paths, validate_resolved_paths
+from cdx_toolkit.filter_cdx import _process_single_file, resolve_paths, validate_resolved_paths, filter_cdx
+from cdx_toolkit.filter_cdx.matcher import TupleMatcher
 from tests.conftest import requires_aws_s3, TEST_DATA_PATH
 
 fixture_path = TEST_DATA_PATH / 'filter_cdx'
@@ -23,7 +26,7 @@ def test_cli_filter_cdx_with_surts(tmpdir, caplog):
             f'{str(whitelist_path)}',
             f'{tmpdir}',
             '--filter-type=surt',
-            f'--input-glob={index_glob}'
+            f'--input-glob={index_glob}',
         ]
     )
 
@@ -46,7 +49,7 @@ def test_cli_filter_cdx_with_urls(tmpdir, caplog):
             f'{str(whitelist_path)}',
             f'{tmpdir}',
             '--filter-type=url',
-            f'--input-glob={index_glob}'
+            f'--input-glob={index_glob}',
        ]
     )
 
@@ -99,7 +102,7 @@ def test_filter_cdx_nonexistent_surt_file_exits(tmpdir, caplog):
             f'{index_path}',
             f'{nonexistent_surt_file}',
             f'{tmpdir}',
-            f'--input-glob={index_glob}'
+            f'--input-glob={index_glob}',
         ]
     )
 
@@ -150,15 +153,73 @@ def test_cli_filter_cdx_with_parallel_processing(tmpdir, caplog):
             f'{tmpdir}',
             '--filter-type=surt',
             f'--input-glob={index_glob}',
-            '--parallel=2'
+            '--parallel=2',
         ]
     )
 
     # Check that multiple files were processed in parallel
     assert 'Found' in caplog.text and 'files matching pattern' in caplog.text
-    assert 'File statistics for' in caplog.text
-    assert 'Total statistics:' in caplog.text
+    assert 'File statistics' in caplog.text
+    assert 'Filter statistics' in caplog.text
 
     # Should have processed multiple files (pattern matches 2 files: cdx-00187.gz and cdx-00188.gz)
-    file_stats_count = caplog.text.count('File statistics for')
+    file_stats_count = caplog.text.count('File statistics')
     assert file_stats_count == 2, 'Should process exactly 2 files with the glob pattern'
+
+
+def test_process_single_file(tmpdir):
+    input_path = TEST_DATA_PATH / 'warc_by_cdx/filtered_CC-MAIN-2024-30_cdx-00187.gz'
+    matcher = TupleMatcher(prefixes=['fr,'])
+
+    lines_n, included_n = _process_single_file(
+        input_path=input_path,
+        output_path=tmpdir + '/filter_cdx',
+        matcher=matcher,
+        log_every_n=10,
+        limit=100,
+    )
+
+    assert included_n == 100
+    assert lines_n == 100
+
+
+def test_process_single_file_empty(tmpdir):
+    input_path = tmpdir + '/input'
+    with open(input_path, 'w') as f:
+        f.write('')
+
+    lines_n, included_n = _process_single_file(
+        input_path=input_path,
+        output_path=tmpdir + '/output',
+        matcher=None,
+    )
+    assert lines_n == 0
+    assert included_n == 0
+
+
+def test_filter_cdx_error_handling(tmpdir, caplog):
+    """Test filter_cdx function error handling when exceptions occur during processing."""
+
+    def mock_process_single_file(*args, **kwargs):
+        raise ValueError()
+
+    # Create test input and output paths
+    input_paths = [str(tmpdir / 'input1.cdx'), str(tmpdir / 'input2.cdx')]
+    output_paths = [str(tmpdir / 'output1.cdx'), str(tmpdir / 'output2.cdx')]
+
+    # Replace the _process_single_file function with our mock
+    with patch('cdx_toolkit.filter_cdx._process_single_file', side_effect=mock_process_single_file):
+        # Test the error handling
+        total_lines, total_included, total_errors = filter_cdx(
+            matcher=None,
+            input_paths=input_paths,
+            output_paths=output_paths,
+        )
+
+    # Verify error handling results
+    assert total_errors == 2, f'Should have one error per failed file, got {total_errors}'
+    assert total_lines == 0, 'Should have no lines counted since every file failed'
+    assert total_included == 0, 'Should have no included lines since every file failed'
+
+    # Check that the error was logged correctly
+    assert 'generated an exception' in caplog.text
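
For context on the fixtures above: in SURT form a host is written with reversed, comma-separated labels, so the whitelist prefix 'fr,' selects every .fr host. A self-contained sketch of the prefix check the matchers implement (an illustration of the idea, not the library's actual TupleMatcher code):

    from typing import List

    def surt_prefix_match(surt_key: str, prefixes: List[str]) -> bool:
        # Keep a CDX line whose SURT key starts with any whitelist prefix.
        return any(surt_key.startswith(p) for p in prefixes)

    assert surt_prefix_match('fr,example)/page', ['fr,'])
    assert not surt_prefix_match('com,example)/', ['fr,'])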
