4 | 4 | import sys |
5 | 5 | from concurrent.futures import ProcessPoolExecutor, as_completed |
6 | 6 | from functools import partial |
| 7 | +from typing import List, Tuple |
7 | 8 |
8 | 9 | import fsspec |
9 | 10 | from surt import surt |
10 | 11 |
11 | | -from cdx_toolkit.filter_cdx.matcher import TupleMatcher, TrieMatcher |
| 12 | +from cdx_toolkit.filter_cdx.matcher import Matcher, TupleMatcher, TrieMatcher |
12 | 13 |
13 | 14 |
14 | 15 | logger = logging.getLogger(__name__) |
@@ -62,71 +63,77 @@ def run_filter_cdx(args, cmdline: str): |
62 | 63 | 'trie': TrieMatcher, |
63 | 64 | 'tuple': TupleMatcher, |
64 | 65 | } |
| 66 | + limit = 0 if args.limit is None else args.limit |
| 67 | + logger.info(f'Loaded {len(include_surt_prefixes):,} filter entries using {args.matching_approach} approach') |
65 | 68 |
66 | | - matcher = matcher_classes[args.matching_approach](include_surt_prefixes) |
| 69 | + # Process files in parallel |
| 70 | + total_lines_n, total_included_n, total_errors_n = filter_cdx( |
| 71 | + matcher=matcher_classes[args.matching_approach](include_surt_prefixes), |
| 72 | + input_paths=input_paths, |
| 73 | + output_paths=output_paths, |
| 74 | + limit=limit, |
| 75 | + n_parallel=max(1, args.parallel), |
| 76 | + ) |
67 | 77 |
68 | | - logger.info(f'Loaded {len(include_surt_prefixes):,} filter entries using {args.matching_approach} approach') |
| 78 | + logger.info( |
| 79 | + f'Filter statistics: {total_included_n} / {total_lines_n} lines ({total_included_n / max(total_lines_n, 1):.4f})' |
| 80 | + ) |
| 81 | + logger.info( |
| 82 | + f'Errors: {total_errors_n}' |
| 83 | + ) |
69 | 84 |
70 | | - # Process files in parallel or sequentially |
71 | | - n_parallel = args.parallel |
72 | | - limit = 0 if args.limit is None else args.limit |
73 | | - total_lines_n = 0 |
74 | | - total_included_n = 0 |
75 | | - total_errors_n = 0 |
76 | | - |
77 | | - if n_parallel > 1: |
78 | | - # Parallel processing |
79 | | - logger.info('Parallel processes: %i', n_parallel) |
80 | | - with ProcessPoolExecutor(max_workers=n_parallel) as executor: |
81 | | - # Create partial function with common arguments |
82 | | - process_file_partial = partial(_process_single_file, matcher=matcher, limit=limit) |
83 | | - |
84 | | - # Submit all jobs |
85 | | - future_to_paths = { |
86 | | - executor.submit(process_file_partial, input_path, output_path): (input_path, output_path) |
87 | | - for input_path, output_path in zip(input_paths, output_paths) |
88 | | - } |
89 | | - |
90 | | - # Collect results |
91 | | - for future in as_completed(future_to_paths): |
92 | | - input_path, output_path = future_to_paths[future] |
93 | | - try: |
94 | | - lines_n, included_n = future.result() |
95 | | - logger.info( |
96 | | - f'File statistics for {input_path}: included_n={included_n}; lines_n={lines_n}; ratio={included_n / lines_n:.4f}' |
97 | | - ) |
98 | | - total_lines_n += lines_n |
99 | | - total_included_n += included_n |
100 | | - |
101 | | - except Exception as exc: |
102 | | - logger.error(f'File {input_path} generated an exception: {exc}') |
103 | | - total_errors_n += 1 |
104 | | - else: |
105 | | - # Sequential processing |
106 | | - logger.info('Sequential processing') |
107 | | - for input_path, output_path in zip(input_paths, output_paths): |
| 85 | + if limit > 0 and total_included_n >= limit: |
| 86 | + logger.info(f'Limit reached at {limit}') |
| 87 | + |
| 88 | + # End timing and log execution time |
| 89 | + end_time = time.time() |
| 90 | + execution_time = end_time - start_time |
| 91 | + |
| 92 | + logger.info(f'Script execution time: {execution_time:.3f} seconds') |
| 93 | + |
| 94 | + |
| 95 | +def filter_cdx( |
| 96 | + matcher: Matcher, |
| 97 | + input_paths: List[str], |
| 98 | + output_paths: List[str], |
| 99 | + n_parallel: int = 1, |
| 100 | + limit: int = 0, |
| 101 | + total_lines_n: int = 0, |
| 102 | + total_included_n: int = 0, |
| 103 | + total_errors_n: int = 0, |
| 104 | +) -> Tuple[int, int, int]: |
| 105 | + """Filter CDX files from input paths using a matcher to output paths.""" |
| 106 | + |
| 107 | + # Parallel processing |
| 108 | + logger.info('Filtering with %i processes in parallel (limit: %i)', n_parallel, limit) |
| 109 | + |
| 110 | + with ProcessPoolExecutor(max_workers=n_parallel) as executor: |
| 111 | + # Create partial function with common arguments |
| 112 | + process_file_partial = partial(_process_single_file, matcher=matcher, limit=limit) |
| 113 | + |
| 114 | + # Submit all jobs |
| 115 | + future_to_paths = { |
| 116 | + executor.submit(process_file_partial, input_path, output_path): (input_path, output_path) |
| 117 | + for input_path, output_path in zip(input_paths, output_paths) |
| 118 | + } |
| 119 | + |
| 120 | + # Collect results |
| 121 | + for future in as_completed(future_to_paths): |
| 122 | + input_path, output_path = future_to_paths[future] |
108 | 123 | try: |
109 | | - lines_n, included_n = _process_single_file(input_path, output_path, matcher, limit) |
| 124 | + lines_n, included_n = future.result() |
110 | 125 | logger.info( |
111 | | - f'File statistics for {input_path}: included_n={included_n}; lines_n={lines_n}; ratio={included_n / lines_n:.4f}' |
| 126 | + f'File statistics: included {included_n} / {lines_n} lines: {input_path}' |
112 | 127 | ) |
| 128 | + |
113 | 129 | total_lines_n += lines_n |
114 | 130 | total_included_n += included_n |
115 | 131 |
116 | 132 | except Exception as exc: |
117 | 133 | logger.error(f'File {input_path} generated an exception: {exc}') |
118 | 134 | total_errors_n += 1 |
119 | | - logger.info( |
120 | | - f'Total statistics: included_n={total_included_n}; lines_n={total_lines_n}; ratio={total_included_n / total_lines_n:.4f}' |
121 | | - ) |
122 | | - if total_errors_n > 0: |
123 | | - logger.error('Processing errors: %i', total_errors_n) |
124 | | - |
125 | | - # End timing and log execution time |
126 | | - end_time = time.time() |
127 | | - execution_time = end_time - start_time |
128 | 135 |
129 | | - logger.info(f'Script execution time: {execution_time:.3f} seconds') |
| 136 | + return total_lines_n, total_included_n, total_errors_n |
130 | 137 |
131 | 138 |
132 | 139 | def resolve_paths(input_base_path: str, input_glob: str, output_base_path: str): |
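Taken together, the hunks above replace the inlined parallel/sequential branches with a reusable filter_cdx() helper. A minimal usage sketch, assuming filter_cdx() is importable alongside run_filter_cdx and that TupleMatcher accepts an iterable of SURT prefixes; the prefixes and paths below are hypothetical:

    from cdx_toolkit.filter_cdx.matcher import TupleMatcher

    # Hypothetical SURT prefixes; in the CLI these come from the filter file.
    matcher = TupleMatcher(['org,example)/', 'com,example)/blog'])

    lines_n, included_n, errors_n = filter_cdx(
        matcher=matcher,
        input_paths=['cdx/part-0000.gz', 'cdx/part-0001.gz'],
        output_paths=['out/part-0000.gz', 'out/part-0001.gz'],
        n_parallel=2,  # worker count for ProcessPoolExecutor
        limit=0,       # 0 disables the per-file line limit
    )
    print(f'kept {included_n} of {lines_n} lines; {errors_n} file errors')

Note that ProcessPoolExecutor pickles the arguments bound into the partial for each submitted job, so the matcher must be picklable and is copied into every worker process.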
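The hunk imports Matcher only for the new type annotation, and nothing in this diff shows its methods. For orientation, here is a rough sketch of the interface the TupleMatcher/TrieMatcher pair presumably shares — an assumption for illustration, not the library's actual API:

    from typing import Iterable

    class Matcher:
        """Assumed base interface: test a SURT key against loaded prefixes."""

        def __init__(self, prefixes: Iterable[str]):
            self.prefixes = tuple(prefixes)

        def matches(self, surt_key: str) -> bool:  # hypothetical method name
            raise NotImplementedError

    class TupleMatcher(Matcher):
        """Assumed linear-scan variant; str.startswith accepts a tuple of prefixes."""

        def matches(self, surt_key: str) -> bool:
            return surt_key.startswith(self.prefixes)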