Skip to content

Commit 3184ef6

Browse files
feat(dedupe): add parallel workers, colored group rows in HTML
1 parent 9c53cd7 commit 3184ef6

File tree

4 files changed

+79
-33
lines changed

4 files changed

+79
-33
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.2.1] - 2026-01-14
9+
10+
### Added
11+
12+
- **Dedupe mode**: `-w`/`--workers` option for parallel processing
13+
14+
### Improved
15+
16+
- **Dedupe HTML report**: Visual grouping with distinct colors per duplicate group
17+
- **Dedupe HTML report**: Added "Group" column to easily identify duplicate sets
18+
819
## [0.2.0] - 2026-01-13
920

1021
### Added

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ python main.py [mode] [options] [TARGET_PATHS]...
7070
* `--output`, `-o`: Used with `analyze` and `dedupe` modes. Specify the output path for the HTML report.
7171
* `--force`: Used with `repair` mode. Forces re-encoding of all files, even if they are valid.
7272
* `--assume-album`: Used with `replaygain` mode. Treats all processed files as a single album for ReplayGain calculation.
73-
* `-w`, `--workers`: Number of parallel workers for faster processing.
73+
* `-w`, `--workers`: Number of parallel workers for faster processing (available in `analyze`, `repair`, and `dedupe` modes).
7474
* `-v`, `--verbose`: Enables detailed debug output.
7575
* `-q`, `--quiet`: Suppresses all informational output, showing only errors.
7676

flac_toolkit/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,9 @@ def dedupe(args):
159159
logging.info("DEDUPE Mode - Scanning for duplicates\n" + "=" * 50)
160160
target_paths = [Path(p) for p in args.target_paths]
161161
output_html = args.output
162+
workers = args.workers
162163

163-
results = find_duplicates(target_paths)
164+
results = find_duplicates(target_paths, workers=workers)
164165
print_duplicate_report(results)
165166

166167
# Generate HTML Report
@@ -202,6 +203,7 @@ def main():
202203
dedupe_parser = subparsers.add_parser('dedupe', help='Find duplicate FLAC files (strict and audio-only).')
203204
dedupe_parser.add_argument('target_paths', nargs='+', help='One or more files or directories to process.')
204205
dedupe_parser.add_argument('-o', '--output', type=str, default='flac_duplicate_report.html', help='Path to the output HTML report.')
206+
dedupe_parser.add_argument('-w', '--workers', type=int, default=None, help='Number of parallel workers.')
205207

206208
args = parser.parse_args()
207209

flac_toolkit/dedupe.py

Lines changed: 64 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import hashlib
22
import logging
3+
import os
4+
import sys
35
from pathlib import Path
46
from collections import defaultdict
5-
from typing import List, Dict, NamedTuple, Optional
7+
from typing import List, Dict, NamedTuple, Optional, Tuple
8+
from concurrent.futures import ProcessPoolExecutor, as_completed
69
from tqdm import tqdm
710
from mutagen.flac import FLAC
811
from flac_toolkit.core import find_flac_files
@@ -21,7 +24,30 @@ def get_file_content_hash(path: Path) -> str:
2124
sha256_hash.update(byte_block)
2225
return sha256_hash.hexdigest()
2326

24-
def find_duplicates(target_paths: List[Path]) -> List[DuplicateGroup]:
27+
28+
def _get_audio_signature(file_path: Path) -> Tuple[Path, Optional[str], Optional[str]]:
    """
    Compute the audio MD5 signature for one FLAC file (process-pool worker).

    Returns a ``(file_path, signature_hex, error_message)`` triple; on success
    the error element is None, on failure the signature element is None.
    """
    try:
        flac = FLAC(file_path)
        header_sig = flac.info.md5_signature
        if header_sig != 0:
            # Signature is present in the STREAMINFO header — use it directly.
            return (file_path, format(header_sig, '032x'), None)
        # Header signature is missing (all zeros): hash the audio manually.
        computed, error = _calculate_audio_md5(file_path, flac.info.bits_per_sample)
        if computed:
            return (file_path, computed, None)
        return (file_path, None, error)
    except Exception as exc:
        # Any read/parse failure is reported back as an error string so the
        # parent process can log and skip the file.
        return (file_path, None, str(exc))
49+
50+
def find_duplicates(target_paths: List[Path], workers: Optional[int] = None) -> List[DuplicateGroup]:
2551
"""
2652
Finds groups of files with identical audio content.
2753
Within those groups, identifies files that are strictly identical (byte-for-byte).
@@ -33,28 +59,31 @@ def find_duplicates(target_paths: List[Path]) -> List[DuplicateGroup]:
3359
# Map: Audio MD5 -> List of files
3460
audio_map = defaultdict(list)
3561

36-
# Using tqdm for progress feedback
37-
pbar = tqdm(files, desc="Scanning audio signatures", unit="file")
38-
for f in pbar:
39-
try:
40-
audio = FLAC(f)
41-
sig = audio.info.md5_signature
42-
43-
sig_hex = ""
44-
if sig == 0:
45-
# If signature is missing in header, calculate it manually
46-
calc_sig, err = _calculate_audio_md5(f, audio.info.bits_per_sample)
47-
if calc_sig:
48-
sig_hex = calc_sig
49-
else:
50-
logging.warning(f"\nSkipping: {f}\n Reason: {err}")
51-
continue
52-
else:
53-
sig_hex = format(sig, '032x')
54-
55-
audio_map[sig_hex].append(f)
56-
except Exception as e:
57-
logging.error(f"\nError reading {f}: {e}")
62+
if workers is not None and workers == 1:
63+
# Sequential execution
64+
tqdm.write("Running in sequential mode (1 worker).")
65+
for f in tqdm(files, desc="Scanning audio signatures", unit="file", miniters=1, mininterval=0.0, file=sys.stdout):
66+
f, sig_hex, err = _get_audio_signature(f)
67+
if err:
68+
logging.warning(f"\nSkipping: {f}\n Reason: {err}")
69+
elif sig_hex:
70+
audio_map[sig_hex].append(f)
71+
else:
72+
# Parallel execution
73+
effective_workers = workers if workers else os.cpu_count()
74+
# Limit workers on Windows
75+
if effective_workers and effective_workers > 61:
76+
effective_workers = 61
77+
tqdm.write(f"Running in parallel mode ({effective_workers} workers).")
78+
79+
with ProcessPoolExecutor(max_workers=effective_workers) as executor:
80+
futures = [executor.submit(_get_audio_signature, f) for f in files]
81+
for future in tqdm(as_completed(futures), total=len(files), desc="Scanning audio signatures", unit="file", miniters=1, mininterval=0.0, file=sys.stdout):
82+
file_path, sig_hex, err = future.result()
83+
if err:
84+
logging.warning(f"\nSkipping: {file_path}\n Reason: {err}")
85+
elif sig_hex:
86+
audio_map[sig_hex].append(file_path)
5887

5988
results = []
6089

@@ -81,7 +110,7 @@ def find_duplicates(target_paths: List[Path]) -> List[DuplicateGroup]:
81110

82111
def print_duplicate_report(results: List[DuplicateGroup]):
83112
if not results:
84-
print("\nNo duplicates found! Great job.")
113+
print("\nNo duplicates found!")
85114
return
86115

87116
print(f"\nFound {len(results)} groups of duplicate audio content.\n")
@@ -131,10 +160,16 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
131160
total_duplicate_files = sum(len(g.files) for g in results)
132161
total_strict_sets = sum(len(g.strict_groups) for g in results)
133162

134-
# Build table rows
163+
# Build table rows - generate distinct colors for groups
164+
# Using HSL with fixed saturation/lightness, varying hue
165+
def get_group_color(idx: int, total: int) -> str:
    """Return a distinct pastel HSL color string for duplicate group *idx*.

    The golden angle (~137.508 deg) spreads consecutive hues far apart, so
    neighboring groups get visually distinct colors.  *total* is accepted
    for interface compatibility but does not affect the result.
    """
    golden_angle = 137.508
    hue = (idx * golden_angle) % 360
    return f"hsl({hue:.0f}, 50%, 95%)"
169+
135170
table_rows = []
136171
for group_idx, group in enumerate(results, 1):
137-
row_class = "row-even" if group_idx % 2 == 0 else "row-odd"
172+
group_color = get_group_color(group_idx, total_groups)
138173
for f in group.files:
139174
# Determine duplicate type
140175
is_strict = any(f in sg for sg in group.strict_groups)
@@ -173,7 +208,7 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
173208
)
174209

175210
table_rows.append(f"""
176-
<tr class="{row_class}" data-group="{group_idx}">
211+
<tr style="background-color: {group_color};" data-group="{group_idx}">
177212
<td class="col-group">{group_idx}</td>
178213
<td>{file_cell}</td>
179214
<td>{folder_cell}</td>
@@ -335,9 +370,7 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
335370
}}
336371
.folder-link:hover {{ opacity: 0.7; }}
337372
338-
/* Group row coloring */
339-
.row-odd {{ background-color: #ffffff; }}
340-
.row-even {{ background-color: #f8f9fa; }}
373+
/* Group row coloring - distinct colors per group */
341374
.col-group {{
342375
font-weight: 600;
343376
text-align: center;

0 commit comments

Comments
 (0)