11import hashlib
22import logging
3+ import os
4+ import sys
35from pathlib import Path
46from collections import defaultdict
5- from typing import List , Dict , NamedTuple , Optional
7+ from typing import List , Dict , NamedTuple , Optional , Tuple
8+ from concurrent .futures import ProcessPoolExecutor , as_completed
69from tqdm import tqdm
710from mutagen .flac import FLAC
811from flac_toolkit .core import find_flac_files
@@ -21,7 +24,30 @@ def get_file_content_hash(path: Path) -> str:
2124 sha256_hash .update (byte_block )
2225 return sha256_hash .hexdigest ()
2326
24- def find_duplicates (target_paths : List [Path ]) -> List [DuplicateGroup ]:
27+
def _get_audio_signature(file_path: Path) -> Tuple[Path, Optional[str], Optional[str]]:
    """
    Worker function to get the audio MD5 signature for a single file.

    Written to be submitted to an executor: it never raises, and instead
    reports failures through the returned tuple.

    Returns:
        ``(file_path, signature_hex, error_message)``. On success
        ``signature_hex`` is the hex digest and ``error_message`` is ``None``.
        On failure ``signature_hex`` is ``None`` and ``error_message`` is
        always a non-empty string, so callers that test ``if err:`` cannot
        silently drop a failed file.
    """
    try:
        audio = FLAC(file_path)
        sig = audio.info.md5_signature

        if sig != 0:
            # Signature stored in the header: render it as 32 hex digits.
            return (file_path, format(sig, '032x'), None)

        # If signature is missing in header, calculate it manually
        calc_sig, err = _calculate_audio_md5(file_path, audio.info.bits_per_sample)
        if calc_sig:
            return (file_path, calc_sig, None)
        # Guarantee a truthy message even if the helper gave no reason.
        return (file_path, None, err or "Could not calculate audio MD5 signature")
    except Exception as e:
        # Broad on purpose: any failure must come back as data, not as an
        # exception. Some exceptions stringify to "", so fall back to repr().
        return (file_path, None, str(e) or repr(e))
48+
49+
50+ def find_duplicates (target_paths : List [Path ], workers : Optional [int ] = None ) -> List [DuplicateGroup ]:
2551 """
2652 Finds groups of files with identical audio content.
2753 Within those groups, identifies files that are strictly identical (byte-for-byte).
@@ -33,28 +59,31 @@ def find_duplicates(target_paths: List[Path]) -> List[DuplicateGroup]:
3359 # Map: Audio MD5 -> List of files
3460 audio_map = defaultdict (list )
3561
36- # Using tqdm for progress feedback
37- pbar = tqdm (files , desc = "Scanning audio signatures" , unit = "file" )
38- for f in pbar :
39- try :
40- audio = FLAC (f )
41- sig = audio .info .md5_signature
42-
43- sig_hex = ""
44- if sig == 0 :
45- # If signature is missing in header, calculate it manually
46- calc_sig , err = _calculate_audio_md5 (f , audio .info .bits_per_sample )
47- if calc_sig :
48- sig_hex = calc_sig
49- else :
50- logging .warning (f"\n Skipping: { f } \n Reason: { err } " )
51- continue
52- else :
53- sig_hex = format (sig , '032x' )
54-
55- audio_map [sig_hex ].append (f )
56- except Exception as e :
57- logging .error (f"\n Error reading { f } : { e } " )
62+ if workers is not None and workers == 1 :
63+ # Sequential execution
64+ tqdm .write ("Running in sequential mode (1 worker)." )
65+ for f in tqdm (files , desc = "Scanning audio signatures" , unit = "file" , miniters = 1 , mininterval = 0.0 , file = sys .stdout ):
66+ f , sig_hex , err = _get_audio_signature (f )
67+ if err :
68+ logging .warning (f"\n Skipping: { f } \n Reason: { err } " )
69+ elif sig_hex :
70+ audio_map [sig_hex ].append (f )
71+ else :
72+ # Parallel execution
73+ effective_workers = workers if workers else os .cpu_count ()
74+ # Cap at 61 workers: ProcessPoolExecutor rejects max_workers > 61 on Windows (the cap is applied on every platform)
75+ if effective_workers and effective_workers > 61 :
76+ effective_workers = 61
77+ tqdm .write (f"Running in parallel mode ({ effective_workers } workers)." )
78+
79+ with ProcessPoolExecutor (max_workers = effective_workers ) as executor :
80+ futures = [executor .submit (_get_audio_signature , f ) for f in files ]
81+ for future in tqdm (as_completed (futures ), total = len (files ), desc = "Scanning audio signatures" , unit = "file" , miniters = 1 , mininterval = 0.0 , file = sys .stdout ):
82+ file_path , sig_hex , err = future .result ()
83+ if err :
84+ logging .warning (f"\n Skipping: { file_path } \n Reason: { err } " )
85+ elif sig_hex :
86+ audio_map [sig_hex ].append (file_path )
5887
5988 results = []
6089
@@ -81,7 +110,7 @@ def find_duplicates(target_paths: List[Path]) -> List[DuplicateGroup]:
81110
82111def print_duplicate_report (results : List [DuplicateGroup ]):
83112 if not results :
84- print ("\n No duplicates found! Great job. " )
113+ print ("\n No duplicates found!" )
85114 return
86115
87116 print (f"\n Found { len (results )} groups of duplicate audio content.\n " )
@@ -131,10 +160,16 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
131160 total_duplicate_files = sum (len (g .files ) for g in results )
132161 total_strict_sets = sum (len (g .strict_groups ) for g in results )
133162
134- # Build table rows
163+ # Build table rows - generate distinct colors for groups
164+ # Using HSL with fixed saturation/lightness, varying hue
def get_group_color(idx: int, total: int) -> str:
    """Return a pastel HSL colour string for the group numbered ``idx``.

    The hue advances by the golden angle (137.508 degrees) per index and is
    wrapped into [0, 360); saturation and lightness are fixed at 50% / 95%.
    ``total`` is accepted but not used in the calculation.
    """
    golden_angle = 137.508
    hue_degrees = (idx * golden_angle) % 360
    return f"hsl({hue_degrees:.0f}, 50%, 95%)"
169+
135170 table_rows = []
136171 for group_idx , group in enumerate (results , 1 ):
137- row_class = "row-even" if group_idx % 2 == 0 else "row-odd"
172+ group_color = get_group_color ( group_idx , total_groups )
138173 for f in group .files :
139174 # Determine duplicate type
140175 is_strict = any (f in sg for sg in group .strict_groups )
@@ -173,7 +208,7 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
173208 )
174209
175210 table_rows .append (f"""
176- <tr class=" { row_class } " data-group="{ group_idx } ">
211+ <tr style="background-color: { group_color } ; " data-group="{ group_idx } ">
177212 <td class="col-group">{ group_idx } </td>
178213 <td>{ file_cell } </td>
179214 <td>{ folder_cell } </td>
@@ -335,9 +370,7 @@ def generate_dedupe_html_report(results: List[DuplicateGroup], output_path: Path
335370 }}
336371 .folder-link:hover {{ opacity: 0.7; }}
337372
338- /* Group row coloring */
339- .row-odd {{ background-color: #ffffff; }}
340- .row-even {{ background-color: #f8f9fa; }}
373+ /* Group row coloring - distinct colors per group */
341374 .col-group {{
342375 font-weight: 600;
343376 text-align: center;
0 commit comments