Skip to content

Commit a0ebe2f

Browse files
author
Артём Земляк
committed
feat: Add image deduplication to prevent saving duplicate images
- Add SHA256 hash-based duplicate detection in FileProcessor
- Reuse existing images instead of creating duplicates
- Add cleanup script for removing existing duplicates
- Add comprehensive tests for deduplication logic
- Bump version to 0.2.0

Fixes issue where identical images were saved multiple times with different timestamps, wasting disk space (~260KB on existing data).

Changes:
- src/processor/file_processor.py: Add _compute_file_hash() and _find_existing_image_by_hash()
- tests/test_file_processor.py: Add deduplication tests
- scripts/cleanup_duplicate_images.py: New utility script
- pyproject.toml, src/__init__.py: Version bump to 0.2.0
1 parent b57b51b commit a0ebe2f

File tree

5 files changed

+384
-18
lines changed

5 files changed

+384
-18
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[project]
66
name = "tg-note"
7-
version = "0.1.1"
7+
version = "0.2.0"
88
description = "Intelligent Knowledge Base Builder - Telegram bot that automatically transforms your messages, reposts, and articles into a structured knowledge base using AI agents"
99
readme = "README.md"
1010
license = "MIT"
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Script to clean up duplicate images in knowledge base.
5+
6+
This script:
7+
1. Scans the images directory for duplicate images based on file hash
8+
2. Keeps the oldest file (by timestamp in filename)
9+
3. Removes newer duplicates
10+
4. Reports what was removed
11+
12+
Usage:
13+
python scripts/cleanup_duplicate_images.py [--dry-run] <images_dir>
14+
15+
Example:
16+
python scripts/cleanup_duplicate_images.py --dry-run knowledge_base/tg-note-kb/images
17+
python scripts/cleanup_duplicate_images.py knowledge_base/tg-note-kb/images
18+
"""
19+
20+
import argparse
21+
import hashlib
22+
import sys
23+
from collections import defaultdict
24+
from pathlib import Path
25+
26+
27+
def compute_file_hash(file_path):
    """Compute the SHA256 hash of a file's content.

    Reads the file in fixed-size chunks so arbitrarily large images do not
    have to fit in memory at once (the previous implementation loaded the
    entire file with a single ``f.read()``).

    Args:
        file_path: Path to the file to hash.

    Returns:
        str: SHA256 digest as a lowercase hex string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # 64 KiB chunks: large enough to amortize syscall overhead,
        # small enough to keep memory usage flat for big files.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
31+
32+
33+
def extract_timestamp_from_filename(filename):
    """Pull the Unix timestamp out of an image filename.

    Filenames follow the pattern ``img_<timestamp>_<file_id>.ext``
    (e.g. ``img_1762860964_AgACAgIA.jpg``). Any name that does not
    match the pattern yields 0, which sorts such files first.
    """
    pieces = filename.split("_")
    # Must start with the "img" prefix and have a timestamp segment.
    if len(pieces) < 2 or pieces[0] != "img":
        return 0
    try:
        return int(pieces[1])
    except ValueError:
        # Second segment is not a bare integer (e.g. "img_123.jpg").
        return 0
43+
44+
45+
def find_duplicates(images_dir):
    """Group image files in *images_dir* by content hash.

    Args:
        images_dir: Directory to scan for ``img_*`` image files.

    Returns:
        dict: hash -> list of file paths, containing only hashes that
        occur on two or more files (i.e. actual duplicate sets).
    """
    buckets = defaultdict(list)

    print(f"Scanning directory: {images_dir}")

    # Scan every supported image extension separately.
    for ext in (".jpg", ".jpeg", ".png", ".gif", ".tiff", ".bmp", ".webp"):
        for candidate in images_dir.glob(f"img_*{ext}"):
            if not candidate.is_file():
                continue
            try:
                buckets[compute_file_hash(candidate)].append(candidate)
            except Exception as e:
                print(f"Warning: Error processing {candidate}: {e}", file=sys.stderr)

    # A hash with a single file is unique — keep only real duplicates.
    return {digest: paths for digest, paths in buckets.items() if len(paths) > 1}
70+
71+
72+
def cleanup_duplicates(duplicates, dry_run=False):
    """
    Remove duplicate files, keeping the oldest one.

    For each duplicate set, files are ordered by the timestamp embedded in
    their filename; the oldest is kept and the rest are deleted.

    Args:
        duplicates: Hash -> list of duplicate files
        dry_run: If True, don't actually delete files

    Returns:
        tuple: (total_files_removed, total_bytes_saved)
    """
    files_removed = 0
    bytes_saved = 0

    for file_hash, files in duplicates.items():
        # Sort by timestamp (oldest first)
        sorted_files = sorted(files, key=lambda f: extract_timestamp_from_filename(f.name))

        # Keep the first (oldest) file
        keep_file = sorted_files[0]
        remove_files = sorted_files[1:]

        print(f"\n🔍 Hash: {file_hash[:8]}...")
        print(f" ✅ KEEP: {keep_file.name}")

        for remove_file in remove_files:
            file_size = remove_file.stat().st_size

            if dry_run:
                bytes_saved += file_size
                print(f" 🗑️ WOULD REMOVE: {remove_file.name} ({file_size} bytes)")
            else:
                try:
                    remove_file.unlink()
                    print(f" ❌ REMOVED: {remove_file.name} ({file_size} bytes)")
                    files_removed += 1
                    # BUGFIX: count bytes only after a successful unlink;
                    # previously a failed deletion still inflated bytes_saved,
                    # making the summary over-report the space reclaimed.
                    bytes_saved += file_size
                except Exception as e:
                    print(f" ⚠️ ERROR removing {remove_file.name}: {e}", file=sys.stderr)

    return files_removed, bytes_saved
112+
113+
114+
def format_bytes(bytes_count):
    """Render a byte count as a human-readable size string (e.g. '1.50 KB')."""
    value = float(bytes_count)
    labels = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    # Scale down by 1024 until the value fits the unit, capping at TB.
    while value >= 1024.0 and idx < len(labels) - 1:
        value /= 1024.0
        idx += 1
    return f"{value:.2f} {labels[idx]}"
121+
122+
123+
def main():
    """CLI entry point: validate arguments, find duplicate images, clean up."""
    parser = argparse.ArgumentParser(
        description="Clean up duplicate images in knowledge base",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("images_dir", type=Path, help="Path to images directory")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be removed without actually removing files",
    )
    args = parser.parse_args()

    # Fail fast on a bad target directory.
    if not args.images_dir.exists():
        print(f"❌ Error: Directory does not exist: {args.images_dir}", file=sys.stderr)
        sys.exit(1)
    if not args.images_dir.is_dir():
        print(f"❌ Error: Not a directory: {args.images_dir}", file=sys.stderr)
        sys.exit(1)

    banner = "=" * 60

    print(banner)
    print("🔍 Scanning for duplicate images...")
    print(banner)

    duplicates = find_duplicates(args.images_dir)

    if not duplicates:
        print("\n✅ No duplicates found!")
        return

    # Every set keeps one file; the rest are removable.
    total_duplicate_files = sum(len(files) - 1 for files in duplicates.values())
    print(f"\n📊 Found {len(duplicates)} sets of duplicates")
    print(f"📊 Total duplicate files to remove: {total_duplicate_files}")

    print("\n" + banner)
    if args.dry_run:
        print("🔍 DRY RUN MODE - No files will be deleted")
    else:
        print("🗑️ CLEANUP MODE - Removing duplicates...")
    print(banner)

    files_removed, bytes_saved = cleanup_duplicates(duplicates, dry_run=args.dry_run)

    print("\n" + banner)
    print("📊 SUMMARY")
    print(banner)
    if args.dry_run:
        print(f"Would remove: {total_duplicate_files} files")
        print(f"Would save: {format_bytes(bytes_saved)}")
        print("\nRun without --dry-run to actually remove duplicates")
    else:
        print(f"✅ Removed: {files_removed} files")
        print(f"💾 Saved: {format_bytes(bytes_saved)}")
183+
184+
185+
# Script entry point: run the cleanup only when executed directly,
# so the module can be imported (e.g. by tests) without side effects.
if __name__ == "__main__":
    main()

src/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
# -*- coding: utf-8 -*-
12
"""
23
tg-note: Telegram bot для автоматического создания заметок в базе знаний
34
"""
45

5-
__version__ = "0.1.1"
6+
__version__ = "0.2.0"

src/processor/file_processor.py

Lines changed: 79 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55

66
import base64
7+
import hashlib
78
import tempfile
89
from pathlib import Path
910
from typing import Any, Dict, List, Optional, Set, Tuple
@@ -817,6 +818,49 @@ async def process_file(self, file_path: Path) -> Optional[Dict[str, Any]]:
817818
self.logger.warning("No Docling backend configured for processing.")
818819
return None
819820

821+
def _compute_file_hash(self, file_content: bytes) -> str:
822+
"""
823+
Compute SHA256 hash of file content.
824+
825+
Args:
826+
file_content: File content as bytes
827+
828+
Returns:
829+
SHA256 hash as hex string
830+
"""
831+
return hashlib.sha256(file_content).hexdigest()
832+
833+
def _find_existing_image_by_hash(
834+
self, file_hash: str, images_dir: Path, extension: str
835+
) -> Optional[Path]:
836+
"""
837+
Find existing image with same hash in images directory.
838+
839+
Args:
840+
file_hash: SHA256 hash of the file
841+
images_dir: Directory containing images
842+
extension: File extension (e.g., '.jpg')
843+
844+
Returns:
845+
Path to existing file if found, None otherwise
846+
"""
847+
# AICODE-NOTE: Check all images with the same extension
848+
pattern = f"img_*{extension}"
849+
for existing_file in images_dir.glob(pattern):
850+
if existing_file.is_file():
851+
try:
852+
with open(existing_file, "rb") as f:
853+
existing_hash = self._compute_file_hash(f.read())
854+
if existing_hash == file_hash:
855+
self.logger.info(
856+
f"Found duplicate image: {existing_file.name} (hash: {file_hash[:8]}...)"
857+
)
858+
return existing_file
859+
except Exception as e:
860+
self.logger.warning(f"Error checking file {existing_file}: {e}")
861+
continue
862+
return None
863+
820864
async def download_and_process_telegram_file(
821865
self,
822866
bot,
@@ -876,37 +920,56 @@ async def download_and_process_telegram_file(
876920
is_image = extension in ["jpg", "jpeg", "png", "gif", "tiff", "bmp", "webp"]
877921
save_to_kb = kb_images_dir is not None and is_image
878922

923+
# Download file first to check for duplicates
924+
downloaded_file = await bot.download_file(file_info.file_path)
925+
879926
if save_to_kb:
880927
# AICODE-NOTE: Save images to KB for later reference in markdown files
881-
# Generate unique filename using timestamp and file_id
882-
# Convert to absolute path to avoid file URI errors
928+
# Check for duplicates before saving
883929
kb_images_dir_abs = kb_images_dir.resolve()
884930
kb_images_dir_abs.mkdir(parents=True, exist_ok=True)
885931

886-
# Generate unique filename
887-
import time
932+
# Compute hash of downloaded file
933+
file_hash = self._compute_file_hash(downloaded_file)
888934

889-
timestamp = message_date or int(time.time())
890-
# Use first 8 chars of file_id as identifier (if available)
891-
file_suffix = f"_{file_id[:8]}" if file_id else ""
892-
unique_filename = f"img_{timestamp}{file_suffix}{file_extension}"
935+
# Check if this image already exists
936+
existing_file = self._find_existing_image_by_hash(
937+
file_hash, kb_images_dir_abs, file_extension
938+
)
893939

894-
save_path = kb_images_dir_abs / unique_filename
895-
self.logger.info(f"Downloading Telegram image to KB: {save_path}")
940+
if existing_file:
941+
# Use existing file instead of saving duplicate
942+
save_path = existing_file
943+
unique_filename = existing_file.name
944+
self.logger.info(f"Reusing existing image (duplicate detected): {save_path}")
945+
else:
946+
# Generate unique filename using timestamp and file_id
947+
import time
948+
949+
timestamp = message_date or int(time.time())
950+
# Use first 8 chars of file_id as identifier (if available)
951+
file_suffix = f"_{file_id[:8]}" if file_id else ""
952+
unique_filename = f"img_{timestamp}{file_suffix}{file_extension}"
953+
954+
save_path = kb_images_dir_abs / unique_filename
955+
self.logger.info(f"Saving new image to KB: {save_path}")
956+
957+
# Write file only if it's not a duplicate
958+
with open(save_path, "wb") as f:
959+
f.write(downloaded_file)
960+
961+
self.logger.info(f"File downloaded to: {save_path}")
896962
else:
897963
# Use temporary directory for non-images or when KB path not provided
898964
temp_dir = tempfile.mkdtemp(prefix="tg_note_file_")
899965
temp_filename = f"telegram_file{file_extension}"
900966
save_path = Path(temp_dir) / temp_filename
901967
self.logger.info(f"Downloading Telegram file to temp: {save_path}")
902968

903-
# Download file
904-
downloaded_file = await bot.download_file(file_info.file_path)
905-
906-
with open(save_path, "wb") as f:
907-
f.write(downloaded_file)
969+
with open(save_path, "wb") as f:
970+
f.write(downloaded_file)
908971

909-
self.logger.info(f"File downloaded to: {save_path}")
972+
self.logger.info(f"File downloaded to: {save_path}")
910973

911974
# AICODE-NOTE: Validate saved image path (for images saved to KB)
912975
if save_to_kb and is_image:

0 commit comments

Comments
 (0)