Skip to content

Commit a0ebe2f

Browse files
author
Артём Земляк
committed
feat: Add image deduplication to prevent saving duplicate images
- Add SHA256 hash-based duplicate detection in FileProcessor
- Reuse existing images instead of creating duplicates
- Add cleanup script for removing existing duplicates
- Add comprehensive tests for deduplication logic
- Bump version to 0.2.0

Fixes issue where identical images were saved multiple times with different timestamps, wasting disk space (~260KB on existing data).

Changes:
- src/processor/file_processor.py: Add _compute_file_hash() and _find_existing_image_by_hash()
- tests/test_file_processor.py: Add deduplication tests
- scripts/cleanup_duplicate_images.py: New utility script
- pyproject.toml, src/__init__.py: Version bump to 0.2.0
1 parent b57b51b commit a0ebe2f

File tree

5 files changed

+384
-18
lines changed

5 files changed

+384
-18
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[project]
66
name = "tg-note"
7-
version = "0.1.1"
7+
version = "0.2.0"
88
description = "Intelligent Knowledge Base Builder - Telegram bot that automatically transforms your messages, reposts, and articles into a structured knowledge base using AI agents"
99
readme = "README.md"
1010
license = "MIT"
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Script to clean up duplicate images in knowledge base.
5+
6+
This script:
7+
1. Scans the images directory for duplicate images based on file hash
8+
2. Keeps the oldest file (by timestamp in filename)
9+
3. Removes newer duplicates
10+
4. Reports what was removed
11+
12+
Usage:
13+
python scripts/cleanup_duplicate_images.py [--dry-run] <images_dir>
14+
15+
Example:
16+
python scripts/cleanup_duplicate_images.py --dry-run knowledge_base/tg-note-kb/images
17+
python scripts/cleanup_duplicate_images.py knowledge_base/tg-note-kb/images
18+
"""
19+
20+
import argparse
21+
import hashlib
22+
import sys
23+
from collections import defaultdict
24+
from pathlib import Path
25+
26+
27+
def compute_file_hash(file_path):
    """Compute the SHA256 hash of a file's content.

    Reads the file in fixed-size chunks so arbitrarily large images do not
    have to fit in memory at once (the previous implementation loaded the
    entire file with a single ``f.read()``).

    Args:
        file_path: Path to the file to hash.

    Returns:
        str: SHA256 digest as a lowercase hex string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # 64 KiB chunks: large enough to amortize syscall overhead,
        # small enough to keep memory usage flat for big files.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
31+
32+
33+
def extract_timestamp_from_filename(filename):
    """Pull the Unix timestamp out of an image filename.

    Filenames follow the pattern ``img_<timestamp>_<file_id>.ext``
    (e.g. ``img_1762860964_AgACAgIA.jpg``). Any name that does not
    match the pattern yields 0, which sorts such files first.
    """
    pieces = filename.split("_")
    # Must start with the "img" prefix and have a timestamp segment.
    if len(pieces) < 2 or pieces[0] != "img":
        return 0
    try:
        return int(pieces[1])
    except ValueError:
        # Second segment is not a bare integer (e.g. "img_123.jpg").
        return 0
43+
44+
45+
def find_duplicates(images_dir):
    """Group image files in *images_dir* by content hash.

    Args:
        images_dir: Directory to scan for ``img_*`` image files.

    Returns:
        dict: hash -> list of file paths, containing only hashes that
        occur on two or more files (i.e. actual duplicate sets).
    """
    buckets = defaultdict(list)

    print(f"Scanning directory: {images_dir}")

    # Scan every supported image extension separately.
    for ext in (".jpg", ".jpeg", ".png", ".gif", ".tiff", ".bmp", ".webp"):
        for candidate in images_dir.glob(f"img_*{ext}"):
            if not candidate.is_file():
                continue
            try:
                buckets[compute_file_hash(candidate)].append(candidate)
            except Exception as e:
                print(f"Warning: Error processing {candidate}: {e}", file=sys.stderr)

    # A hash with a single file is unique — keep only real duplicates.
    return {digest: paths for digest, paths in buckets.items() if len(paths) > 1}
70+
71+
72+
def cleanup_duplicates(duplicates, dry_run=False):
    """
    Remove duplicate files, keeping the oldest one.

    For each duplicate set, files are ordered by the timestamp embedded in
    their filename; the oldest is kept and the rest are deleted.

    Args:
        duplicates: Hash -> list of duplicate files
        dry_run: If True, don't actually delete files

    Returns:
        tuple: (total_files_removed, total_bytes_saved)
    """
    files_removed = 0
    bytes_saved = 0

    for file_hash, files in duplicates.items():
        # Sort by timestamp (oldest first)
        sorted_files = sorted(files, key=lambda f: extract_timestamp_from_filename(f.name))

        # Keep the first (oldest) file
        keep_file = sorted_files[0]
        remove_files = sorted_files[1:]

        print(f"\n🔍 Hash: {file_hash[:8]}...")
        print(f" ✅ KEEP: {keep_file.name}")

        for remove_file in remove_files:
            file_size = remove_file.stat().st_size

            if dry_run:
                bytes_saved += file_size
                print(f" 🗑️ WOULD REMOVE: {remove_file.name} ({file_size} bytes)")
            else:
                try:
                    remove_file.unlink()
                    print(f" ❌ REMOVED: {remove_file.name} ({file_size} bytes)")
                    files_removed += 1
                    # BUGFIX: count bytes only after a successful unlink;
                    # previously a failed deletion still inflated bytes_saved,
                    # making the summary over-report the space reclaimed.
                    bytes_saved += file_size
                except Exception as e:
                    print(f" ⚠️ ERROR removing {remove_file.name}: {e}", file=sys.stderr)

    return files_removed, bytes_saved
112+
113+
114+
def format_bytes(bytes_count):
    """Render a byte count as a human-readable size string (e.g. '1.50 KB')."""
    value = float(bytes_count)
    labels = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    # Scale down by 1024 until the value fits the unit, capping at TB.
    while value >= 1024.0 and idx < len(labels) - 1:
        value /= 1024.0
        idx += 1
    return f"{value:.2f} {labels[idx]}"
121+
122+
123+
def main():
    """CLI entry point: validate arguments, find duplicate images, clean up."""
    parser = argparse.ArgumentParser(
        description="Clean up duplicate images in knowledge base",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("images_dir", type=Path, help="Path to images directory")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be removed without actually removing files",
    )
    args = parser.parse_args()

    # Fail fast on a bad target directory.
    if not args.images_dir.exists():
        print(f"❌ Error: Directory does not exist: {args.images_dir}", file=sys.stderr)
        sys.exit(1)
    if not args.images_dir.is_dir():
        print(f"❌ Error: Not a directory: {args.images_dir}", file=sys.stderr)
        sys.exit(1)

    banner = "=" * 60

    print(banner)
    print("🔍 Scanning for duplicate images...")
    print(banner)

    duplicates = find_duplicates(args.images_dir)

    if not duplicates:
        print("\n✅ No duplicates found!")
        return

    # Every set keeps one file; the rest are removable.
    total_duplicate_files = sum(len(files) - 1 for files in duplicates.values())
    print(f"\n📊 Found {len(duplicates)} sets of duplicates")
    print(f"📊 Total duplicate files to remove: {total_duplicate_files}")

    print("\n" + banner)
    if args.dry_run:
        print("🔍 DRY RUN MODE - No files will be deleted")
    else:
        print("🗑️ CLEANUP MODE - Removing duplicates...")
    print(banner)

    files_removed, bytes_saved = cleanup_duplicates(duplicates, dry_run=args.dry_run)

    print("\n" + banner)
    print("📊 SUMMARY")
    print(banner)
    if args.dry_run:
        print(f"Would remove: {total_duplicate_files} files")
        print(f"Would save: {format_bytes(bytes_saved)}")
        print("\nRun without --dry-run to actually remove duplicates")
    else:
        print(f"✅ Removed: {files_removed} files")
        print(f"💾 Saved: {format_bytes(bytes_saved)}")
183+
184+
185+
# Script entry point: run the cleanup only when executed directly,
# so the module can be imported (e.g. by tests) without side effects.
if __name__ == "__main__":
    main()

src/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
# -*- coding: utf-8 -*-
12
"""
23
tg-note: Telegram bot для автоматического создания заметок в базе знаний
34
"""
45

5-
__version__ = "0.1.1"
6+
__version__ = "0.2.0"

src/processor/file_processor.py

Lines changed: 79 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55

66
import base64
7+
import hashlib
78
import tempfile
89
from pathlib import Path
910
from typing import Any, Dict, List, Optional, Set, Tuple
@@ -817,6 +818,49 @@ async def process_file(self, file_path: Path) -> Optional[Dict[str, Any]]:
817818
self.logger.warning("No Docling backend configured for processing.")
818819
return None
819820

821+
def _compute_file_hash(self, file_content: bytes) -> str:
822+
"""
823+
Compute SHA256 hash of file content.
824+
825+
Args:
826+
file_content: File content as bytes
827+
828+
Returns:
829+
SHA256 hash as hex string
830+
"""
831+
return hashlib.sha256(file_content).hexdigest()
832+
833+
def _find_existing_image_by_hash(
834+
self, file_hash: str, images_dir: Path, extension: str
835+
) -> Optional[Path]:
836+
"""
837+
Find existing image with same hash in images directory.
838+
839+
Args:
840+
file_hash: SHA256 hash of the file
841+
images_dir: Directory containing images
842+
extension: File extension (e.g., '.jpg')
843+
844+
Returns:
845+
Path to existing file if found, None otherwise
846+
"""
847+
# AICODE-NOTE: Check all images with the same extension
848+
pattern = f"img_*{extension}"
849+
for existing_file in images_dir.glob(pattern):
850+
if existing_file.is_file():
851+
try:
852+
with open(existing_file, "rb") as f:
853+
existing_hash = self._compute_file_hash(f.read())
854+
if existing_hash == file_hash:
855+
self.logger.info(
856+
f"Found duplicate image: {existing_file.name} (hash: {file_hash[:8]}...)"
857+
)
858+
return existing_file
859+
except Exception as e:
860+
self.logger.warning(f"Error checking file {existing_file}: {e}")
861+
continue
862+
return None
863+
820864
async def download_and_process_telegram_file(
821865
self,
822866
bot,
@@ -876,37 +920,56 @@ async def download_and_process_telegram_file(
876920
is_image = extension in ["jpg", "jpeg", "png", "gif", "tiff", "bmp", "webp"]
877921
save_to_kb = kb_images_dir is not None and is_image
878922

923+
# Download file first to check for duplicates
924+
downloaded_file = await bot.download_file(file_info.file_path)
925+
879926
if save_to_kb:
880927
# AICODE-NOTE: Save images to KB for later reference in markdown files
881-
# Generate unique filename using timestamp and file_id
882-
# Convert to absolute path to avoid file URI errors
928+
# Check for duplicates before saving
883929
kb_images_dir_abs = kb_images_dir.resolve()
884930
kb_images_dir_abs.mkdir(parents=True, exist_ok=True)
885931

886-
# Generate unique filename
887-
import time
932+
# Compute hash of downloaded file
933+
file_hash = self._compute_file_hash(downloaded_file)
888934

889-
timestamp = message_date or int(time.time())
890-
# Use first 8 chars of file_id as identifier (if available)
891-
file_suffix = f"_{file_id[:8]}" if file_id else ""
892-
unique_filename = f"img_{timestamp}{file_suffix}{file_extension}"
935+
# Check if this image already exists
936+
existing_file = self._find_existing_image_by_hash(
937+
file_hash, kb_images_dir_abs, file_extension
938+
)
893939

894-
save_path = kb_images_dir_abs / unique_filename
895-
self.logger.info(f"Downloading Telegram image to KB: {save_path}")
940+
if existing_file:
941+
# Use existing file instead of saving duplicate
942+
save_path = existing_file
943+
unique_filename = existing_file.name
944+
self.logger.info(f"Reusing existing image (duplicate detected): {save_path}")
945+
else:
946+
# Generate unique filename using timestamp and file_id
947+
import time
948+
949+
timestamp = message_date or int(time.time())
950+
# Use first 8 chars of file_id as identifier (if available)
951+
file_suffix = f"_{file_id[:8]}" if file_id else ""
952+
unique_filename = f"img_{timestamp}{file_suffix}{file_extension}"
953+
954+
save_path = kb_images_dir_abs / unique_filename
955+
self.logger.info(f"Saving new image to KB: {save_path}")
956+
957+
# Write file only if it's not a duplicate
958+
with open(save_path, "wb") as f:
959+
f.write(downloaded_file)
960+
961+
self.logger.info(f"File downloaded to: {save_path}")
896962
else:
897963
# Use temporary directory for non-images or when KB path not provided
898964
temp_dir = tempfile.mkdtemp(prefix="tg_note_file_")
899965
temp_filename = f"telegram_file{file_extension}"
900966
save_path = Path(temp_dir) / temp_filename
901967
self.logger.info(f"Downloading Telegram file to temp: {save_path}")
902968

903-
# Download file
904-
downloaded_file = await bot.download_file(file_info.file_path)
905-
906-
with open(save_path, "wb") as f:
907-
f.write(downloaded_file)
969+
with open(save_path, "wb") as f:
970+
f.write(downloaded_file)
908971

909-
self.logger.info(f"File downloaded to: {save_path}")
972+
self.logger.info(f"File downloaded to: {save_path}")
910973

911974
# AICODE-NOTE: Validate saved image path (for images saved to KB)
912975
if save_to_kb and is_image:

0 commit comments

Comments
 (0)