Fix 905 Added duplicates and best shot feature #967

base: main

Changes from all commits
@@ -0,0 +1,72 @@
from fastapi import APIRouter, HTTPException, Body
from typing import List, Dict, Any
from app.database.images import db_get_all_images_with_phash, db_delete_images_by_ids, db_get_images_by_ids
from app.utils.duplicate_detector import identify_best_shot, group_similar_images
from app.logging.setup_logging import get_logger
import os

router = APIRouter()
logger = get_logger(__name__)


@router.get("/", response_model=List[Dict[str, Any]])
async def get_duplicates():
    """
    Get groups of duplicate images.
    Returns a list of groups, where each group contains:
    - images: list of image objects
    - best_shot_id: ID of the best shot
    """
    try:
        # Get all images with a stored pHash
        all_images = db_get_all_images_with_phash()

        # Group similar images using Python logic (Hamming distance).
        # Threshold 5 allows for some edits/compression differences.
        groups = group_similar_images(all_images, threshold=5)

        result = []
        for group in groups:
            best_shot = identify_best_shot(group)
            result.append({
                "images": group,
                "best_shot_id": best_shot['id'] if best_shot else None
            })

        return result
    except Exception as e:
        logger.error(f"Error getting duplicates: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")


@router.post("/delete", response_model=Dict[str, int])
async def delete_duplicates(image_ids: List[str] = Body(...)):
    """
    Delete specified duplicate images from the DB and the filesystem.
    """
    try:
        # Get image paths before deleting from the DB
        images = db_get_images_by_ids(image_ids)

        # Delete from the DB
        if not db_delete_images_by_ids(image_ids):
            raise HTTPException(status_code=500, detail="Failed to delete images from database")

        # Delete from the filesystem
        deleted_files_count = 0
        for img in images:
            try:
                if os.path.exists(img['path']):
                    os.remove(img['path'])
                    deleted_files_count += 1

                # Also delete the thumbnail
                if img.get('thumbnailPath') and os.path.exists(img['thumbnailPath']):
                    os.remove(img['thumbnailPath'])
            except Exception as e:
                logger.error(f"Error deleting file {img['path']}: {e}")

        return {"deleted_count": len(image_ids), "deleted_files_count": deleted_files_count}

    except Exception as e:
        logger.error(f"Error in delete_duplicates: {e}")
        raise HTTPException(status_code=500, detail=str(e))
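As a quick way to exercise the new endpoints, here is a minimal client sketch (an editor's aside, not part of the PR). The /duplicates mount prefix, the localhost:8000 base URL, and the requests dependency are assumptions; adjust them to how the router is actually wired into the app.

```python
# Hypothetical smoke test for the new endpoints; base URL and mount prefix are assumptions.
import requests

BASE = "http://localhost:8000/duplicates"

# Fetch duplicate groups and their suggested best shots
groups = requests.get(f"{BASE}/").json()
for group in groups:
    ids = [img["id"] for img in group["images"]]
    print(f"group of {len(ids)} images, best shot: {group['best_shot_id']}")

# Delete everything in each group except the suggested best shot
to_delete = [
    img["id"]
    for group in groups
    for img in group["images"]
    if img["id"] != group["best_shot_id"]
]
if to_delete:
    resp = requests.post(f"{BASE}/delete", json=to_delete)
    print(resp.json())  # e.g. {"deleted_count": 3, "deleted_files_count": 3}
```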
@@ -0,0 +1,186 @@
import imagehash
from PIL import Image
import os
import json
import cv2
import numpy as np
from typing import List, Dict, Any, Optional
from app.logging.setup_logging import get_logger

logger = get_logger(__name__)

# Constants
# dHash is generally faster and better at gradients than pHash for burst shots
HASH_SIZE = 8
HASH_THRESHOLD = 8  # Strict threshold for hashing
MIN_MATCH_COUNT = 15  # Minimum shared keypoints to consider them the same scene


def get_image_sharpness(image_path: str) -> float:
    """
    Return a score representing the sharpness of an image (higher is better).
    Technique: Laplacian variance (edge detection).
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            return 0.0

        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Calculate Laplacian variance: blurry images have low variance
        # (few edges), sharp images have high variance.
        variance = cv2.Laplacian(gray, cv2.CV_64F).var()
        return variance
    except Exception as e:
        logger.error(f"Error calculating sharpness for {image_path}: {e}")
        return 0.0


def are_images_geometrically_similar(path1: str, path2: str) -> bool:
    """
    Use ORB (Oriented FAST and Rotated BRIEF) to detect whether two images
    show the same scene, even if the camera moved slightly or the angle changed.
    """
    try:
        img1 = cv2.imread(path1, 0)  # Read as grayscale
        img2 = cv2.imread(path2, 0)

        if img1 is None or img2 is None:
            return False

        # Initialize ORB detector
        orb = cv2.ORB_create(nfeatures=500)

        # Find keypoints and descriptors
        kp1, des1 = orb.detectAndCompute(img1, None)
        kp2, des2 = orb.detectAndCompute(img2, None)

        if des1 is None or des2 is None:
            return False

        # Match descriptors using BFMatcher (brute force) with Hamming distance
        bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches = bf.match(des1, des2)

        # Sort matches by distance (best matches first)
        matches = sorted(matches, key=lambda x: x.distance)

        # Keep only strong matches (descriptor distance below 50)
        good_matches = [m for m in matches if m.distance < 50]

        # If we have enough strong geometric matches, it's the same object/scene
        return len(good_matches) > MIN_MATCH_COUNT

    except Exception as e:
        logger.error(f"Error matching features between {path1} and {path2}: {e}")
        return False


def calculate_phash(image_path: str) -> Optional[str]:
    """
    Calculate the perceptual hash (pHash) for an image.
    """
    try:
        img = Image.open(image_path)
        # pHash is generally good for finding duplicates, including resized/compressed copies
        hash_obj = imagehash.phash(img)
        return str(hash_obj)
    except Exception as e:
        logger.error(f"Error calculating pHash for {image_path}: {e}")
        return None


def identify_best_shot(images: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Identify the best shot from a list of duplicate images.
    Heuristic: sharpness (Laplacian variance), then file size.
    """
    if not images:
        return None

    # Calculate sharpness for any image that does not already have a score
    for img in images:
        if 'sharpness_score' not in img:
            img['sharpness_score'] = get_image_sharpness(img['path'])

    # Pick the best image.
    # Heuristic: sharpness is king.
    # Tie-breaker: file size (larger usually means more detail when sharpness is equal).
    try:
        best_image = max(
            images,
            key=lambda x: (
                x.get('sharpness_score', 0),
                os.path.getsize(x['path']) if os.path.exists(x['path']) else 0,
            ),
        )
        return best_image
    except Exception as e:
        logger.error(f"Error identifying best shot: {e}")
        return images[0] if images else None


def group_similar_images(images: List[Dict[str, Any]], threshold: int = HASH_THRESHOLD) -> List[List[Dict[str, Any]]]:
    """
    Group images by visual hash (dHash) and verify the groups with ORB.
    """
    clusters = []

    # Pre-compute dHash (difference hash) and sharpness for each image.
    # The pHash stored in the DB is only used as an upstream filter; dHash is
    # recomputed on the fly here because it handles burst-shot gradients better.
    processed_images = []
    for img in images:
        path = img.get('path')
        if not path or not os.path.exists(path):
            continue

        try:
            pil_img = Image.open(path)
            dhash = imagehash.dhash(pil_img, hash_size=HASH_SIZE)
            img['hash_obj'] = dhash
            # Compute sharpness now to save time later
            img['sharpness_score'] = get_image_sharpness(path)
            processed_images.append(img)
        except Exception as e:
            logger.warning(f"Error processing image {path}: {e}")

    # Sort by sharpness so the "best" image usually becomes the cluster representative
    processed_images.sort(key=lambda x: x.get('sharpness_score', 0), reverse=True)

    # Greedy clustering
    for img in processed_images:
        found_cluster = False
        img_hash = img['hash_obj']

        for cluster in clusters:
            if not cluster:
                continue

            rep_img = cluster[0]
            rep_hash = rep_img['hash_obj']

            dist = img_hash - rep_hash

            # Fast check: Hamming distance on the dHashes
            if dist <= threshold:
                # Secondary check: ORB verification of geometric similarity
                if are_images_geometrically_similar(img['path'], rep_img['path']):
                    cluster.append(img)
                    found_cluster = True
                    break

        if not found_cluster:
            clusters.append([img])

    # Filter out single-image clusters (no duplicates)
    duplicate_groups = [cluster for cluster in clusters if len(cluster) > 1]

    # Remove temporary hash objects before returning
    for group in duplicate_groups:
        for img in group:
            img.pop('hash_obj', None)

    return duplicate_groups
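To make the two heuristics above concrete (dHash Hamming distance for grouping, Laplacian variance for picking the best shot), here is a small standalone sketch. The file names are placeholders, not files from this repository.

```python
# Standalone sketch of the two heuristics used by the detector; file names are placeholders.
import cv2
import imagehash
from PIL import Image


def laplacian_sharpness(path: str) -> float:
    """Higher Laplacian variance means more edges, i.e. a sharper image."""
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    return 0.0 if gray is None else float(cv2.Laplacian(gray, cv2.CV_64F).var())


a, b = "burst_001.jpg", "burst_002.jpg"  # any two similar photos will do

# Subtracting two ImageHash objects yields the Hamming distance
# (number of differing bits out of hash_size * hash_size = 64).
dist = imagehash.dhash(Image.open(a), hash_size=8) - imagehash.dhash(Image.open(b), hash_size=8)
print(f"dHash distance: {dist} (<= 8 would pass the HASH_THRESHOLD check)")

print(f"sharpness: {a}={laplacian_sharpness(a):.1f}, {b}={laplacian_sharpness(b):.1f}")
```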
@@ -19,6 +19,7 @@
from app.models.FaceDetector import FaceDetector
from app.models.ObjectClassifier import ObjectClassifier
from app.logging.setup_logging import get_logger
from app.utils.duplicate_detector import calculate_phash

logger = get_logger(__name__)
Comment on lines +22 to 24

Contributor

Duplicate logger definition detected. Line 24 defines logger = get_logger(__name__), but line 30 re-assigns logger = logging.getLogger(__name__), overriding the configured logger.

🔎 Proposed fix

Remove the duplicate definition at line 30:

 logger = get_logger(__name__)
 # GPS EXIF tag constant
 GPS_INFO_TAG = 34853
-logger = logging.getLogger(__name__)
-
@@ -165,6 +166,7 @@ def image_util_prepare_image_records(
        # Generate thumbnail
        if image_util_generate_thumbnail(image_path, thumbnail_path):
            metadata = image_util_extract_metadata(image_path)
            phash = calculate_phash(image_path)
            logger.debug(f"Extracted metadata for {image_path}: {metadata}")
            image_records.append(
                {
@@ -174,6 +176,7 @@ def image_util_prepare_image_records(
                    "thumbnailPath": thumbnail_path,
                    "metadata": json.dumps(metadata),
                    "isTagged": False,
                    "phash": phash,
                }
            )
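For context, each prepared image record now carries the hash alongside the existing fields. Roughly this shape, with made-up values; only the keys visible in this diff plus the path field used elsewhere are shown.

```python
# Illustrative record shape after this change; the values are invented.
record = {
    "path": "/photos/IMG_0001.jpg",
    "thumbnailPath": "/thumbnails/IMG_0001.jpg",
    "metadata": '{"width": 4032, "height": 3024}',
    "isTagged": False,
    "phash": "83c3d0c9962e9f17",  # str(imagehash.phash(...)): 16 hex chars = 64 bits
}
```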
@@ -71,3 +71,4 @@ ruff>=0.0.241
psutil>=5.9.5
pytest-asyncio>=1.0.0
setuptools==66.1.1
ImageHash==4.3.1
Contributor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check ImageHash package version information and security advisories
echo "=== Checking available ImageHash versions (last 10) ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases | keys[]' | sort -V | tail -10
echo -e "\n=== Latest version ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.info.version'
echo -e "\n=== Release date of version 4.3.1 ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.releases' 2>/dev/null || curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases."4.3.1"[0].upload_time' 2>/dev/null
echo -e "\n=== Check if version 4.3.1 exists ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.info.version' 2>/dev/null

🌐 Web query:

💡 Result: There are no public CVEs that specifically name the Python ImageHash package. Snyk's vulnerability DB lists no direct vulnerabilities for ImageHash, and PyPI shows the project but no CVE entries. [1][2]

🌐 Web query:

💡 Result: The ImageHash project itself has no published security advisories; its GitHub security page shows no advisories and no SECURITY.md. However, ImageHash depends on image libraries (notably Pillow), which have had several CVEs (e.g., CVE-2022-22817, CVE-2021-28675). You should track Pillow (and other imaging dependencies) for security fixes and keep ImageHash and its dependencies up to date. [1][2][3][4]

Update ImageHash to version 4.3.2 (latest available). Version 4.3.1 is valid and free of known vulnerabilities, but version 4.3.2 is now available. ImageHash itself has no published CVEs; however, its dependency on Pillow means security updates to image libraries should be monitored and kept current.
Avoid exposing raw exception details in the API response.

Using str(e) in the error detail could expose internal paths, database errors, or other sensitive information to API consumers.

🔎 Proposed fix

  except Exception as e:
      logger.error(f"Error in delete_duplicates: {e}")
-     raise HTTPException(status_code=500, detail=str(e))
+     raise HTTPException(status_code=500, detail="Failed to delete images")
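A related subtlety worth noting alongside this fix (an editor's aside, not part of the review): HTTPException is itself a subclass of Exception, so the endpoint's broad except also catches the deliberate "Failed to delete images from database" error and re-wraps it. A minimal sketch of the combined pattern, with hypothetical names, assuming the same deletion logic as the endpoint body:

```python
# Sketch of the safer error-handling pattern; names and wiring are assumptions, not the PR's code.
import logging

from fastapi import HTTPException

logger = logging.getLogger(__name__)


async def delete_duplicates_safe(image_ids: list[str]) -> dict:
    try:
        # ... DB and filesystem deletion logic from the endpoint goes here ...
        return {"deleted_count": len(image_ids)}
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the DB-failure 500) propagate unchanged
        raise
    except Exception:
        # Log the specifics server-side; return only a generic message to the client
        logger.exception("Error in delete_duplicates")
        raise HTTPException(status_code=500, detail="Failed to delete images")
```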