diff --git a/backend/app/database/images.py b/backend/app/database/images.py
index ec9541a56..7722a6416 100644
--- a/backend/app/database/images.py
+++ b/backend/app/database/images.py
@@ -27,6 +27,7 @@ class ImageRecord(TypedDict):
     thumbnailPath: str
     metadata: Union[Mapping[str, Any], str]
     isTagged: bool
+    phash: Union[str, None]
 
 
 class UntaggedImageRecord(TypedDict):
@@ -64,6 +65,7 @@ def db_create_images_table() -> None:
             metadata TEXT,
             isTagged BOOLEAN DEFAULT 0,
             isFavourite BOOLEAN DEFAULT 0,
+            phash TEXT,
             FOREIGN KEY (folder_id) REFERENCES folders(folder_id) ON DELETE CASCADE
         )
     """
@@ -97,12 +99,13 @@ def db_bulk_insert_images(image_records: List[ImageRecord]) -> bool:
     try:
         cursor.executemany(
             """
-            INSERT INTO images (id, path, folder_id, thumbnailPath, metadata, isTagged)
-            VALUES (:id, :path, :folder_id, :thumbnailPath, :metadata, :isTagged)
+            INSERT INTO images (id, path, folder_id, thumbnailPath, metadata, isTagged, phash)
+            VALUES (:id, :path, :folder_id, :thumbnailPath, :metadata, :isTagged, :phash)
             ON CONFLICT(path) DO UPDATE SET
                 folder_id=excluded.folder_id,
                 thumbnailPath=excluded.thumbnailPath,
                 metadata=excluded.metadata,
+                phash=excluded.phash,
                 isTagged=CASE
                     WHEN excluded.isTagged THEN 1
                     ELSE images.isTagged
@@ -417,5 +420,62 @@ def db_toggle_image_favourite_status(image_id: str) -> bool:
         logger.error(f"Database error: {e}")
         conn.rollback()
         return False
     finally:
         conn.close()
+
+
+def db_get_all_images_with_phash() -> List[dict]:
+    """
+    Get all images that have a phash.
+    """
+    conn = _connect()
+    cursor = conn.cursor()
+
+    try:
+        cursor.execute("""
+            SELECT id, path, folder_id, thumbnailPath, metadata, isTagged, isFavourite, phash
+            FROM images
+            WHERE phash IS NOT NULL
+        """)
+
+        images = []
+        for row in cursor.fetchall():
+            images.append({
+                "id": row[0],
+                "path": row[1],
+                "folder_id": row[2],
+                "thumbnailPath": row[3],
+                "metadata": row[4],
+                "isTagged": bool(row[5]),
+                "isFavourite": bool(row[6]),
+                "phash": row[7],
+            })
+
+        return images
+
+    except Exception as e:
+        logger.error(f"Error getting images with phash: {e}")
+        return []
+    finally:
+        conn.close()
+
+
+def db_get_images_by_ids(image_ids: List[str]) -> List[dict]:
+    if not image_ids:
+        return []
+    conn = _connect()
+    cursor = conn.cursor()
+    try:
+        # One "?" placeholder per id; very large selections could exceed
+        # SQLite's host-parameter limit and would need chunking.
+        placeholders = ','.join(['?'] * len(image_ids))
+        cursor.execute(
+            f"SELECT id, path, thumbnailPath FROM images WHERE id IN ({placeholders})",
+            image_ids,
+        )
+        return [{"id": row[0], "path": row[1], "thumbnailPath": row[2]} for row in cursor.fetchall()]
+    except Exception as e:
+        logger.error(f"Error getting images by ids: {e}")
+        return []
+    finally:
+        conn.close()
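
The new phash column above is only added to the CREATE TABLE statement, so it exists only in freshly created databases. A minimal startup-migration sketch for existing installs, assuming the same _connect() helper and the stdlib sqlite3 driver (the function name is hypothetical):

    def db_migrate_add_phash_column() -> None:
        """Add the phash column to a pre-existing database (no-op if already present)."""
        conn = _connect()
        try:
            # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk)
            columns = [row[1] for row in conn.execute("PRAGMA table_info(images)")]
            if "phash" not in columns:
                conn.execute("ALTER TABLE images ADD COLUMN phash TEXT")
                conn.commit()
        finally:
            conn.close()
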
diff --git a/backend/app/routes/duplicates.py b/backend/app/routes/duplicates.py
new file mode 100644
index 000000000..eafe2390f
--- /dev/null
+++ b/backend/app/routes/duplicates.py
@@ -0,0 +1,77 @@
+from fastapi import APIRouter, HTTPException, Body
+from typing import List, Dict, Any
+from app.database.images import db_get_all_images_with_phash, db_delete_images_by_ids, db_get_images_by_ids
+from app.utils.duplicate_detector import identify_best_shot, group_similar_images
+from app.logging.setup_logging import get_logger
+import os
+
+router = APIRouter()
+logger = get_logger(__name__)
+
+
+@router.get("/", response_model=List[Dict[str, Any]])
+async def get_duplicates():
+    """
+    Get groups of duplicate images.
+    Returns a list of groups, where each group contains:
+    - images: List of image objects
+    - best_shot_id: ID of the best shot
+    """
+    try:
+        # Get all images with a stored pHash
+        all_images = db_get_all_images_with_phash()
+
+        # Group similar images by Hamming distance on perceptual hashes;
+        # threshold 5 tolerates light edits and recompression
+        groups = group_similar_images(all_images, threshold=5)
+
+        result = []
+
+        for group in groups:
+            best_shot = identify_best_shot(group)
+            result.append({
+                "images": group,
+                "best_shot_id": best_shot['id'] if best_shot else None
+            })
+
+        return result
+    except Exception as e:
+        logger.error(f"Error getting duplicates: {e}")
+        raise HTTPException(status_code=500, detail="Internal server error")
+
+
+@router.post("/delete", response_model=Dict[str, int])
+async def delete_duplicates(image_ids: List[str] = Body(...)):
+    """
+    Delete specified duplicate images from DB and filesystem.
+    """
+    try:
+        # Capture image paths before the rows are removed from the DB
+        images = db_get_images_by_ids(image_ids)
+
+        # Delete from DB
+        if not db_delete_images_by_ids(image_ids):
+            raise HTTPException(status_code=500, detail="Failed to delete images from database")
+
+        # Delete from filesystem
+        deleted_files_count = 0
+        for img in images:
+            try:
+                if os.path.exists(img['path']):
+                    os.remove(img['path'])
+                    deleted_files_count += 1
+
+                # Also delete the thumbnail
+                if img.get('thumbnailPath') and os.path.exists(img['thumbnailPath']):
+                    os.remove(img['thumbnailPath'])
+            except Exception as e:
+                logger.error(f"Error deleting file {img['path']}: {e}")
+
+        return {"deleted_count": len(image_ids), "deleted_files_count": deleted_files_count}
+
+    except HTTPException:
+        # Let deliberate HTTP errors pass through instead of wrapping them in a 500
+        raise
+    except Exception as e:
+        logger.error(f"Error in delete_duplicates: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
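
For reference, a client-side sketch of the two new endpoints: fetch the groups, keep each group's best shot, delete the rest. The base URL is an assumption (whatever host and port main.py binds):

    import requests

    BASE = "http://localhost:8000"  # assumed dev address

    groups = requests.get(f"{BASE}/duplicates/").json()
    for g in groups:
        losers = [img["id"] for img in g["images"] if img["id"] != g["best_shot_id"]]
        if losers:
            # The request body is a bare JSON array of image ids, matching Body(...) above
            resp = requests.post(f"{BASE}/duplicates/delete", json=losers)
            print(resp.json())  # {"deleted_count": ..., "deleted_files_count": ...}
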
+ """ + try: + img1 = cv2.imread(path1, 0) # Read as grayscale + img2 = cv2.imread(path2, 0) + + if img1 is None or img2 is None: + return False + + # Initialize ORB detector + orb = cv2.ORB_create(nfeatures=500) + + # Find keypoints and descriptors + kp1, des1 = orb.detectAndCompute(img1, None) + kp2, des2 = orb.detectAndCompute(img2, None) + + if des1 is None or des2 is None: + return False + + # Match descriptors using BFMatcher (Brute Force) with Hamming distance + bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True) + matches = bf.match(des1, des2) + + # Sort matches by distance (best matches first) + matches = sorted(matches, key=lambda x: x.distance) + + # Take top 15% of matches or top 50 matches + good_matches = [m for m in matches if m.distance < 50] + + # If we have enough strong geometrical matches, it's the same object/scene + return len(good_matches) > MIN_MATCH_COUNT + + except Exception as e: + logger.error(f"Error matching features between {path1} and {path2}: {e}") + return False + + +def calculate_phash(image_path: str) -> Optional[str]: + """ + Calculate perceptual hash for an image. + """ + try: + img = Image.open(image_path) + # phash is generally good for finding duplicates including resized/compressed ones + hash_obj = imagehash.phash(img) + return str(hash_obj) + except Exception as e: + logger.error(f"Error calculating pHash for {image_path}: {e}") + return None + +def identify_best_shot(images: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + """ + Identify the best shot from a list of duplicate images. + Heuristic: Sharpness (Laplacian Variance), then File Size. + """ + if not images: + return None + + # Calculate sharpness for all if not already calculated + for img in images: + if 'sharpness_score' not in img: + img['sharpness_score'] = get_image_sharpness(img['path']) + + # Pick best image + # Heuristic: Sharpness is king. + # Tie-breaker: File size (higher usually means more color data if sharpness is equal) + try: + best_image = max(images, key=lambda x: (x.get('sharpness_score', 0), os.path.getsize(x['path']) if os.path.exists(x['path']) else 0)) + return best_image + except Exception as e: + logger.error(f"Error identifying best shot: {e}") + return images[0] if images else None + +def group_similar_images(images: List[Dict[str, Any]], threshold: int = HASH_THRESHOLD) -> List[List[Dict[str, Any]]]: + """ + Groups images by Visual Hash (dHash) and verifies with ORB. + """ + clusters = [] + + # Pre-compute dHash (Difference Hash) instead of pHash + processed_images = [] + for img in images: + if img.get('phash'): # We are technically using the pHash from DB if available, or calculating on fly + # If we want to switch to dHash strictly we might need to re-compute. + # For now let's reuse the stored hash as a first pass filter if possible, + # OR strictly compute dHash now for better burst mode detection. + # Given the context, let's calculate dHash on fly for high accuracy as requested. 
diff --git a/backend/app/utils/images.py b/backend/app/utils/images.py
index c3b202205..fbf0358da 100644
--- a/backend/app/utils/images.py
+++ b/backend/app/utils/images.py
@@ -19,6 +19,7 @@ from app.models.FaceDetector import FaceDetector
 from app.models.ObjectClassifier import ObjectClassifier
 from app.logging.setup_logging import get_logger
+from app.utils.duplicate_detector import calculate_phash
 
 logger = get_logger(__name__)
 
@@ -165,6 +166,7 @@ def image_util_prepare_image_records(
         # Generate thumbnail
         if image_util_generate_thumbnail(image_path, thumbnail_path):
             metadata = image_util_extract_metadata(image_path)
+            phash = calculate_phash(image_path)
             logger.debug(f"Extracted metadata for {image_path}: {metadata}")
             image_records.append(
                 {
@@ -174,6 +176,7 @@ def image_util_prepare_image_records(
                     "thumbnailPath": thumbnail_path,
                     "metadata": json.dumps(metadata),
                     "isTagged": False,
+                    "phash": phash,
                 }
             )
diff --git a/backend/main.py b/backend/main.py
index 4e388d454..3ba795039 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -25,6 +25,7 @@ from app.routes.images import router as images_router
 from app.routes.face_clusters import router as face_clusters_router
 from app.routes.user_preferences import router as user_preferences_router
+from app.routes.duplicates import router as duplicates_router
 from fastapi.openapi.utils import get_openapi
 from app.logging.setup_logging import (
     configure_uvicorn_logging,
@@ -130,7 +131,7 @@ async def root():
 app.include_router(
     user_preferences_router, prefix="/user-preferences", tags=["User Preferences"]
 )
-
+app.include_router(duplicates_router, prefix="/duplicates", tags=["Duplicates"])
 
 # Entry point for running with: python3 main.py
 if __name__ == "__main__":
diff --git a/backend/requirements.txt b/backend/requirements.txt
index b848d7ad6..7d391b769 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -71,3 +71,4 @@ ruff>=0.0.241
 psutil>=5.9.5
 pytest-asyncio>=1.0.0
 setuptools==66.1.1
+ImageHash==4.3.1
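
Note that images indexed before this change have phash = NULL and never reach db_get_all_images_with_phash, so they stay invisible to the duplicates view until re-indexed. A one-off backfill sketch, assuming direct access to the SQLite database file (the path argument is a placeholder):

    import sqlite3

    from app.utils.duplicate_detector import calculate_phash

    def backfill_phashes(db_path: str) -> None:
        """Compute and store a pHash for every image row that lacks one."""
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute("SELECT id, path FROM images WHERE phash IS NULL").fetchall()
            for image_id, path in rows:
                phash = calculate_phash(path)  # returns None for unreadable files
                if phash:
                    conn.execute("UPDATE images SET phash = ? WHERE id = ?", (phash, image_id))
            conn.commit()
        finally:
            conn.close()
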
diff --git a/docs/backend/backend_python/openapi.json b/docs/backend/backend_python/openapi.json
index fbf40091b..287b17d9a 100644
--- a/docs/backend/backend_python/openapi.json
+++ b/docs/backend/backend_python/openapi.json
@@ -1304,6 +1304,82 @@
           }
         }
       }
+    },
+    "/duplicates/": {
+      "get": {
+        "tags": [
+          "Duplicates"
+        ],
+        "summary": "Get Duplicates",
+        "description": "Get groups of duplicate images.\nReturns a list of groups, where each group contains:\n- images: List of image objects\n- best_shot_id: ID of the best shot",
+        "operationId": "get_duplicates_duplicates__get",
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "items": {
+                    "type": "object"
+                  },
+                  "type": "array",
+                  "title": "Response Get Duplicates Duplicates Get"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/duplicates/delete": {
+      "post": {
+        "tags": [
+          "Duplicates"
+        ],
+        "summary": "Delete Duplicates",
+        "description": "Delete specified duplicate images from DB and filesystem.",
+        "operationId": "delete_duplicates_duplicates_delete_post",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "items": {
+                  "type": "string"
+                },
+                "type": "array",
+                "title": "Image Ids"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "additionalProperties": {
+                    "type": "integer"
+                  },
+                  "type": "object",
+                  "title": "Response Delete Duplicates Duplicates Delete Post"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Validation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/HTTPValidationError"
+                }
+              }
+            }
+          }
+        }
+      }
     }
   },
   "components": {
diff --git a/frontend/src/api/apiEndpoints.ts b/frontend/src/api/apiEndpoints.ts
index 69a7e570d..5ec58be13 100644
--- a/frontend/src/api/apiEndpoints.ts
+++ b/frontend/src/api/apiEndpoints.ts
@@ -30,3 +30,8 @@ export const userPreferencesEndpoints = {
 export const healthEndpoints = {
   healthCheck: '/health',
 };
+
+export const duplicatesEndpoints = {
+  getDuplicates: '/duplicates/',
+  deleteDuplicates: '/duplicates/delete',
+};
diff --git a/frontend/src/components/Navigation/Sidebar/AppSidebar.tsx b/frontend/src/components/Navigation/Sidebar/AppSidebar.tsx
index ec018ec12..83691f6e6 100644
--- a/frontend/src/components/Navigation/Sidebar/AppSidebar.tsx
+++ b/frontend/src/components/Navigation/Sidebar/AppSidebar.tsx
@@ -16,6 +16,7 @@ import {
   Video,
   BookImage,
   ClockFading,
+  Copy,
 } from 'lucide-react';
 import { useLocation, Link } from 'react-router';
 import { ROUTES } from '@/constants/routes';
@@ -52,6 +53,7 @@ export function AppSidebar() {
     { name: 'Videos', path: `/${ROUTES.VIDEOS}`, icon: Video },
     { name: 'Albums', path: `/${ROUTES.ALBUMS}`, icon: BookImage },
     { name: 'Memories', path: `/${ROUTES.MEMORIES}`, icon: ClockFading },
+    { name: 'Duplicates', path: `/${ROUTES.DUPLICATES}`, icon: Copy },
     { name: 'Settings', path: `/${ROUTES.SETTINGS}`, icon: Bolt },
   ];
diff --git a/frontend/src/constants/routes.ts b/frontend/src/constants/routes.ts
index 7a8da5bb5..987649f1e 100644
--- a/frontend/src/constants/routes.ts
+++ b/frontend/src/constants/routes.ts
@@ -9,4 +9,5 @@ export const ROUTES = {
   ALBUMS: 'albums',
   MEMORIES: 'memories',
   PERSON: 'person/:clusterId',
+  DUPLICATES: 'duplicates',
 };
diff --git a/frontend/src/pages/DuplicatePage/DuplicatePage.tsx b/frontend/src/pages/DuplicatePage/DuplicatePage.tsx
new file mode 100644
index 000000000..ce079d4fa
--- /dev/null
+++ b/frontend/src/pages/DuplicatePage/DuplicatePage.tsx
@@ -0,0 +1,259 @@
+import React, { useEffect, useState } from 'react';
+import { duplicatesEndpoints } from '@/api/apiEndpoints';
+import { apiClient as axiosInstance } from '@/api/axiosConfig';
+import { convertFileSrc } from '@tauri-apps/api/core';
+import { ask } from '@tauri-apps/plugin-dialog';
+import { Trash2, CheckCircle, X, ZoomIn, ChevronLeft, ChevronRight } from 'lucide-react';
+
+interface Image {
+  id: string;
+  path: string;
+  thumbnailPath: string;
+  phash: string;
+}
+
+interface DuplicateGroup {
+  images: Image[];
+  best_shot_id: string | null;
+}
+
+export const DuplicatePage: React.FC = () => {
+  const [groups, setGroups] = useState<DuplicateGroup[]>([]);
+  const [loading, setLoading] = useState(true);
+  const [selectedIds, setSelectedIds] = useState<Set<string>>(new Set());
+  const [viewingImage, setViewingImage] = useState<Image | null>(null);
+
+  useEffect(() => {
+    fetchDuplicates();
+  }, []);
+
+  const fetchDuplicates = async () => {
+    try {
+      const response = await axiosInstance.get(duplicatesEndpoints.getDuplicates);
+      setGroups(response.data);
+
+      // Pre-select duplicates (all except best shot)
+      const initialSelection = new Set<string>();
+      response.data.forEach((group: DuplicateGroup) => {
+        group.images.forEach((img) => {
+          if (img.id !== group.best_shot_id) {
+            initialSelection.add(img.id);
+          }
+        });
+      });
+      setSelectedIds(initialSelection);
+    } catch (error) {
+      console.error('Error fetching duplicates:', error);
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  const toggleSelection = (id: string) => {
+    const newSelection = new Set(selectedIds);
+    if (newSelection.has(id)) {
+      newSelection.delete(id);
+    } else {
+      newSelection.add(id);
+    }
+    setSelectedIds(newSelection);
+  };
+
+  const handleDelete = async () => {
+    if (selectedIds.size === 0) return;
+    const confirmed = await ask(`Are you sure you want to delete ${selectedIds.size} images?`, {
+      title: 'Confirm Deletion',
+      kind: 'warning',
+    });
+
+    if (!confirmed) return;
+
+    try {
+      await axiosInstance.post(duplicatesEndpoints.deleteDuplicates, Array.from(selectedIds));
+      // Refresh
+      fetchDuplicates();
+    } catch (error) {
+      console.error('Error deleting duplicates:', error);
+    }
+  };
+
+  const navigateImage = (direction: 'prev' | 'next') => {
+    if (!viewingImage) return;
+    for (const group of groups) {
+      const idx = group.images.findIndex((img) => img.id === viewingImage.id);
+      if (idx !== -1) {
+        let newIdx;
+        if (direction === 'prev') {
+          newIdx = idx === 0 ? group.images.length - 1 : idx - 1;
+        } else {
+          newIdx = (idx + 1) % group.images.length;
+        }
+        setViewingImage(group.images[newIdx]);
+        return;
+      }
+    }
+  };
+
+  useEffect(() => {
+    const handleKeyDown = (e: KeyboardEvent) => {
+      if (!viewingImage) return;
+
+      switch (e.key) {
+        case 'ArrowLeft':
+          navigateImage('prev');
+          break;
+        case 'ArrowRight':
+          navigateImage('next');
+          break;
+        case 'Escape':
+          setViewingImage(null);
+          break;
+      }
+    };
+
+    window.addEventListener('keydown', handleKeyDown);
+    return () => window.removeEventListener('keydown', handleKeyDown);
+  }, [viewingImage, groups]);
+
+  const handlePrev = (e: React.MouseEvent) => {
+    e.stopPropagation();
+    navigateImage('prev');
+  };
+
+  const handleNext = (e: React.MouseEvent) => {
+    e.stopPropagation();
+    navigateImage('next');
+  };
+
+  if (loading) return <div className="flex h-full items-center justify-center">Loading duplicates...</div>;
+
+  if (groups.length === 0) {
+    return (
+      <div className="flex h-full flex-col items-center justify-center gap-2">
+        <h2 className="text-xl font-semibold">No Duplicates Found</h2>
+        <p className="text-muted-foreground">
+          Great! Your gallery seems to be free of duplicate photos.
+        </p>
+      </div>
+    );
+  }
+
+  return (
+    <div className="p-4">
+      <div className="mb-4 flex items-center justify-between">
+        <h1 className="text-2xl font-bold">Duplicate Photos</h1>
+        <button
+          onClick={handleDelete}
+          disabled={selectedIds.size === 0}
+          className="flex items-center gap-2 rounded bg-red-600 px-4 py-2 text-white disabled:opacity-50"
+        >
+          <Trash2 className="h-4 w-4" />
+          Delete Selected ({selectedIds.size})
+        </button>
+      </div>
+
+      {groups.map((group, index) => (
+        <div key={index} className="mb-8">
+          <div className="mb-2 text-sm text-muted-foreground">
+            Group {index + 1} • {group.images.length} images
+          </div>
+          <div className="grid grid-cols-2 gap-4 sm:grid-cols-3 md:grid-cols-4 lg:grid-cols-6">
+            {group.images.map((img) => {
+              const isBest = img.id === group.best_shot_id;
+              const isSelected = selectedIds.has(img.id);
+
+              return (
+                <div key={img.id} className="relative">
+                  <div
+                    className="group cursor-pointer overflow-hidden rounded border"
+                    onClick={() => setViewingImage(img)}
+                  >
+                    <img
+                      src={convertFileSrc(img.thumbnailPath)}
+                      alt="Duplicate"
+                      className="aspect-square w-full object-cover"
+                    />
+                    <div className="absolute inset-0 hidden items-center justify-center bg-black/30 group-hover:flex">
+                      <ZoomIn className="h-6 w-6 text-white" />
+                    </div>
+                  </div>
+
+                  {isBest && (
+                    <div className="absolute left-1 top-1 flex items-center gap-1 rounded bg-green-600 px-1.5 py-0.5 text-xs text-white">
+                      <CheckCircle className="h-3 w-3" />
+                      Best Shot
+                    </div>
+                  )}
+
+                  <div className="absolute right-1 top-1">
+                    <input
+                      type="checkbox"
+                      checked={isSelected}
+                      onChange={() => toggleSelection(img.id)}
+                      className="w-5 h-5 rounded border-gray-300 text-red-600 focus:ring-red-500"
+                    />
+                  </div>
+
+                  <div className="mt-1 truncate text-xs text-muted-foreground">
+                    {img.path.split('/').pop()}
+                  </div>
+                </div>
+              );
+            })}
+          </div>
+        </div>
+      ))}
+
+      {/* Image Preview Modal */}
+      {viewingImage && (
+        <div
+          className="fixed inset-0 z-50 flex items-center justify-center bg-black/80"
+          onClick={() => setViewingImage(null)}
+        >
+          <div className="relative max-h-[90vh] max-w-[90vw]" onClick={(e) => e.stopPropagation()}>
+            <div className="absolute left-2 top-2 z-10 rounded bg-black/60 px-2 py-1 text-sm text-white">
+              {(() => {
+                const group = groups.find((g) => g.images.some((i) => i.id === viewingImage.id));
+                if (!group) return '';
+                const index = group.images.findIndex((i) => i.id === viewingImage.id) + 1;
+                return `${index} / ${group.images.length}`;
+              })()}
+            </div>
+
+            {/* Navigation Buttons */}
+            <button
+              onClick={handlePrev}
+              className="absolute left-2 top-1/2 -translate-y-1/2 rounded-full bg-black/60 p-2 text-white"
+            >
+              <ChevronLeft className="h-6 w-6" />
+            </button>
+            <button
+              onClick={handleNext}
+              className="absolute right-2 top-1/2 -translate-y-1/2 rounded-full bg-black/60 p-2 text-white"
+            >
+              <ChevronRight className="h-6 w-6" />
+            </button>
+
+            <img
+              src={convertFileSrc(viewingImage.path)}
+              alt="Preview"
+              className="max-h-[90vh] max-w-[90vw] object-contain"
+            />
+
+            <div className="absolute bottom-2 left-2 right-2 rounded bg-black/60 px-2 py-1 text-white">
+              <div className="truncate text-sm">{viewingImage.path.split('/').pop()}</div>
+              <div className="truncate text-xs text-white/70">{viewingImage.path}</div>
+            </div>
+
+            <button
+              onClick={() => setViewingImage(null)}
+              className="absolute right-2 top-2 rounded-full bg-black/60 p-2 text-white"
+            >
+              <X className="h-5 w-5" />
+            </button>
+          </div>
+        </div>
+      )}
+    </div>
+  );
+};
diff --git a/frontend/src/routes/AppRoutes.tsx b/frontend/src/routes/AppRoutes.tsx
index 22153edbb..54d3f4ffa 100644
--- a/frontend/src/routes/AppRoutes.tsx
+++ b/frontend/src/routes/AppRoutes.tsx
@@ -9,6 +9,7 @@ import { MyFav } from '@/pages/Home/MyFav';
 import { AITagging } from '@/pages/AITagging/AITagging';
 import { PersonImages } from '@/pages/PersonImages/PersonImages';
 import { ComingSoon } from '@/pages/ComingSoon/ComingSoon';
+import { DuplicatePage } from '@/pages/DuplicatePage/DuplicatePage';
 
 export const AppRoutes: React.FC = () => {
   return (
@@ -23,6 +24,7 @@ export const AppRoutes: React.FC = () => {
       <Route path={ROUTES.ALBUMS} element={<ComingSoon />} />
       <Route path={ROUTES.MEMORIES} element={<ComingSoon />} />
       <Route path={ROUTES.PERSON} element={<PersonImages />} />
+      <Route path={ROUTES.DUPLICATES} element={<DuplicatePage />} />
     </Routes>
   );
 };