Fix 905 Added duplicates and best shot feature #967

base: main

Changes from all commits
@@ -0,0 +1,72 @@
from fastapi import APIRouter, HTTPException, Body
from typing import List, Dict, Any
from app.database.images import db_get_all_images_with_phash, db_delete_images_by_ids, db_get_images_by_ids
from app.utils.duplicate_detector import identify_best_shot, group_similar_images
from app.logging.setup_logging import get_logger
import os

router = APIRouter()
logger = get_logger(__name__)


@router.get("/", response_model=List[Dict[str, Any]])
async def get_duplicates():
    """
    Get groups of duplicate images.
    Returns a list of groups, where each group contains:
    - images: list of image objects
    - best_shot_id: ID of the best shot
    """
    try:
        # Get all images with a stored pHash
        all_images = db_get_all_images_with_phash()

        # Group similar images using Python logic (Hamming distance).
        # Threshold 5 allows for some edits/compression differences.
        groups = group_similar_images(all_images, threshold=5)

        result = []
        for group in groups:
            best_shot = identify_best_shot(group)
            result.append({
                "images": group,
                "best_shot_id": best_shot['id'] if best_shot else None
            })

        return result
    except Exception as e:
        logger.error(f"Error getting duplicates: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")


@router.post("/delete", response_model=Dict[str, int])
async def delete_duplicates(image_ids: List[str] = Body(...)):
    """
    Delete specified duplicate images from the DB and the filesystem.
    """
    try:
        # Get image paths before deleting from the DB
        images = db_get_images_by_ids(image_ids)

        # Delete from the DB
        if not db_delete_images_by_ids(image_ids):
            raise HTTPException(status_code=500, detail="Failed to delete images from database")

        # Delete from the filesystem
        deleted_files_count = 0
        for img in images:
            try:
                if os.path.exists(img['path']):
                    os.remove(img['path'])
                    deleted_files_count += 1

                # Also delete the thumbnail
                if img.get('thumbnailPath') and os.path.exists(img['thumbnailPath']):
                    os.remove(img['thumbnailPath'])
            except Exception as e:
                logger.error(f"Error deleting file {img['path']}: {e}")

        return {"deleted_count": len(image_ids), "deleted_files_count": deleted_files_count}

    except Exception as e:
        logger.error(f"Error in delete_duplicates: {e}")
        raise HTTPException(status_code=500, detail=str(e))
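As a quick way to exercise the new endpoints, here is a minimal client sketch (an editor's aside, not part of the PR). The /duplicates mount prefix, the localhost:8000 base URL, and the requests dependency are assumptions; adjust them to how the router is actually wired into the app.

```python
# Hypothetical smoke test for the new endpoints; base URL and mount prefix are assumptions.
import requests

BASE = "http://localhost:8000/duplicates"

# Fetch duplicate groups and their suggested best shots
groups = requests.get(f"{BASE}/").json()
for group in groups:
    ids = [img["id"] for img in group["images"]]
    print(f"group of {len(ids)} images, best shot: {group['best_shot_id']}")

# Delete everything in each group except the suggested best shot
to_delete = [
    img["id"]
    for group in groups
    for img in group["images"]
    if img["id"] != group["best_shot_id"]
]
if to_delete:
    resp = requests.post(f"{BASE}/delete", json=to_delete)
    print(resp.json())  # e.g. {"deleted_count": 3, "deleted_files_count": 3}
```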
@@ -0,0 +1,186 @@
import imagehash
from PIL import Image
import os
import json
import cv2
import numpy as np
from typing import List, Dict, Any, Optional
from app.logging.setup_logging import get_logger

logger = get_logger(__name__)

# Constants
# dHash is generally faster and better at gradients than pHash for burst shots
HASH_SIZE = 8
HASH_THRESHOLD = 8  # Strict threshold for hashing
MIN_MATCH_COUNT = 15  # Minimum shared keypoints to consider them the same scene


def get_image_sharpness(image_path: str) -> float:
    """
    Return a score representing the sharpness of an image (higher is better).
    Technique: Laplacian variance (edge detection).
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            return 0.0

        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Calculate Laplacian variance: blurry images have low variance
        # (few edges), sharp images have high variance.
        variance = cv2.Laplacian(gray, cv2.CV_64F).var()
        return variance
    except Exception as e:
        logger.error(f"Error calculating sharpness for {image_path}: {e}")
        return 0.0


def are_images_geometrically_similar(path1: str, path2: str) -> bool:
    """
    Use ORB (Oriented FAST and Rotated BRIEF) to detect whether two images
    show the same scene, even if the camera moved slightly or the angle changed.
    """
    try:
        img1 = cv2.imread(path1, 0)  # Read as grayscale
        img2 = cv2.imread(path2, 0)

        if img1 is None or img2 is None:
            return False

        # Initialize ORB detector
        orb = cv2.ORB_create(nfeatures=500)

        # Find keypoints and descriptors
        kp1, des1 = orb.detectAndCompute(img1, None)
        kp2, des2 = orb.detectAndCompute(img2, None)

        if des1 is None or des2 is None:
            return False

        # Match descriptors using BFMatcher (brute force) with Hamming distance
        bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches = bf.match(des1, des2)

        # Sort matches by distance (best matches first)
        matches = sorted(matches, key=lambda x: x.distance)

        # Keep only strong matches (descriptor distance below 50)
        good_matches = [m for m in matches if m.distance < 50]

        # If we have enough strong geometric matches, it's the same object/scene
        return len(good_matches) > MIN_MATCH_COUNT

    except Exception as e:
        logger.error(f"Error matching features between {path1} and {path2}: {e}")
        return False


def calculate_phash(image_path: str) -> Optional[str]:
    """
    Calculate the perceptual hash (pHash) for an image.
    """
    try:
        img = Image.open(image_path)
        # pHash is generally good for finding duplicates, including resized/compressed copies
        hash_obj = imagehash.phash(img)
        return str(hash_obj)
    except Exception as e:
        logger.error(f"Error calculating pHash for {image_path}: {e}")
        return None


def identify_best_shot(images: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Identify the best shot from a list of duplicate images.
    Heuristic: sharpness (Laplacian variance), then file size.
    """
    if not images:
        return None

    # Calculate sharpness for any image that does not already have a score
    for img in images:
        if 'sharpness_score' not in img:
            img['sharpness_score'] = get_image_sharpness(img['path'])

    # Pick the best image.
    # Heuristic: sharpness is king.
    # Tie-breaker: file size (larger usually means more detail when sharpness is equal).
    try:
        best_image = max(
            images,
            key=lambda x: (
                x.get('sharpness_score', 0),
                os.path.getsize(x['path']) if os.path.exists(x['path']) else 0,
            ),
        )
        return best_image
    except Exception as e:
        logger.error(f"Error identifying best shot: {e}")
        return images[0] if images else None


def group_similar_images(images: List[Dict[str, Any]], threshold: int = HASH_THRESHOLD) -> List[List[Dict[str, Any]]]:
    """
    Group images by visual hash (dHash) and verify the groups with ORB.
    """
    clusters = []

    # Pre-compute dHash (difference hash) and sharpness for each image.
    # The pHash stored in the DB is only used as an upstream filter; dHash is
    # recomputed on the fly here because it handles burst-shot gradients better.
    processed_images = []
    for img in images:
        path = img.get('path')
        if not path or not os.path.exists(path):
            continue

        try:
            pil_img = Image.open(path)
            dhash = imagehash.dhash(pil_img, hash_size=HASH_SIZE)
            img['hash_obj'] = dhash
            # Compute sharpness now to save time later
            img['sharpness_score'] = get_image_sharpness(path)
            processed_images.append(img)
        except Exception as e:
            logger.warning(f"Error processing image {path}: {e}")

    # Sort by sharpness so the "best" image usually becomes the cluster representative
    processed_images.sort(key=lambda x: x.get('sharpness_score', 0), reverse=True)

    # Greedy clustering
    for img in processed_images:
        found_cluster = False
        img_hash = img['hash_obj']

        for cluster in clusters:
            if not cluster:
                continue

            rep_img = cluster[0]
            rep_hash = rep_img['hash_obj']

            dist = img_hash - rep_hash

            # Fast check: Hamming distance on the dHashes
            if dist <= threshold:
                # Secondary check: ORB verification of geometric similarity
                if are_images_geometrically_similar(img['path'], rep_img['path']):
                    cluster.append(img)
                    found_cluster = True
                    break

        if not found_cluster:
            clusters.append([img])

    # Filter out single-image clusters (no duplicates)
    duplicate_groups = [cluster for cluster in clusters if len(cluster) > 1]

    # Remove temporary hash objects before returning
    for group in duplicate_groups:
        for img in group:
            img.pop('hash_obj', None)

    return duplicate_groups
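To make the two heuristics above concrete (dHash Hamming distance for grouping, Laplacian variance for picking the best shot), here is a small standalone sketch. The file names are placeholders, not files from this repository.

```python
# Standalone sketch of the two heuristics used by the detector; file names are placeholders.
import cv2
import imagehash
from PIL import Image


def laplacian_sharpness(path: str) -> float:
    """Higher Laplacian variance means more edges, i.e. a sharper image."""
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    return 0.0 if gray is None else float(cv2.Laplacian(gray, cv2.CV_64F).var())


a, b = "burst_001.jpg", "burst_002.jpg"  # any two similar photos will do

# Subtracting two ImageHash objects yields the Hamming distance
# (number of differing bits out of hash_size * hash_size = 64).
dist = imagehash.dhash(Image.open(a), hash_size=8) - imagehash.dhash(Image.open(b), hash_size=8)
print(f"dHash distance: {dist} (<= 8 would pass the HASH_THRESHOLD check)")

print(f"sharpness: {a}={laplacian_sharpness(a):.1f}, {b}={laplacian_sharpness(b):.1f}")
```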
@@ -19,6 +19,7 @@
from app.models.FaceDetector import FaceDetector
from app.models.ObjectClassifier import ObjectClassifier
from app.logging.setup_logging import get_logger
from app.utils.duplicate_detector import calculate_phash

logger = get_logger(__name__)
Comment on lines +22 to 24

Contributor

Duplicate logger definition detected. Line 24 defines logger = get_logger(__name__), but line 30 re-assigns logger = logging.getLogger(__name__), overriding the configured logger.

🔎 Proposed fix

Remove the duplicate definition at line 30:

 logger = get_logger(__name__)
 # GPS EXIF tag constant
 GPS_INFO_TAG = 34853
-logger = logging.getLogger(__name__)
-
@@ -165,6 +166,7 @@ def image_util_prepare_image_records(
        # Generate thumbnail
        if image_util_generate_thumbnail(image_path, thumbnail_path):
            metadata = image_util_extract_metadata(image_path)
            phash = calculate_phash(image_path)
            logger.debug(f"Extracted metadata for {image_path}: {metadata}")
            image_records.append(
                {
@@ -174,6 +176,7 @@ def image_util_prepare_image_records(
                    "thumbnailPath": thumbnail_path,
                    "metadata": json.dumps(metadata),
                    "isTagged": False,
                    "phash": phash,
                }
            )
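For context, each prepared image record now carries the hash alongside the existing fields. Roughly this shape, with made-up values; only the keys visible in this diff plus the path field used elsewhere are shown.

```python
# Illustrative record shape after this change; the values are invented.
record = {
    "path": "/photos/IMG_0001.jpg",
    "thumbnailPath": "/thumbnails/IMG_0001.jpg",
    "metadata": '{"width": 4032, "height": 3024}',
    "isTagged": False,
    "phash": "83c3d0c9962e9f17",  # str(imagehash.phash(...)): 16 hex chars = 64 bits
}
```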
@@ -71,3 +71,4 @@ ruff>=0.0.241
psutil>=5.9.5
pytest-asyncio>=1.0.0
setuptools==66.1.1
ImageHash==4.3.1
Contributor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check ImageHash package version information and security advisories
echo "=== Checking available ImageHash versions (last 10) ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases | keys[]' | sort -V | tail -10
echo -e "\n=== Latest version ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.info.version'
echo -e "\n=== Release date of version 4.3.1 ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.releases' 2>/dev/null || curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases."4.3.1"[0].upload_time' 2>/dev/null
echo -e "\n=== Check if version 4.3.1 exists ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.info.version' 2>/dev/null

🌐 Web query:

💡 Result: There are no public CVEs that specifically name the Python ImageHash package. Snyk's vulnerability DB lists no direct vulnerabilities for ImageHash, and PyPI shows the project but no CVE entries. [1][2]

🌐 Web query:

💡 Result: The ImageHash project itself has no published security advisories; its GitHub security page shows no advisories and no SECURITY.md. However, ImageHash depends on image libraries (notably Pillow), which have had several CVEs (e.g., CVE-2022-22817, CVE-2021-28675). You should track Pillow (and other imaging dependencies) for security fixes and keep ImageHash and its dependencies up to date. [1][2][3][4]

Update ImageHash to version 4.3.2 (latest available). Version 4.3.1 is valid and free of known vulnerabilities, but version 4.3.2 is now available. ImageHash itself has no published CVEs; however, its dependency on Pillow means security updates to image libraries should be monitored and kept current.
Avoid exposing raw exception details in the API response.

Using str(e) in the error detail could expose internal paths, database errors, or other sensitive information to API consumers.

🔎 Proposed fix

  except Exception as e:
      logger.error(f"Error in delete_duplicates: {e}")
-     raise HTTPException(status_code=500, detail=str(e))
+     raise HTTPException(status_code=500, detail="Failed to delete images")
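A related subtlety worth noting alongside this fix (an editor's aside, not part of the review): HTTPException is itself a subclass of Exception, so the endpoint's broad except also catches the deliberate "Failed to delete images from database" error and re-wraps it. A minimal sketch of the combined pattern, with hypothetical names, assuming the same deletion logic as the endpoint body:

```python
# Sketch of the safer error-handling pattern; names and wiring are assumptions, not the PR's code.
import logging

from fastapi import HTTPException

logger = logging.getLogger(__name__)


async def delete_duplicates_safe(image_ids: list[str]) -> dict:
    try:
        # ... DB and filesystem deletion logic from the endpoint goes here ...
        return {"deleted_count": len(image_ids)}
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the DB-failure 500) propagate unchanged
        raise
    except Exception:
        # Log the specifics server-side; return only a generic message to the client
        logger.exception("Error in delete_duplicates")
        raise HTTPException(status_code=500, detail="Failed to delete images")
```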