55 changes: 53 additions & 2 deletions backend/app/database/images.py
@@ -27,6 +27,7 @@ class ImageRecord(TypedDict):
thumbnailPath: str
metadata: Union[Mapping[str, Any], str]
isTagged: bool
phash: Union[str, None]


class UntaggedImageRecord(TypedDict):
@@ -64,6 +65,7 @@ def db_create_images_table() -> None:
metadata TEXT,
isTagged BOOLEAN DEFAULT 0,
isFavourite BOOLEAN DEFAULT 0,
phash TEXT,
FOREIGN KEY (folder_id) REFERENCES folders(folder_id) ON DELETE CASCADE
)
"""
@@ -97,12 +99,13 @@ def db_bulk_insert_images(image_records: List[ImageRecord]) -> bool:
try:
cursor.executemany(
"""
INSERT INTO images (id, path, folder_id, thumbnailPath, metadata, isTagged)
VALUES (:id, :path, :folder_id, :thumbnailPath, :metadata, :isTagged)
INSERT INTO images (id, path, folder_id, thumbnailPath, metadata, isTagged, phash)
VALUES (:id, :path, :folder_id, :thumbnailPath, :metadata, :isTagged, :phash)
ON CONFLICT(path) DO UPDATE SET
folder_id=excluded.folder_id,
thumbnailPath=excluded.thumbnailPath,
metadata=excluded.metadata,
phash=excluded.phash,
isTagged=CASE
WHEN excluded.isTagged THEN 1
ELSE images.isTagged
@@ -417,5 +420,53 @@ def db_toggle_image_favourite_status(image_id: str) -> bool:
logger.error(f"Database error: {e}")
conn.rollback()
return False

def db_get_all_images_with_phash() -> List[dict]:
"""
Get all images that have a phash.
"""
conn = _connect()
cursor = conn.cursor()

try:
cursor.execute("""
SELECT id, path, folder_id, thumbnailPath, metadata, isTagged, isFavourite, phash
FROM images
WHERE phash IS NOT NULL
""")

images = []
for row in cursor.fetchall():
images.append({
"id": row[0],
"path": row[1],
"folder_id": row[2],
"thumbnailPath": row[3],
"metadata": row[4],
"isTagged": bool(row[5]),
"isFavourite": bool(row[6]),
"phash": row[7]
})

return images

except Exception as e:
logger.error(f"Error getting images with phash: {e}")
return []
finally:
conn.close()

def db_get_images_by_ids(image_ids: List[str]) -> List[dict]:
if not image_ids:
return []
conn = _connect()
cursor = conn.cursor()
try:
placeholders = ','.join(['?'] * len(image_ids))
cursor.execute(f"SELECT id, path, thumbnailPath FROM images WHERE id IN ({placeholders})", image_ids)
return [{"id": row[0], "path": row[1], "thumbnailPath": row[2]} for row in cursor.fetchall()]
except Exception as e:
logger.error(f"Error getting images by ids: {e}")
return []
finally:
conn.close()
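
For context, a minimal sketch of how the new phash column flows through db_bulk_insert_images. The record shape mirrors the ImageRecord TypedDict above; the id, paths, and hash value are purely illustrative:

import json
from app.database.images import db_bulk_insert_images

record = {
    "id": "img-0001",                        # hypothetical UUID
    "path": "/photos/IMG_0001.jpg",          # hypothetical source path
    "folder_id": "folder-01",
    "thumbnailPath": "/thumbs/IMG_0001.jpg",
    "metadata": json.dumps({"width": 4000, "height": 3000}),
    "isTagged": False,
    "phash": "c3d1b2a4e5f60789",             # hex string as returned by calculate_phash
}

# Upserts on path; on conflict the stored phash is overwritten with the new value.
db_bulk_insert_images([record])
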
72 changes: 72 additions & 0 deletions backend/app/routes/duplicates.py
@@ -0,0 +1,72 @@
from fastapi import APIRouter, HTTPException, Body
from typing import List, Dict, Any
from app.database.images import db_get_all_images_with_phash, db_delete_images_by_ids, db_get_images_by_ids
from app.utils.duplicate_detector import identify_best_shot, group_similar_images
from app.logging.setup_logging import get_logger
import os

router = APIRouter()
logger = get_logger(__name__)

@router.get("/", response_model=List[Dict[str, Any]])
async def get_duplicates():
"""
Get groups of duplicate images.
Returns a list of groups, where each group contains:
- images: List of image objects
- best_shot_id: ID of the best shot
"""
try:
# Get all images with pHash
all_images = db_get_all_images_with_phash()

# Group similar images using Python logic (Hamming distance)
# Threshold 5 allows for some edits/compression differences
groups = group_similar_images(all_images, threshold=5)

result = []

for group in groups:
best_shot = identify_best_shot(group)
result.append({
"images": group,
"best_shot_id": best_shot['id'] if best_shot else None
})

return result
except Exception as e:
logger.error(f"Error getting duplicates: {e}")
raise HTTPException(status_code=500, detail="Internal server error")

@router.post("/delete", response_model=Dict[str, int])
async def delete_duplicates(image_ids: List[str] = Body(...)):
"""
Delete specified duplicate images from DB and filesystem.
"""
try:
# Get image paths before deleting from DB
images = db_get_images_by_ids(image_ids)

# Delete from DB
if not db_delete_images_by_ids(image_ids):
raise HTTPException(status_code=500, detail="Failed to delete images from database")

# Delete from filesystem
deleted_files_count = 0
for img in images:
try:
if os.path.exists(img['path']):
os.remove(img['path'])
deleted_files_count += 1

# Also delete thumbnail
if img.get('thumbnailPath') and os.path.exists(img['thumbnailPath']):
os.remove(img['thumbnailPath'])
except Exception as e:
logger.error(f"Error deleting file {img['path']}: {e}")

return {"deleted_count": len(image_ids), "deleted_files_count": deleted_files_count}

except Exception as e:
logger.error(f"Error in delete_duplicates: {e}")
raise HTTPException(status_code=500, detail=str(e))
Comment on lines +70 to +72

⚠️ Potential issue | 🟡 Minor

Avoid exposing raw exception details in API response.

Using str(e) in the error detail could expose internal paths, database errors, or other sensitive information to API consumers.

🔎 Proposed fix
     except Exception as e:
         logger.error(f"Error in delete_duplicates: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
+        raise HTTPException(status_code=500, detail="Failed to delete images")
🤖 Prompt for AI Agents
In @backend/app/routes/duplicates.py around lines 70-72, the delete_duplicates handler currently logs the exception message and returns str(e) in the HTTPException detail, which may leak sensitive internals. Log the full exception internally (e.g. logger.error(..., exc_info=True)) and raise HTTPException(status_code=500, detail="Internal server error") or another generic, non-sensitive message instead of str(e), keeping the logger call in the delete_duplicates context and raising the HTTPException on the same error path.
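
Putting the two routes together, a hedged usage sketch (the localhost base URL, port, and the httpx client are assumptions; the /duplicates prefix matches the main.py change below):

import httpx

BASE = "http://localhost:8000/duplicates"

# GET /duplicates/ -> [{"images": [...], "best_shot_id": "..."}, ...]
groups = httpx.get(f"{BASE}/").json()

for group in groups:
    # Keep the detected best shot and delete the rest of each group.
    to_delete = [img["id"] for img in group["images"] if img["id"] != group["best_shot_id"]]
    if to_delete:
        resp = httpx.post(f"{BASE}/delete", json=to_delete)
        print(resp.json())  # {"deleted_count": N, "deleted_files_count": M}
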

186 changes: 186 additions & 0 deletions backend/app/utils/duplicate_detector.py
@@ -0,0 +1,186 @@
import imagehash
from PIL import Image
import os
import json
import cv2
import numpy as np
from typing import List, Dict, Any, Optional
from app.logging.setup_logging import get_logger

logger = get_logger(__name__)

# Constants
# dHash is generally faster and better at gradients than pHash for burst shots
HASH_SIZE = 8
HASH_THRESHOLD = 8 # Strict threshold for hashing
MIN_MATCH_COUNT = 15 # Minimum shared keypoints to consider them the same scene

def get_image_sharpness(image_path: str) -> float:
"""
Returns a score representing the 'sharpness' of an image.
Higher is better.
Technique: Laplacian Variance (Detects edges).
"""
try:
img = cv2.imread(image_path)
if img is None:
return 0.0

# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Calculate Laplacian variance
# Blurry images have low variance (few edges); sharp images have high variance
variance = cv2.Laplacian(gray, cv2.CV_64F).var()
return variance
except Exception as e:
logger.error(f"Error calculating sharpness for {image_path}: {e}")
return 0.0

def are_images_geometrically_similar(path1: str, path2: str) -> bool:
"""
Uses ORB (Oriented FAST and Rotated BRIEF) to detect if two images
are of the same scene, even if camera moved slightly or angle changed.
"""
try:
img1 = cv2.imread(path1, 0) # Read as grayscale
img2 = cv2.imread(path2, 0)

if img1 is None or img2 is None:
return False

# Initialize ORB detector
orb = cv2.ORB_create(nfeatures=500)

# Find keypoints and descriptors
kp1, des1 = orb.detectAndCompute(img1, None)
kp2, des2 = orb.detectAndCompute(img2, None)

if des1 is None or des2 is None:
return False

# Match descriptors using BFMatcher (Brute Force) with Hamming distance
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
matches = bf.match(des1, des2)

# Sort matches by distance (best matches first)
matches = sorted(matches, key=lambda x: x.distance)

# Keep only strong matches (descriptor distance below 50)
good_matches = [m for m in matches if m.distance < 50]

# If we have enough strong geometrical matches, it's the same object/scene
return len(good_matches) > MIN_MATCH_COUNT

except Exception as e:
logger.error(f"Error matching features between {path1} and {path2}: {e}")
return False


def calculate_phash(image_path: str) -> Optional[str]:
"""
Calculate perceptual hash for an image.
"""
try:
img = Image.open(image_path)
# phash is generally good for finding duplicates including resized/compressed ones
hash_obj = imagehash.phash(img)
return str(hash_obj)
except Exception as e:
logger.error(f"Error calculating pHash for {image_path}: {e}")
return None

def identify_best_shot(images: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
"""
Identify the best shot from a list of duplicate images.
Heuristic: Sharpness (Laplacian Variance), then File Size.
"""
if not images:
return None

# Calculate sharpness for all if not already calculated
for img in images:
if 'sharpness_score' not in img:
img['sharpness_score'] = get_image_sharpness(img['path'])

# Pick best image
# Heuristic: Sharpness is king.
# Tie-breaker: File size (higher usually means more color data if sharpness is equal)
try:
best_image = max(images, key=lambda x: (x.get('sharpness_score', 0), os.path.getsize(x['path']) if os.path.exists(x['path']) else 0))
return best_image
except Exception as e:
logger.error(f"Error identifying best shot: {e}")
return images[0] if images else None

def group_similar_images(images: List[Dict[str, Any]], threshold: int = HASH_THRESHOLD) -> List[List[Dict[str, Any]]]:
"""
Groups images by Visual Hash (dHash) and verifies with ORB.
"""
clusters = []

# Pre-compute dHash (Difference Hash) instead of pHash
processed_images = []
for img in images:
# Note: the pHash stored in the DB could serve as a coarse first-pass filter,
# but dHash is recomputed on the fly below for better burst-shot detection.

path = img.get('path')
if not path or not os.path.exists(path):
continue

try:
# Calculate dHash on the fly for heavy logic mode
pil_img = Image.open(path)
dhash = imagehash.dhash(pil_img, hash_size=HASH_SIZE)
img['hash_obj'] = dhash
# Compute sharpness now to save time later
img['sharpness_score'] = get_image_sharpness(path)
processed_images.append(img)
except Exception as e:
logger.warning(f"Error processing image {path}: {e}")

# Sort by sharpness initially so the "best" image often becomes the cluster rep
processed_images.sort(key=lambda x: x.get('sharpness_score', 0), reverse=True)

# Greedy clustering
for img in processed_images:
found_cluster = False
img_hash = img['hash_obj']

for cluster in clusters:
if not cluster:
continue

rep_img = cluster[0]
rep_hash = rep_img['hash_obj']

dist = img_hash - rep_hash

# Fast Check: Hamming Distance
if dist <= threshold:
# Secondary check: ORB verification of geometric similarity,
# run on every hash match to reduce false positives.
if are_images_geometrically_similar(img['path'], rep_img['path']):
cluster.append(img)
found_cluster = True
break

if not found_cluster:
clusters.append([img])

# Filter out single-image clusters (no duplicates)
duplicate_groups = [cluster for cluster in clusters if len(cluster) > 1]

# Remove temporary objects
for group in duplicate_groups:
for img in group:
img.pop('hash_obj', None)

return duplicate_groups
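
As a small illustration of the Hamming-distance check used above (the dist = img_hash - rep_hash line), imagehash objects subtract to an integer bit distance; the file names here are made up:

from PIL import Image
import imagehash

h1 = imagehash.dhash(Image.open("burst_001.jpg"), hash_size=8)  # hypothetical files
h2 = imagehash.dhash(Image.open("burst_002.jpg"), hash_size=8)

dist = h1 - h2      # Hamming distance between the two 64-bit hashes
print(dist <= 8)    # True -> candidate duplicates at HASH_THRESHOLD = 8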

3 changes: 3 additions & 0 deletions backend/app/utils/images.py
@@ -19,6 +19,7 @@
from app.models.FaceDetector import FaceDetector
from app.models.ObjectClassifier import ObjectClassifier
from app.logging.setup_logging import get_logger
from app.utils.duplicate_detector import calculate_phash

logger = get_logger(__name__)
Comment on lines +22 to 24

⚠️ Potential issue | 🟡 Minor

Duplicate logger definition detected.

Line 24 defines logger = get_logger(__name__), but line 30 redefines it as logger = logging.getLogger(__name__). This shadows the first definition and may cause inconsistent logging behavior if get_logger provides custom formatting or handlers.

🔎 Proposed fix

Remove the duplicate definition at line 30:

 logger = get_logger(__name__)
 
 
 # GPS EXIF tag constant
 GPS_INFO_TAG = 34853
 
-logger = logging.getLogger(__name__)
-

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In @backend/app/utils/images.py around lines 22-24, there are two logger definitions: logger = get_logger(__name__) and later logger = logging.getLogger(__name__), the second of which shadows the custom get_logger. Remove the redundant logging.getLogger assignment so the module consistently uses the logger from get_logger(__name__); ensure the get_logger import remains and that references to logger elsewhere in the file use the single definition.


@@ -165,6 +166,7 @@ def image_util_prepare_image_records(
# Generate thumbnail
if image_util_generate_thumbnail(image_path, thumbnail_path):
metadata = image_util_extract_metadata(image_path)
phash = calculate_phash(image_path)
logger.debug(f"Extracted metadata for {image_path}: {metadata}")
image_records.append(
{
@@ -174,6 +176,7 @@
"thumbnailPath": thumbnail_path,
"metadata": json.dumps(metadata),
"isTagged": False,
"phash": phash,
}
)

3 changes: 2 additions & 1 deletion backend/main.py
@@ -25,6 +25,7 @@
from app.routes.images import router as images_router
from app.routes.face_clusters import router as face_clusters_router
from app.routes.user_preferences import router as user_preferences_router
from app.routes.duplicates import router as duplicates_router
from fastapi.openapi.utils import get_openapi
from app.logging.setup_logging import (
configure_uvicorn_logging,
@@ -130,7 +131,7 @@ async def root():
app.include_router(
user_preferences_router, prefix="/user-preferences", tags=["User Preferences"]
)

app.include_router(duplicates_router, prefix="/duplicates", tags=["Duplicates"])

# Entry point for running with: python3 main.py
if __name__ == "__main__":
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -71,3 +71,4 @@ ruff>=0.0.241
psutil>=5.9.5
pytest-asyncio>=1.0.0
setuptools==66.1.1
ImageHash==4.3.1

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check ImageHash package version information and security advisories

echo "=== Checking available ImageHash versions (last 10) ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases | keys[]' | sort -V | tail -10

echo -e "\n=== Latest version ==="
curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.info.version'

echo -e "\n=== Release date of version 4.3.1 ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.releases' 2>/dev/null || curl -s https://pypi.org/pypi/ImageHash/json | jq -r '.releases."4.3.1"[0].upload_time' 2>/dev/null

echo -e "\n=== Check if version 4.3.1 exists ==="
curl -s https://pypi.org/pypi/ImageHash/4.3.1/json | jq -r '.info.version' 2>/dev/null

Repository: AOSSIE-Org/PictoPy

Length of output: 288


🌐 Web query:

ImageHash security vulnerabilities CVE

💡 Result:

Short answer: There are no public CVEs that specifically name the Python ImageHash package. Snyk’s vulnerability DB lists no direct vulnerabilities for ImageHash, and PyPI shows the project but no CVE entries. [1][2]

Context / related notes:

  • Image-processing libraries commonly do have CVEs (e.g., ImageMagick and others), so risks often come from image decoders or native libs you use alongside ImageHash (not ImageHash itself). Examples: ImageMagick CVEs tracked by Ubuntu, and a recent stb_image CVE. [3][4]


🌐 Web query:

ImageHash package security advisories

💡 Result:

Short answer: The ImageHash project itself has no published security advisories; its GitHub security page shows no advisories and no SECURITY.md. However, ImageHash depends on image libraries (notably Pillow), which have had several CVEs (e.g., CVE-2022-22817, CVE-2021-28675). You should track Pillow (and other imaging deps) for security fixes and keep ImageHash and its dependencies up to date. [1][2][3][4]

Sources:

  • ImageHash GitHub security overview (no advisories). [1]
  • Snyk package page for ImageHash (no direct vulnerabilities recorded). [2]
  • CVE-2022-22817 (Pillow ImageMath.eval arbitrary expression). [3]
  • CVE-2021-28675 (Pillow PSD plugin issue). [4]

Update ImageHash to version 4.3.2 (latest available).

Version 4.3.1 is valid and free of known vulnerabilities, but 4.3.2 is the latest release. ImageHash itself has no published CVEs; however, it depends on Pillow, so security updates to imaging libraries should be monitored and kept current.

🤖 Prompt for AI Agents
In @backend/requirements.txt at line 74, update the ImageHash dependency from "ImageHash==4.3.1" to "ImageHash==4.3.2": locate the ImageHash entry and change the pinned version to 4.3.2, then run the dependency install/lock workflow (pip install -r requirements.txt, or regenerate the lockfile) to ensure the new version is applied.
