22import torch
33import torch .nn as nn
44import torchvision .transforms as T
5- from PIL import Image
5+ from PIL import Image , ImageFilter
6+ import numpy as np
67import logging
78from collections import OrderedDict
89import io
910import clip
11+ from transformers import CLIPProcessor , CLIPVisionModel
1012
# Set up logging once at import time; the "uvicorn" logger piggybacks on
# uvicorn's already-configured handlers when running under that server.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("uvicorn")
logger.setLevel(logging.DEBUG)

# On-disk locations of the model artifacts this module loads.
HEAD_PATH = "models/aesthetic/sa_0_4_vit_b_16_linear.pth"
SCORER_DIR = "models/rsinema_aesthetic-scorer"
SCORER_MODEL_PATH = f"{SCORER_DIR}/model.pt"
CLIP_BASE_DIR = "models/openai_clip-vit-base-patch32"

# Prefer GPU when present; everything below honours this device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
1924
@@ -67,6 +72,56 @@ def load_clip_model() -> tuple[torch.nn.Module, callable]:
# One-time global setup: CLIP backbone + linear regression head for the
# legacy aesthetic score endpoint.
_clip_model, preprocess = load_clip_model()
regression_head = load_aesthetic_head(HEAD_PATH)

# Populated by load_aesthetic_scorer(); all three stay None when loading
# fails so callers can degrade gracefully.
_aesthetic_scorer = None
_aesthetic_processor = None
_aesthetic_backbone = None
78+
class AestheticScorer(nn.Module):
    """CLIP-vision backbone topped with seven single-logit regression heads.

    Each head scores one attribute of the image; ``forward`` returns the
    raw logits in the fixed order of ``_HEAD_NAMES``. Attribute names
    (``aesthetic_head`` ... ``content_head``) are part of the checkpoint's
    state_dict key layout and must not change.
    """

    # Order matters: it fixes both attribute creation and forward() output.
    _HEAD_NAMES = ("aesthetic", "quality", "composition", "light", "color", "dof", "content")

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        feature_dim = backbone.config.hidden_size
        for name in self._HEAD_NAMES:
            # nn.Sequential wrapper kept so state_dict keys stay "<name>_head.0.*".
            setattr(self, f"{name}_head", nn.Sequential(nn.Linear(feature_dim, 1)))

    def forward(self, pixel_values):
        """Return a 7-tuple of per-attribute logits for ``pixel_values``."""
        pooled = self.backbone(pixel_values).pooler_output
        return tuple(getattr(self, f"{name}_head")(pooled) for name in self._HEAD_NAMES)
103+
def load_aesthetic_scorer():
    """Load the multi-head aesthetic scorer into the module globals.

    The checkpoint at ``SCORER_MODEL_PATH`` is accepted in two formats: a
    bare state_dict (a dict of tensors), in which case an ``AestheticScorer``
    is rebuilt around the locally cached CLIP vision backbone; or a fully
    pickled model object. On any failure the three globals are reset to
    ``None`` so scoring callers can degrade gracefully.
    """
    global _aesthetic_scorer, _aesthetic_processor, _aesthetic_backbone
    try:
        _aesthetic_processor = CLIPProcessor.from_pretrained(SCORER_DIR, use_fast=False)
        _aesthetic_backbone = CLIPVisionModel.from_pretrained(CLIP_BASE_DIR, local_files_only=True).to(device)
        # NOTE(review): torch.load unpickles arbitrary objects; acceptable only
        # because this checkpoint ships with the app — never point it at an
        # untrusted file.
        loaded = torch.load(SCORER_MODEL_PATH, map_location=device)
        if isinstance(loaded, dict) and all(isinstance(v, torch.Tensor) for v in loaded.values()):
            # Raw state_dict: rebuild the wrapper module. strict=False tolerates
            # missing/extra keys (the backbone weights were already loaded above).
            scorer = AestheticScorer(_aesthetic_backbone)
            scorer.load_state_dict(loaded, strict=False)
            _aesthetic_scorer = scorer
        else:
            # Checkpoint is a pickled nn.Module; use it as-is.
            _aesthetic_scorer = loaded
        _aesthetic_scorer.eval()
        logger.info("✅ Aesthetic scorer loaded.")
    except Exception as e:
        # Best-effort loader: logger.exception (instead of logger.error with an
        # f-string) records the full traceback and defers message formatting.
        logger.exception("⚠️ Failed to load aesthetic scorer: %s", e)
        _aesthetic_scorer = None
        _aesthetic_processor = None
        _aesthetic_backbone = None

load_aesthetic_scorer()
70125
71126async def score_aesthetic (req : Request ) -> float :
72127 img_bytes = await req .body ()
@@ -80,3 +135,94 @@ async def score_aesthetic(req: Request) -> float:
80135 score = score_tensor .item ()
81136
82137 return float (score )
138+
def _grayscale_np(img: Image.Image, size: int = 256) -> np.ndarray:
    """Downscale *img* to ``size`` x ``size`` and return its luminance as a
    float32 array normalised to [0, 1]."""
    gray = img.resize((size, size)).convert("L")
    return np.asarray(gray, dtype=np.float32) / 255.0
142+
def _edges_intensity(img: Image.Image, size: int = 256) -> np.ndarray:
    """Resize *img*, run PIL's FIND_EDGES kernel, and return the edge
    strength map as a float32 array in [0, 1]."""
    edge_img = img.resize((size, size)).filter(ImageFilter.FIND_EDGES).convert("L")
    return np.asarray(edge_img, dtype=np.float32) / 255.0
147+
148+ def _rule_of_thirds_score (edge_map : np .ndarray ) -> float :
149+ if edge_map .size == 0 :
150+ return 0.0
151+ h , w = edge_map .shape
152+ ys = np .linspace (0 , h - 1 , h , dtype = np .float32 )
153+ xs = np .linspace (0 , w - 1 , w , dtype = np .float32 )
154+ yy , xx = np .meshgrid (ys , xs , indexing = "ij" )
155+ thirds_y = np .array ([h / 3 , 2 * h / 3 ], dtype = np .float32 )
156+ thirds_x = np .array ([w / 3 , 2 * w / 3 ], dtype = np .float32 )
157+ sigma = min (h , w ) / 12
158+ weight = np .zeros_like (edge_map , dtype = np .float32 )
159+ for ty in thirds_y :
160+ for tx in thirds_x :
161+ weight += np .exp (- (((yy - ty ) ** 2 + (xx - tx ) ** 2 ) / (2 * sigma ** 2 )))
162+ weighted = float ((edge_map * weight ).sum ())
163+ total = float (edge_map .sum ())
164+ if total <= 0 :
165+ return 0.0
166+ ratio = weighted / total
167+ return float (max (0.0 , min (10.0 , ratio * 10 )))
168+
169+ def _visual_interest_score (edge_map : np .ndarray ) -> float :
170+ mean_edge = float (edge_map .mean ())
171+ score = mean_edge * 60.0
172+ return float (max (0.0 , min (10.0 , score )))
173+
174+ def _sharpness_score (gray : np .ndarray ) -> float :
175+ if gray .size == 0 :
176+ return 0.0
177+ padded = np .pad (gray , 1 , mode = "edge" )
178+ lap = (
179+ padded [:- 2 , 1 :- 1 ]
180+ + padded [2 :, 1 :- 1 ]
181+ + padded [1 :- 1 , :- 2 ]
182+ + padded [1 :- 1 , 2 :]
183+ - 4 * padded [1 :- 1 , 1 :- 1 ]
184+ )
185+ variance = float (lap .var ())
186+ score = variance * 1000.0
187+ return float (max (0.0 , min (10.0 , score )))
188+
def _score_with_aesthetic_model(img: Image.Image) -> dict | None:
    """Run the seven-head aesthetic scorer on *img*.

    Returns a dict of per-attribute floats keyed by label, or ``None``
    when the scorer/processor failed to load at startup.
    """
    if _aesthetic_scorer is None or _aesthetic_processor is None:
        return None
    pixel_values = _aesthetic_processor(images=img, return_tensors="pt")["pixel_values"].to(device)
    with torch.no_grad():
        head_outputs = _aesthetic_scorer(pixel_values)
    # Label order mirrors the head order in AestheticScorer.forward.
    labels = ("overall", "quality", "composition", "lighting", "color", "depth_of_field", "content")
    return {name: float(logit.item()) for name, logit in zip(labels, head_outputs)}
197+
async def score_photo_tips(req: Request) -> dict:
    """Photo-critique endpoint combining heuristics with the learned scorer.

    Reads raw image bytes from the request body and returns component
    scores (0-10), an overall score (0-100), the raw model scores (or
    None when the model is unavailable), and human-readable tips.
    """
    body = await req.body()
    image = Image.open(io.BytesIO(body)).convert("RGB")

    edges = _edges_intensity(image)
    luminance = _grayscale_np(image)

    thirds = _rule_of_thirds_score(edges)
    interest = _visual_interest_score(edges)
    sharpness = _sharpness_score(luminance)

    # Heuristic composition blends visual interest (80%) with thirds placement (20%).
    composition = interest * 0.8 + thirds * 0.2
    # Sharpness scales the final score by a 0.9x-1.4x factor.
    sharpness_factor = 0.9 + sharpness / 20.0

    model_scores = _score_with_aesthetic_model(image)
    if model_scores is None:
        blended = composition
    else:
        # Model "overall" is doubled (presumably 0-5 -> 0-10; confirm against
        # the scorer's training range), then weighted 70/30 with the heuristic.
        blended = (model_scores["overall"] * 2) * 0.7 + composition * 0.3
    overall = blended * sharpness_factor

    tips = []
    if thirds < 4:
        tips.append("Try placing the subject near rule-of-thirds intersections.")
    if interest < 4:
        tips.append("Add more texture, contrast, or a clearer subject to increase visual interest.")
    if sharpness < 4:
        tips.append("Looks a bit soft; try a faster shutter or steadier shot.")
    if not tips:
        tips.append("Strong composition and visual interest.")

    return {
        "rule_of_thirds_score": round(thirds, 2),
        "visual_interest_score": round(interest, 2),
        "sharpness_score": round(sharpness, 2),
        # Clamp to 0-10 then report on a 0-100 scale.
        "overall_score": round(max(0.0, min(10.0, overall)) * 10, 1),
        "model_scores": model_scores,
        "tips": tips,
    }
0 commit comments