Skip to content

Commit 86f95fb

Browse files
committed
feat(Aesthetic > Composition): Boost visual-interest scaling, sharpen weighting, and increase model influence to raise overall score calibration.
1 parent 6c88c8a commit 86f95fb

File tree

6 files changed

+200
-10
lines changed

6 files changed

+200
-10
lines changed

Makefile

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,22 @@ load-weights:
2020
-e MODEL_REPO="$(MODEL_REPO)" \
2121
-e FILENAMES="$(FILENAMES)" \
2222
weights-loader \
23-
sh -c 'python /dock/hugging-offline.py --repo-id "$$MODEL_REPO" --filenames "$$FILENAMES" > /dock/build.log 2>&1'
23+
sh -c 'python /dock/hugging-offline.py --repo-id "$$MODEL_REPO" --filenames $$FILENAMES > /dock/build.log 2>&1'
2424
@echo "Part 3/3 Copying weights to local directory and display log"
2525
docker cp extract-model:/dock/models/. ./models/
2626
docker cp extract-model:/dock/build.log ./weights.log || true
2727
docker rm extract-model
2828
cat ./weights.log || true
2929

30+
# Download the multi-attribute aesthetic scorer files. The CLIP ViT-B/32
# backbone is fetched first through the prerequisite, so the two downloads
# still run one after the other.
.PHONY: load-aesthetic-scorer
load-aesthetic-scorer: load-clip-vit-base-patch32
	$(MAKE) load-weights MODEL_REPO="rsinema/aesthetic-scorer" \
		FILENAMES="model.pt preprocessor_config.json tokenizer.json tokenizer_config.json special_tokens_map.json merges.txt vocab.json"
34+
35+
# Download the OpenAI CLIP ViT-B/32 weights, config and tokenizer files by
# delegating to the generic load-weights target.
.PHONY: load-clip-vit-base-patch32
load-clip-vit-base-patch32:
	$(MAKE) load-weights MODEL_REPO="openai/clip-vit-base-patch32" \
		FILENAMES="pytorch_model.bin config.json preprocessor_config.json tokenizer.json tokenizer_config.json special_tokens_map.json merges.txt vocab.json"
38+
3039
build-ai-api:
3140
docker build -f apps/api/Dockerfile -t ai-api .
3241

apps/api/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,16 @@ curl -X POST -H "Content-Type: image/jpeg" --data-binary @your_image.jpg http://
9393
[sa_0_4_vit_b_16_linear.pth](https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_b_16_linear.pth)
9494
1. Place it in `models/aesthetic/sa_0_4_vit_b_16_linear.pth`
9595
1. The OpenAI CLIP backbone weights for ViT-B/16 are downloaded automatically the first time you run `make ai-api`
96+
97+
## Aesthetic scorer (multi-attribute)
98+
99+
To enable the newer multi-attribute aesthetic scorer (used by `/scores`):
100+
101+
1. Download the model weights and processor files:
102+
`make load-aesthetic-scorer`
103+
1. (Optional) Download the CLIP ViT-B/32 backbone on its own for offline use; `make load-aesthetic-scorer` already runs this target first:
104+
`make load-clip-vit-base-patch32`
105+
1. Rebuild and run the API:
106+
`make build-ai-api && make ai-api`
107+
108+
The weights are stored under `models/rsinema_aesthetic-scorer` and `models/openai_clip-vit-base-patch32` for offline loading.

apps/api/aesthetic.py

Lines changed: 147 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,23 @@
22
import torch
33
import torch.nn as nn
44
import torchvision.transforms as T
5-
from PIL import Image
5+
from PIL import Image, ImageFilter
6+
import numpy as np
67
import logging
78
from collections import OrderedDict
89
import io
910
import clip
11+
from transformers import CLIPProcessor, CLIPVisionModel
1012

1113
# Set up logging once
1214
logging.basicConfig(level=logging.DEBUG)
1315
logger = logging.getLogger("uvicorn")
1416
logger.setLevel(logging.DEBUG)
1517

1618
HEAD_PATH = "models/aesthetic/sa_0_4_vit_b_16_linear.pth"
19+
SCORER_DIR = "models/rsinema_aesthetic-scorer"
20+
SCORER_MODEL_PATH = f"{SCORER_DIR}/model.pt"
21+
CLIP_BASE_DIR = "models/openai_clip-vit-base-patch32"
1722

1823
device = "cuda" if torch.cuda.is_available() else "cpu"
1924

@@ -67,6 +72,56 @@ def load_clip_model() -> tuple[torch.nn.Module, callable]:
6772
# One-time global setup
6873
_clip_model, preprocess = load_clip_model()
6974
regression_head = load_aesthetic_head(HEAD_PATH)
75+
_aesthetic_scorer = None
76+
_aesthetic_processor = None
77+
_aesthetic_backbone = None
78+
79+
class AestheticScorer(nn.Module):
    """Multi-attribute scorer: seven one-output linear heads on a shared backbone.

    The backbone must expose ``config.hidden_size`` and, when called with
    pixel values, return an object carrying ``pooler_output``. Every head is
    an ``nn.Sequential`` wrapping a single ``nn.Linear(hidden_size, 1)``, so
    the state-dict keys look like ``aesthetic_head.0.weight``.
    """

    # Head attribute names, in the exact order forward() returns their outputs.
    _HEAD_NAMES = (
        "aesthetic_head",
        "quality_head",
        "composition_head",
        "light_head",
        "color_head",
        "dof_head",
        "content_head",
    )

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        feature_width = backbone.config.hidden_size
        # setattr registers each head as a submodule, just like plain
        # attribute assignment would, and in the same order.
        for head_name in self._HEAD_NAMES:
            setattr(self, head_name, nn.Sequential(nn.Linear(feature_width, 1)))

    def forward(self, pixel_values):
        """Return a 7-tuple of head outputs computed from the pooled features."""
        pooled = self.backbone(pixel_values).pooler_output
        return tuple(
            getattr(self, head_name)(pooled) for head_name in self._HEAD_NAMES
        )
103+
104+
def load_aesthetic_scorer():
    """Load the multi-attribute aesthetic scorer into the module-level globals.

    Populates ``_aesthetic_processor``, ``_aesthetic_backbone`` and
    ``_aesthetic_scorer``. The checkpoint at ``SCORER_MODEL_PATH`` is handled
    in one of two ways:

    * a dict whose values are all tensors is treated as a state dict and
      loaded (non-strictly) into a fresh ``AestheticScorer``;
    * anything else is used as the scorer object as-is (presumably a fully
      pickled module - confirm against the published checkpoint).

    Any failure is logged with its traceback and all three globals are reset
    to ``None``; the function never raises, so the API can still start
    without the scorer.
    """
    global _aesthetic_scorer, _aesthetic_processor, _aesthetic_backbone
    try:
        # local_files_only on both loads keeps startup fully offline instead
        # of letting a missing directory fall through to a hub lookup.
        _aesthetic_processor = CLIPProcessor.from_pretrained(
            SCORER_DIR, use_fast=False, local_files_only=True
        )
        _aesthetic_backbone = CLIPVisionModel.from_pretrained(
            CLIP_BASE_DIR, local_files_only=True
        ).to(device)
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints from a trusted source.
        loaded = torch.load(SCORER_MODEL_PATH, map_location=device)
        is_state_dict = isinstance(loaded, dict) and all(
            isinstance(v, torch.Tensor) for v in loaded.values()
        )
        if is_state_dict:
            # The heads are created on CPU; move the whole module so they sit
            # on the same device as the backbone and the inputs.
            scorer = AestheticScorer(_aesthetic_backbone).to(device)
            report = scorer.load_state_dict(loaded, strict=False)
            # strict=False hides mismatches. Missing backbone.* keys are fine
            # for a heads-only checkpoint, but a missing head key means that
            # head keeps its random initialisation, so surface it.
            missing_heads = [
                key for key in report.missing_keys
                if not key.startswith("backbone.")
            ]
            if missing_heads or report.unexpected_keys:
                logger.warning(
                    "Aesthetic scorer checkpoint mismatch: missing=%s unexpected=%s",
                    missing_heads,
                    list(report.unexpected_keys),
                )
            _aesthetic_scorer = scorer
        else:
            _aesthetic_scorer = loaded
        _aesthetic_scorer.eval()
        logger.info("✅ Aesthetic scorer loaded.")
    except Exception as e:
        # Best-effort load: keep the API usable, but record the traceback.
        logger.exception(f"⚠️ Failed to load aesthetic scorer: {e}")
        _aesthetic_scorer = None
        _aesthetic_processor = None
        _aesthetic_backbone = None
123+
124+
load_aesthetic_scorer()
70125

71126
async def score_aesthetic(req: Request) -> float:
72127
img_bytes = await req.body()
@@ -80,3 +135,94 @@ async def score_aesthetic(req: Request) -> float:
80135
score = score_tensor.item()
81136

82137
return float(score)
138+
139+
def _grayscale_np(img: Image.Image, size: int = 256) -> np.ndarray:
    """Return *img* as a ``size`` x ``size`` float32 grayscale array in [0, 1].

    The image is resized first and then converted to mode ``"L"``.
    """
    square = img.resize((size, size))
    luminance = np.array(square.convert("L"), dtype=np.float32)
    return luminance / 255.0
142+
143+
def _edges_intensity(img: Image.Image, size: int = 256) -> np.ndarray:
    """Return a ``size`` x ``size`` float32 edge-strength map in [0, 1].

    The image is resized, passed through PIL's ``FIND_EDGES`` filter, and
    converted to mode ``"L"`` before normalisation.
    """
    square = img.resize((size, size))
    edge_image = square.filter(ImageFilter.FIND_EDGES).convert("L")
    strength = np.array(edge_image, dtype=np.float32)
    return strength / 255.0
147+
148+
def _rule_of_thirds_score(edge_map: np.ndarray) -> float:
149+
if edge_map.size == 0:
150+
return 0.0
151+
h, w = edge_map.shape
152+
ys = np.linspace(0, h - 1, h, dtype=np.float32)
153+
xs = np.linspace(0, w - 1, w, dtype=np.float32)
154+
yy, xx = np.meshgrid(ys, xs, indexing="ij")
155+
thirds_y = np.array([h / 3, 2 * h / 3], dtype=np.float32)
156+
thirds_x = np.array([w / 3, 2 * w / 3], dtype=np.float32)
157+
sigma = min(h, w) / 12
158+
weight = np.zeros_like(edge_map, dtype=np.float32)
159+
for ty in thirds_y:
160+
for tx in thirds_x:
161+
weight += np.exp(-(((yy - ty) ** 2 + (xx - tx) ** 2) / (2 * sigma ** 2)))
162+
weighted = float((edge_map * weight).sum())
163+
total = float(edge_map.sum())
164+
if total <= 0:
165+
return 0.0
166+
ratio = weighted / total
167+
return float(max(0.0, min(10.0, ratio * 10)))
168+
169+
def _visual_interest_score(edge_map: np.ndarray) -> float:
170+
mean_edge = float(edge_map.mean())
171+
score = mean_edge * 60.0
172+
return float(max(0.0, min(10.0, score)))
173+
174+
def _sharpness_score(gray: np.ndarray) -> float:
175+
if gray.size == 0:
176+
return 0.0
177+
padded = np.pad(gray, 1, mode="edge")
178+
lap = (
179+
padded[:-2, 1:-1]
180+
+ padded[2:, 1:-1]
181+
+ padded[1:-1, :-2]
182+
+ padded[1:-1, 2:]
183+
- 4 * padded[1:-1, 1:-1]
184+
)
185+
variance = float(lap.var())
186+
score = variance * 1000.0
187+
return float(max(0.0, min(10.0, score)))
188+
189+
def _score_with_aesthetic_model(img: Image.Image) -> dict | None:
    """Run the loaded aesthetic scorer on *img* and return its labelled outputs.

    Returns ``None`` when the scorer or its processor is not loaded.
    Otherwise returns a dict mapping each label to the scorer output at the
    same position, converted to a Python float.
    """
    model_ready = _aesthetic_scorer is not None and _aesthetic_processor is not None
    if not model_ready:
        return None

    encoded = _aesthetic_processor(images=img, return_tensors="pt")
    pixel_values = encoded["pixel_values"].to(device)
    with torch.no_grad():
        head_outputs = _aesthetic_scorer(pixel_values)

    labels = ["overall", "quality", "composition", "lighting", "color", "depth_of_field", "content"]
    labelled = {}
    for label, value in zip(labels, head_outputs):
        labelled[label] = float(value.item())
    return labelled
197+
198+
async def score_photo_tips(req: Request) -> dict:
    """Score the image in the request body and return scores plus tips.

    Heuristic scores (each 0-10) are computed from an edge map and a
    grayscale copy of the image. Composition is 80% visual interest plus 20%
    rule of thirds. When the aesthetic model is available, its "overall"
    output is doubled (presumably rescaling it to 0-10 - confirm against the
    model card) and blended 70/30 with composition. The result is multiplied
    by a sharpness factor of 0.9 + sharpness / 20, clamped to [0, 10], and
    reported times 10, i.e. on a 0-100 scale.
    """
    payload = await req.body()
    photo = Image.open(io.BytesIO(payload)).convert("RGB")

    edges = _edges_intensity(photo)
    thirds_score = _rule_of_thirds_score(edges)
    interest_score = _visual_interest_score(edges)
    sharpness_score = _sharpness_score(_grayscale_np(photo))

    composition = (interest_score * 0.8) + (thirds_score * 0.2)
    model_scores = _score_with_aesthetic_model(photo)
    if model_scores:
        model_overall = model_scores["overall"] * 2
        blended = (model_overall * 0.7) + (composition * 0.3)
    else:
        # No model available: fall back to the heuristic composition score.
        blended = composition
    sharpness_factor = 0.9 + (sharpness_score / 20.0)
    overall_score = blended * sharpness_factor

    # One tip for every heuristic score that falls below 4.
    advice = (
        (thirds_score, "Try placing the subject near rule-of-thirds intersections."),
        (interest_score, "Add more texture, contrast, or a clearer subject to increase visual interest."),
        (sharpness_score, "Looks a bit soft; try a faster shutter or steadier shot."),
    )
    tips = [message for value, message in advice if value < 4]
    if not tips:
        tips.append("Strong composition and visual interest.")

    clamped_overall = max(0.0, min(10.0, overall_score))
    return {
        "rule_of_thirds_score": round(thirds_score, 2),
        "visual_interest_score": round(interest_score, 2),
        "sharpness_score": round(sharpness_score, 2),
        "overall_score": round(clamped_overall * 10, 1),
        "model_scores": model_scores,
        "tips": tips,
    }

apps/api/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
import sys
55
import traceback
6-
from aesthetic import score_aesthetic
6+
from aesthetic import score_photo_tips
77
from classify import classify_image
88

99
# Setup logging once
@@ -38,6 +38,6 @@ async def classify_endpoint(req: Request):
3838
@main_py_app.post("/scores")
3939
async def score_endpoint(req: Request):
4040
try:
41-
return {"aesthetic_score": await score_aesthetic(req)}
41+
return await score_photo_tips(req)
4242
except Exception as e:
4343
return error_response(e)

apps/api/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ scikit-learn==1.5.0
66
timm==1.0.15
77
torch==2.0.1
88
torchvision==0.15.2
9+
transformers==4.38.2
910
uvicorn==0.34.3

src/components/OrganizePreviews/PreviewImage.tsx

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,27 @@ const NOT_AVAILABLE = 'N/A'
2020
// Module-level cache to persist across remounts
2121
const scoreCache: Record<string, string> = {}
2222

23-
function formatScore(score: number | undefined): string {
24-
return typeof score === 'number'
25-
? `${Math.abs(score * 100).toFixed(1)}%`
26-
: NOT_AVAILABLE
23+
/**
 * Format a score as a whole-number percentage string.
 *
 * `overall` is used as-is when present (rounded, so it is expected to be on
 * a 0-100 scale already). Otherwise the result is derived from the component
 * scores, which are multiplied by 10: a weighted blend (80% interest, 20%
 * thirds) when both exist, or whichever one exists. Returns NOT_AVAILABLE
 * when no value is a number.
 */
function formatScore(
  overall: number | undefined,
  interest: number | undefined,
  thirds: number | undefined,
): string {
  if (typeof overall === 'number') {
    return `${Math.round(overall)}%`
  }

  let componentScore: number
  if (typeof interest === 'number' && typeof thirds === 'number') {
    componentScore = (interest * 0.8) + (thirds * 0.2)
  } else if (typeof interest === 'number') {
    componentScore = interest
  } else if (typeof thirds === 'number') {
    componentScore = thirds
  } else {
    return NOT_AVAILABLE
  }
  return `${Math.round(componentScore * 10)}%`
}
2845

2946
// SWR fetcher that uses the cache
@@ -38,7 +55,11 @@ const fetchScore = async (absolutePath: string) => {
3855
})
3956
if (!res.ok) throw new Error('Failed to fetch')
4057
const data = await res.json()
41-
const scoreStr = formatScore(data.aesthetic_score)
58+
const scoreStr = formatScore(
59+
data.overall_score,
60+
data.visual_interest_score,
61+
data.rule_of_thirds_score,
62+
)
4263
scoreCache[absolutePath] = scoreStr
4364
return scoreStr
4465
}
@@ -58,7 +79,7 @@ function DraggableThumb({
5879
<Link href={absolutePath} target="_blank" title="View original in new tab">
5980
{filename}
6081
</Link>
61-
&nbsp;<span title='Aesthetic score'>{displayScore}</span>
82+
&nbsp;<span title="Composition scores">{displayScore}</span>
6283
</span>
6384
<Img
6485
key={`thumbnail-${filename}`}

0 commit comments

Comments
 (0)