22import torch
33import torch .nn as nn
44import torchvision .transforms as T
5- from PIL import Image
5+ from PIL import Image , ImageFilter
6+ import numpy as np
67import logging
78from collections import OrderedDict
89import io
910import clip
11+ from transformers import CLIPProcessor , CLIPVisionModel
1012
# Set up logging once at import time; the "uvicorn" logger piggybacks on
# uvicorn's already-configured handlers when running under that server.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("uvicorn")
logger.setLevel(logging.DEBUG)

# On-disk locations of the model artifacts this module loads.
HEAD_PATH = "models/aesthetic/sa_0_4_vit_b_16_linear.pth"
SCORER_DIR = "models/rsinema_aesthetic-scorer"
SCORER_MODEL_PATH = f"{SCORER_DIR}/model.pt"
CLIP_BASE_DIR = "models/openai_clip-vit-base-patch32"

# Prefer GPU when present; everything below honours this device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
1924
@@ -67,6 +72,56 @@ def load_clip_model() -> tuple[torch.nn.Module, callable]:
# One-time global setup: CLIP backbone + linear regression head for the
# legacy aesthetic score endpoint.
_clip_model, preprocess = load_clip_model()
regression_head = load_aesthetic_head(HEAD_PATH)

# Populated by load_aesthetic_scorer(); all three stay None when loading
# fails so callers can degrade gracefully.
_aesthetic_scorer = None
_aesthetic_processor = None
_aesthetic_backbone = None
78+
class AestheticScorer(nn.Module):
    """CLIP-vision backbone topped with seven single-logit regression heads.

    Each head scores one attribute of the image; ``forward`` returns the
    raw logits in the fixed order of ``_HEAD_NAMES``. Attribute names
    (``aesthetic_head`` ... ``content_head``) are part of the checkpoint's
    state_dict key layout and must not change.
    """

    # Order matters: it fixes both attribute creation and forward() output.
    _HEAD_NAMES = ("aesthetic", "quality", "composition", "light", "color", "dof", "content")

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        feature_dim = backbone.config.hidden_size
        for name in self._HEAD_NAMES:
            # nn.Sequential wrapper kept so state_dict keys stay "<name>_head.0.*".
            setattr(self, f"{name}_head", nn.Sequential(nn.Linear(feature_dim, 1)))

    def forward(self, pixel_values):
        """Return a 7-tuple of per-attribute logits for ``pixel_values``."""
        pooled = self.backbone(pixel_values).pooler_output
        return tuple(getattr(self, f"{name}_head")(pooled) for name in self._HEAD_NAMES)
103+
def load_aesthetic_scorer():
    """Load the multi-head aesthetic scorer into the module globals.

    The checkpoint at ``SCORER_MODEL_PATH`` is accepted in two formats: a
    bare state_dict (a dict of tensors), in which case an ``AestheticScorer``
    is rebuilt around the locally cached CLIP vision backbone; or a fully
    pickled model object. On any failure the three globals are reset to
    ``None`` so scoring callers can degrade gracefully.
    """
    global _aesthetic_scorer, _aesthetic_processor, _aesthetic_backbone
    try:
        _aesthetic_processor = CLIPProcessor.from_pretrained(SCORER_DIR, use_fast=False)
        _aesthetic_backbone = CLIPVisionModel.from_pretrained(CLIP_BASE_DIR, local_files_only=True).to(device)
        # NOTE(review): torch.load unpickles arbitrary objects; acceptable only
        # because this checkpoint ships with the app — never point it at an
        # untrusted file.
        loaded = torch.load(SCORER_MODEL_PATH, map_location=device)
        if isinstance(loaded, dict) and all(isinstance(v, torch.Tensor) for v in loaded.values()):
            # Raw state_dict: rebuild the wrapper module. strict=False tolerates
            # missing/extra keys (the backbone weights were already loaded above).
            scorer = AestheticScorer(_aesthetic_backbone)
            scorer.load_state_dict(loaded, strict=False)
            _aesthetic_scorer = scorer
        else:
            # Checkpoint is a pickled nn.Module; use it as-is.
            _aesthetic_scorer = loaded
        _aesthetic_scorer.eval()
        logger.info("✅ Aesthetic scorer loaded.")
    except Exception as e:
        # Best-effort loader: logger.exception (instead of logger.error with an
        # f-string) records the full traceback and defers message formatting.
        logger.exception("⚠️ Failed to load aesthetic scorer: %s", e)
        _aesthetic_scorer = None
        _aesthetic_processor = None
        _aesthetic_backbone = None

load_aesthetic_scorer()
70125
71126async def score_aesthetic (req : Request ) -> float :
72127 img_bytes = await req .body ()
@@ -80,3 +135,94 @@ async def score_aesthetic(req: Request) -> float:
80135 score = score_tensor .item ()
81136
82137 return float (score )
138+
def _grayscale_np(img: Image.Image, size: int = 256) -> np.ndarray:
    """Downscale *img* to ``size`` x ``size`` and return its luminance as a
    float32 array normalised to [0, 1]."""
    gray = img.resize((size, size)).convert("L")
    return np.asarray(gray, dtype=np.float32) / 255.0
142+
def _edges_intensity(img: Image.Image, size: int = 256) -> np.ndarray:
    """Resize *img*, run PIL's FIND_EDGES kernel, and return the edge
    strength map as a float32 array in [0, 1]."""
    edge_img = img.resize((size, size)).filter(ImageFilter.FIND_EDGES).convert("L")
    return np.asarray(edge_img, dtype=np.float32) / 255.0
147+
148+ def _rule_of_thirds_score (edge_map : np .ndarray ) -> float :
149+ if edge_map .size == 0 :
150+ return 0.0
151+ h , w = edge_map .shape
152+ ys = np .linspace (0 , h - 1 , h , dtype = np .float32 )
153+ xs = np .linspace (0 , w - 1 , w , dtype = np .float32 )
154+ yy , xx = np .meshgrid (ys , xs , indexing = "ij" )
155+ thirds_y = np .array ([h / 3 , 2 * h / 3 ], dtype = np .float32 )
156+ thirds_x = np .array ([w / 3 , 2 * w / 3 ], dtype = np .float32 )
157+ sigma = min (h , w ) / 12
158+ weight = np .zeros_like (edge_map , dtype = np .float32 )
159+ for ty in thirds_y :
160+ for tx in thirds_x :
161+ weight += np .exp (- (((yy - ty ) ** 2 + (xx - tx ) ** 2 ) / (2 * sigma ** 2 )))
162+ weighted = float ((edge_map * weight ).sum ())
163+ total = float (edge_map .sum ())
164+ if total <= 0 :
165+ return 0.0
166+ ratio = weighted / total
167+ return float (max (0.0 , min (10.0 , ratio * 10 )))
168+
169+ def _visual_interest_score (edge_map : np .ndarray ) -> float :
170+ mean_edge = float (edge_map .mean ())
171+ score = mean_edge * 60.0
172+ return float (max (0.0 , min (10.0 , score )))
173+
174+ def _sharpness_score (gray : np .ndarray ) -> float :
175+ if gray .size == 0 :
176+ return 0.0
177+ padded = np .pad (gray , 1 , mode = "edge" )
178+ lap = (
179+ padded [:- 2 , 1 :- 1 ]
180+ + padded [2 :, 1 :- 1 ]
181+ + padded [1 :- 1 , :- 2 ]
182+ + padded [1 :- 1 , 2 :]
183+ - 4 * padded [1 :- 1 , 1 :- 1 ]
184+ )
185+ variance = float (lap .var ())
186+ score = variance * 1000.0
187+ return float (max (0.0 , min (10.0 , score )))
188+
def _score_with_aesthetic_model(img: Image.Image) -> dict | None:
    """Run the seven-head aesthetic scorer on *img*.

    Returns a dict of per-attribute floats keyed by label, or ``None``
    when the scorer/processor failed to load at startup.
    """
    if _aesthetic_scorer is None or _aesthetic_processor is None:
        return None
    pixel_values = _aesthetic_processor(images=img, return_tensors="pt")["pixel_values"].to(device)
    with torch.no_grad():
        head_outputs = _aesthetic_scorer(pixel_values)
    # Label order mirrors the head order in AestheticScorer.forward.
    labels = ("overall", "quality", "composition", "lighting", "color", "depth_of_field", "content")
    return {name: float(logit.item()) for name, logit in zip(labels, head_outputs)}
197+
async def score_photo_tips(req: Request) -> dict:
    """Photo-critique endpoint combining heuristics with the learned scorer.

    Reads raw image bytes from the request body and returns component
    scores (0-10), an overall score (0-100), the raw model scores (or
    None when the model is unavailable), and human-readable tips.
    """
    body = await req.body()
    image = Image.open(io.BytesIO(body)).convert("RGB")

    edges = _edges_intensity(image)
    luminance = _grayscale_np(image)

    thirds = _rule_of_thirds_score(edges)
    interest = _visual_interest_score(edges)
    sharpness = _sharpness_score(luminance)

    # Heuristic composition blends visual interest (80%) with thirds placement (20%).
    composition = interest * 0.8 + thirds * 0.2
    # Sharpness scales the final score by a 0.9x-1.4x factor.
    sharpness_factor = 0.9 + sharpness / 20.0

    model_scores = _score_with_aesthetic_model(image)
    if model_scores is None:
        blended = composition
    else:
        # Model "overall" is doubled (presumably 0-5 -> 0-10; confirm against
        # the scorer's training range), then weighted 70/30 with the heuristic.
        blended = (model_scores["overall"] * 2) * 0.7 + composition * 0.3
    overall = blended * sharpness_factor

    tips = []
    if thirds < 4:
        tips.append("Try placing the subject near rule-of-thirds intersections.")
    if interest < 4:
        tips.append("Add more texture, contrast, or a clearer subject to increase visual interest.")
    if sharpness < 4:
        tips.append("Looks a bit soft; try a faster shutter or steadier shot.")
    if not tips:
        tips.append("Strong composition and visual interest.")

    return {
        "rule_of_thirds_score": round(thirds, 2),
        "visual_interest_score": round(interest, 2),
        "sharpness_score": round(sharpness, 2),
        # Clamp to 0-10 then report on a 0-100 scale.
        "overall_score": round(max(0.0, min(10.0, overall)) * 10, 1),
        "model_scores": model_scores,
        "tips": tips,
    }
0 commit comments