Skip to content

Commit ab8050e

Browse files
committed
"changes on yolo inference"
1 parent bfb6c14 commit ab8050e

File tree

15 files changed

+432
-87
lines changed

15 files changed

+432
-87
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,5 @@ htmlcov/
8080
*.temp
8181
.claude
8282
backend/docs
83+
# Local scripts
84+
commit-and-push.sh

apps/api-inference-yolo/src/app/integrations/sam3/inference.py

Lines changed: 95 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ def __init__(self):
3232

3333
self.model_path = resolved_path
3434
self.device = self._get_device()
35-
self.predictor = None
35+
self.feature_predictor = None
36+
self.inference_predictor = None
3637
self.visualizer = Sam3Visualizer()
3738

3839
logger.info(f"SAM3 (Ultralytics) inference initialized - Model: {self.model_path}, Device: {self.device}")
@@ -102,13 +103,27 @@ def load_model(self):
102103
task="segment",
103104
mode="predict",
104105
model=self.model_path,
105-
half=True if self.device == "cuda" else False, # Use FP16 for faster inference on CUDA
106+
half=False, # Use full precision as requested
106107
save=False, # Don't save prediction results to disk
107-
device=self.device
108+
device=self.device,
109+
verbose=False # Reduce noise in logs
108110
)
109111

110112
try:
111-
self.predictor = SAM3SemanticPredictor(overrides=overrides)
113+
# Initialize two predictors to follow the working reference implementation exactly
114+
# One for feature extraction (image encoding), one for inference (decoding)
115+
self.feature_predictor = SAM3SemanticPredictor(overrides=overrides)
116+
self.inference_predictor = SAM3SemanticPredictor(overrides=overrides)
117+
118+
# Setup both models
119+
self.feature_predictor.setup_model()
120+
self.inference_predictor.setup_model()
121+
122+
# Share the underlying model to save VRAM if possible, while keeping separate predictor states
123+
if hasattr(self.feature_predictor, 'model') and self.feature_predictor.model is not None:
124+
self.inference_predictor.model = self.feature_predictor.model
125+
logger.info("Shared underlying model between predictors to optimize VRAM")
126+
112127
logger.info(f"SAM3 model loaded successfully on {self.device} (FP16: {overrides['half']})")
113128
except Exception as e:
114129
logger.error(f"Failed to load SAM3 model: {e}")
@@ -135,62 +150,109 @@ async def _load_image_from_upload(self, file: UploadFile) -> np.ndarray:
135150
return np.array(image_pil)
136151

137152
async def _run_inference(self, image_np, conf_threshold=None, **kwargs) -> tuple:
138-
"""Shared inference logic following test-yolo-sam3.py pattern.
153+
"""Shared inference logic following feature-based inference pattern.
139154
140155
Args:
141156
image_np: Image as numpy array
142157
conf_threshold: Confidence threshold to override model default
143158
**kwargs: Arguments to pass to predictor (text, bboxes, etc.)
144159
"""
145160

146-
# Update predictor confidence if specified
147-
if conf_threshold is not None and hasattr(self.predictor, 'args'):
148-
self.predictor.args.conf = conf_threshold
161+
# Update predictor confidence if specified for inference predictor
162+
if conf_threshold is not None and hasattr(self.inference_predictor, 'args'):
163+
self.inference_predictor.args.conf = conf_threshold
149164
logger.info(f"Set confidence threshold to {conf_threshold}")
150165

151-
# Set image (like predictor.set_image() in test script)
152-
self.predictor.set_image(image_np)
153-
154-
# Run prediction (like predictor(text=[...]) in test script)
155-
results = self.predictor(**kwargs)
156-
157-
# Handle results - Ultralytics returns Results object or list
158-
if isinstance(results, list):
159-
result = results[0] if len(results) > 0 else None
166+
# 1. Extract features using feature_predictor (like predictor.set_image())
167+
# Ultralytics models typically expect BGR format when passed numpy arrays directly
168+
# because they are built around OpenCV which uses BGR by default
169+
if image_np.ndim == 3 and image_np.shape[2] == 3:
170+
logger.info("Converting image from RGB to BGR for Ultralytics predictor")
171+
image_input = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
160172
else:
161-
result = results
173+
image_input = image_np
174+
175+
self.feature_predictor.set_image(image_input)
176+
src_shape = image_np.shape[:2] # Get image shape (height, width)
162177

163-
if result is None:
164-
logger.warning("No results returned from predictor")
178+
# Verify features are extracted
179+
if not hasattr(self.feature_predictor, 'features') or self.feature_predictor.features is None:
180+
logger.error("Failed to extract features from image")
181+
return None, None, [], [], []
182+
183+
# 2. Run inference using inference_predictor reusing features (like predictor2.inference_features)
184+
try:
185+
# Extract text or bboxes from kwargs
186+
text_prompts = kwargs.get('text', None)
187+
bboxes = kwargs.get('bboxes', None)
188+
189+
if text_prompts is not None:
190+
logger.info(f"Running feature-based inference with text prompts: {text_prompts}")
191+
masks, boxes = self.inference_predictor.inference_features(
192+
self.feature_predictor.features,
193+
src_shape=src_shape,
194+
text=text_prompts
195+
)
196+
elif bboxes is not None:
197+
logger.info(f"Running feature-based inference with {len(bboxes)} bounding boxes")
198+
masks, boxes = self.inference_predictor.inference_features(
199+
self.feature_predictor.features,
200+
src_shape=src_shape,
201+
bboxes=bboxes
202+
)
203+
else:
204+
logger.error("No text prompts or bounding boxes provided")
205+
return None, None, [], [], []
206+
207+
# Inference result logging
208+
box_count = 0
209+
mask_count = 0
210+
211+
if boxes is not None:
212+
box_count = len(boxes)
213+
if torch.is_tensor(boxes):
214+
logger.info(f"Boxes tensor shape: {boxes.shape}, device: {boxes.device}")
215+
216+
if masks is not None:
217+
mask_count = len(masks)
218+
if torch.is_tensor(masks):
219+
logger.info(f"Masks tensor shape: {masks.shape}, device: {masks.device}")
220+
221+
logger.info(f"Feature-based inference successful: {mask_count} masks, {box_count} boxes")
222+
223+
except Exception as e:
224+
logger.error(f"Feature-based inference failed: {e}", exc_info=True)
165225
return None, None, [], [], []
166226

167227
boxes_list = []
168228
scores_list = []
169229
masks_polygon = []
170230
masks_tensor = None
171231

172-
# Extract masks
173-
if hasattr(result, 'masks') and result.masks is not None:
174-
masks_tensor = result.masks.data # Get mask tensors [N, H, W]
232+
# Process masks
233+
if masks is not None and len(masks) > 0:
234+
masks_tensor = masks # Already tensor format from inference_features
175235
# Convert masks to polygon format
176236
masks_polygon = masks_to_polygon_data(masks_tensor)
177237
logger.info(f"Extracted {len(masks_polygon)} mask(s)")
178238
else:
179239
logger.warning("No masks found in results")
180240

181-
# Extract boxes and scores
182-
if hasattr(result, 'boxes') and result.boxes is not None:
183-
boxes_list = result.boxes.xyxy.cpu().tolist() # [x1, y1, x2, y2] format
184-
if hasattr(result.boxes, 'conf') and result.boxes.conf is not None:
185-
scores_list = result.boxes.conf.cpu().tolist()
186-
else:
187-
# Default confidence if not available
188-
scores_list = [1.0] * len(boxes_list)
189-
logger.info(f"Extracted {len(boxes_list)} box(es) with confidences: {scores_list}")
241+
# Process boxes - inference_features returns boxes as tensor
242+
if boxes is not None and len(boxes) > 0:
243+
boxes_list = boxes.cpu().tolist() if torch.is_tensor(boxes) else boxes.tolist()
244+
# inference_features doesn't return confidence scores directly
245+
# Use default confidence based on threshold
246+
scores_list = [conf_threshold if conf_threshold else settings.SAM3_DEFAULT_THRESHOLD] * len(boxes_list)
247+
logger.info(f"Extracted {len(boxes_list)} box(es)")
248+
249+
# Log confidence for each detection
250+
for idx, (box, score) in enumerate(zip(boxes_list, scores_list)):
251+
logger.info(f" Detection #{idx+1}: bbox={box}, confidence={score:.4f}")
190252
else:
191253
logger.warning("No boxes found in results")
192254

193-
return result, masks_tensor, boxes_list, scores_list, masks_polygon
255+
return None, masks_tensor, boxes_list, scores_list, masks_polygon
194256

195257

196258
async def inference_text(

apps/api-inference/src/app/integrations/sam3/inference.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,20 +143,28 @@ async def inference_text(
143143
outputs = self.model(**inputs)
144144

145145
# Post-process
146+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
146147
results = self.processor.post_process_instance_segmentation(
147148
outputs,
148-
threshold=threshold,
149+
threshold=0.0,
149150
mask_threshold=mask_threshold,
150151
target_sizes=inputs.get("original_sizes").tolist(),
151152
)[0]
152153

154+
# Filter by threshold manually
155+
keep = results["scores"] > threshold
156+
157+
scores = results["scores"][keep]
158+
boxes = results["boxes"][keep]
159+
masks = results["masks"][keep]
160+
153161
# Prepare response
154-
num_objects = len(results["scores"])
155-
boxes_list = results["boxes"].cpu().tolist()
156-
scores_list = results["scores"].cpu().tolist()
162+
num_objects = len(scores)
163+
boxes_list = boxes.cpu().tolist()
164+
scores_list = scores.cpu().tolist()
157165

158166
# Convert masks to polygon coordinates
159-
masks_polygon = masks_to_polygon_data(results["masks"])
167+
masks_polygon = masks_to_polygon_data(masks)
160168

161169
processing_time_ms = (time.perf_counter() - start_time) * 1000
162170

@@ -234,20 +242,28 @@ async def inference_bbox(
234242
outputs = self.model(**inputs)
235243

236244
# Post-process
245+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
237246
results = self.processor.post_process_instance_segmentation(
238247
outputs,
239-
threshold=threshold,
248+
threshold=0.0,
240249
mask_threshold=mask_threshold,
241250
target_sizes=inputs.get("original_sizes").tolist(),
242251
)[0]
243252

253+
# Filter by threshold manually
254+
keep = results["scores"] > threshold
255+
256+
scores = results["scores"][keep]
257+
boxes = results["boxes"][keep]
258+
masks = results["masks"][keep]
259+
244260
# Prepare response
245-
num_objects = len(results["scores"])
246-
boxes_list = results["boxes"].cpu().tolist()
247-
scores_list = results["scores"].cpu().tolist()
261+
num_objects = len(scores)
262+
boxes_list = boxes.cpu().tolist()
263+
scores_list = scores.cpu().tolist()
248264

249265
# Convert masks to polygon coordinates
250-
masks_polygon = masks_to_polygon_data(results["masks"])
266+
masks_polygon = masks_to_polygon_data(masks)
251267

252268
processing_time_ms = (time.perf_counter() - start_time) * 1000
253269

@@ -323,22 +339,30 @@ async def inference_batch(
323339
outputs = self.model(**inputs)
324340

325341
# Post-process
342+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
326343
results = self.processor.post_process_instance_segmentation(
327344
outputs,
328-
threshold=threshold,
345+
threshold=0.0,
329346
mask_threshold=mask_threshold,
330347
target_sizes=inputs.get("original_sizes").tolist(),
331348
)
332349

333350
# Process each result
334351
batch_results = []
335352
for idx, (image, result) in enumerate(zip(images, results)):
336-
num_objects = len(result["scores"])
337-
boxes_list = result["boxes"].cpu().tolist()
338-
scores_list = result["scores"].cpu().tolist()
353+
# Filter by threshold manually
354+
keep = result["scores"] > threshold
355+
356+
scores = result["scores"][keep]
357+
boxes = result["boxes"][keep]
358+
masks = result["masks"][keep]
359+
360+
num_objects = len(scores)
361+
boxes_list = boxes.cpu().tolist()
362+
scores_list = scores.cpu().tolist()
339363

340364
# Convert masks to polygon coordinates
341-
masks_polygon = masks_to_polygon_data(result["masks"])
365+
masks_polygon = masks_to_polygon_data(masks)
342366

343367
result_item = {
344368
"image_index": idx,

apps/requirements_linux.txt

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
annotated-doc==0.0.4
2+
annotated-types==0.7.0
3+
anyio==4.12.1
4+
certifi==2026.1.4
5+
charset-normalizer==3.4.4
6+
click==8.3.1
7+
clip @ git+https://github.com/ultralytics/CLIP.git@88ade288431a46233f1556d1e141901b3ef0a36b
8+
contourpy==1.3.3
9+
cuda-bindings==12.9.4
10+
cuda-pathfinder==1.3.3
11+
cycler==0.12.1
12+
fastapi==0.128.0
13+
filelock==3.20.3
14+
fonttools==4.61.1
15+
fsspec==2026.1.0
16+
ftfy==6.3.1
17+
h11==0.16.0
18+
hf-xet==1.2.0
19+
httpcore==1.0.9
20+
httpx==0.28.1
21+
huggingface_hub==1.3.7
22+
idna==3.11
23+
Jinja2==3.1.6
24+
kiwisolver==1.4.9
25+
MarkupSafe==3.0.3
26+
matplotlib==3.10.8
27+
mpmath==1.3.0
28+
networkx==3.6.1
29+
numpy==2.2.6
30+
nvidia-cublas-cu12==12.8.4.1
31+
nvidia-cuda-cupti-cu12==12.8.90
32+
nvidia-cuda-nvrtc-cu12==12.8.93
33+
nvidia-cuda-runtime-cu12==12.8.90
34+
nvidia-cudnn-cu12==9.10.2.21
35+
nvidia-cufft-cu12==11.3.3.83
36+
nvidia-cufile-cu12==1.13.1.3
37+
nvidia-curand-cu12==10.3.9.90
38+
nvidia-cusolver-cu12==11.7.3.90
39+
nvidia-cusparse-cu12==12.5.8.93
40+
nvidia-cusparselt-cu12==0.7.1
41+
nvidia-nccl-cu12==2.27.5
42+
nvidia-nvjitlink-cu12==12.8.93
43+
nvidia-nvshmem-cu12==3.4.5
44+
nvidia-nvtx-cu12==12.8.90
45+
opencv-python==4.13.0.90
46+
opencv-python-headless==4.12.0.88
47+
packaging==26.0
48+
pillow==12.1.0
49+
polars==1.37.1
50+
polars-runtime-32==1.37.1
51+
psutil==7.2.2
52+
pydantic==2.12.5
53+
pydantic-settings==2.12.0
54+
pydantic_core==2.41.5
55+
pyparsing==3.3.2
56+
python-dateutil==2.9.0.post0
57+
python-dotenv==1.2.1
58+
python-multipart==0.0.22
59+
PyYAML==6.0.3
60+
regex==2026.1.15
61+
requests==2.32.5
62+
safetensors==0.7.0
63+
sam3-app==0.1.0
64+
-e git+https://github.com/Rajkisan/annotate-anu.git@4ac6f9fba72207235c41cc01d00ae8845d688bea#egg=sam3_yolo_app&subdirectory=apps/api-inference-yolo
65+
scipy==1.17.0
66+
setuptools==80.10.2
67+
shellingham==1.5.4
68+
six==1.17.0
69+
starlette==0.50.0
70+
sympy==1.14.0
71+
timm==1.0.24
72+
tokenizers==0.22.2
73+
torch==2.10.0
74+
torchvision==0.25.0
75+
tqdm==4.67.3
76+
transformers==5.0.0
77+
triton==3.6.0
78+
typer-slim==0.21.1
79+
typing-inspection==0.4.2
80+
typing_extensions==4.15.0
81+
ultralytics==8.4.11
82+
ultralytics-thop==2.0.18
83+
urllib3==2.6.3
84+
uvicorn==0.40.0
85+
wcwidth==0.5.3
86+
wheel==0.46.3

0 commit comments

Comments
 (0)