Skip to content

Commit ab8050e

Browse files
committed
"changes on yolo inference"
1 parent bfb6c14 commit ab8050e

File tree

15 files changed

+432
-87
lines changed

15 files changed

+432
-87
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,5 @@ htmlcov/
8080
*.temp
8181
.claude
8282
backend/docs
83+
# Local scripts
84+
commit-and-push.sh

apps/api-inference-yolo/src/app/integrations/sam3/inference.py

Lines changed: 95 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ def __init__(self):
3232

3333
self.model_path = resolved_path
3434
self.device = self._get_device()
35-
self.predictor = None
35+
self.feature_predictor = None
36+
self.inference_predictor = None
3637
self.visualizer = Sam3Visualizer()
3738

3839
logger.info(f"SAM3 (Ultralytics) inference initialized - Model: {self.model_path}, Device: {self.device}")
@@ -102,13 +103,27 @@ def load_model(self):
102103
task="segment",
103104
mode="predict",
104105
model=self.model_path,
105-
half=True if self.device == "cuda" else False, # Use FP16 for faster inference on CUDA
106+
half=False, # Use full precision as requested
106107
save=False, # Don't save prediction results to disk
107-
device=self.device
108+
device=self.device,
109+
verbose=False # Reduce noise in logs
108110
)
109111

110112
try:
111-
self.predictor = SAM3SemanticPredictor(overrides=overrides)
113+
# Initialize two predictors to follow the working reference implementation exactly
114+
# One for feature extraction (image encoding), one for inference (decoding)
115+
self.feature_predictor = SAM3SemanticPredictor(overrides=overrides)
116+
self.inference_predictor = SAM3SemanticPredictor(overrides=overrides)
117+
118+
# Setup both models
119+
self.feature_predictor.setup_model()
120+
self.inference_predictor.setup_model()
121+
122+
# Share the underlying model to save VRAM if possible, while keeping separate predictor states
123+
if hasattr(self.feature_predictor, 'model') and self.feature_predictor.model is not None:
124+
self.inference_predictor.model = self.feature_predictor.model
125+
logger.info("Shared underlying model between predictors to optimize VRAM")
126+
112127
logger.info(f"SAM3 model loaded successfully on {self.device} (FP16: {overrides['half']})")
113128
except Exception as e:
114129
logger.error(f"Failed to load SAM3 model: {e}")
@@ -135,62 +150,109 @@ async def _load_image_from_upload(self, file: UploadFile) -> np.ndarray:
135150
return np.array(image_pil)
136151

137152
async def _run_inference(self, image_np, conf_threshold=None, **kwargs) -> tuple:
138-
"""Shared inference logic following test-yolo-sam3.py pattern.
153+
"""Shared inference logic following feature-based inference pattern.
139154
140155
Args:
141156
image_np: Image as numpy array
142157
conf_threshold: Confidence threshold to override model default
143158
**kwargs: Arguments to pass to predictor (text, bboxes, etc.)
144159
"""
145160

146-
# Update predictor confidence if specified
147-
if conf_threshold is not None and hasattr(self.predictor, 'args'):
148-
self.predictor.args.conf = conf_threshold
161+
# Update predictor confidence if specified for inference predictor
162+
if conf_threshold is not None and hasattr(self.inference_predictor, 'args'):
163+
self.inference_predictor.args.conf = conf_threshold
149164
logger.info(f"Set confidence threshold to {conf_threshold}")
150165

151-
# Set image (like predictor.set_image() in test script)
152-
self.predictor.set_image(image_np)
153-
154-
# Run prediction (like predictor(text=[...]) in test script)
155-
results = self.predictor(**kwargs)
156-
157-
# Handle results - Ultralytics returns Results object or list
158-
if isinstance(results, list):
159-
result = results[0] if len(results) > 0 else None
166+
# 1. Extract features using feature_predictor (like predictor.set_image())
167+
# Ultralytics models typically expect BGR format when passed numpy arrays directly
168+
# because they are built around OpenCV which uses BGR by default
169+
if image_np.ndim == 3 and image_np.shape[2] == 3:
170+
logger.info("Converting image from RGB to BGR for Ultralytics predictor")
171+
image_input = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
160172
else:
161-
result = results
173+
image_input = image_np
174+
175+
self.feature_predictor.set_image(image_input)
176+
src_shape = image_np.shape[:2] # Get image shape (height, width)
162177

163-
if result is None:
164-
logger.warning("No results returned from predictor")
178+
# Verify features are extracted
179+
if not hasattr(self.feature_predictor, 'features') or self.feature_predictor.features is None:
180+
logger.error("Failed to extract features from image")
181+
return None, None, [], [], []
182+
183+
# 2. Run inference using inference_predictor reusing features (like predictor2.inference_features)
184+
try:
185+
# Extract text or bboxes from kwargs
186+
text_prompts = kwargs.get('text', None)
187+
bboxes = kwargs.get('bboxes', None)
188+
189+
if text_prompts is not None:
190+
logger.info(f"Running feature-based inference with text prompts: {text_prompts}")
191+
masks, boxes = self.inference_predictor.inference_features(
192+
self.feature_predictor.features,
193+
src_shape=src_shape,
194+
text=text_prompts
195+
)
196+
elif bboxes is not None:
197+
logger.info(f"Running feature-based inference with {len(bboxes)} bounding boxes")
198+
masks, boxes = self.inference_predictor.inference_features(
199+
self.feature_predictor.features,
200+
src_shape=src_shape,
201+
bboxes=bboxes
202+
)
203+
else:
204+
logger.error("No text prompts or bounding boxes provided")
205+
return None, None, [], [], []
206+
207+
# Inference result logging
208+
box_count = 0
209+
mask_count = 0
210+
211+
if boxes is not None:
212+
box_count = len(boxes)
213+
if torch.is_tensor(boxes):
214+
logger.info(f"Boxes tensor shape: {boxes.shape}, device: {boxes.device}")
215+
216+
if masks is not None:
217+
mask_count = len(masks)
218+
if torch.is_tensor(masks):
219+
logger.info(f"Masks tensor shape: {masks.shape}, device: {masks.device}")
220+
221+
logger.info(f"Feature-based inference successful: {mask_count} masks, {box_count} boxes")
222+
223+
except Exception as e:
224+
logger.error(f"Feature-based inference failed: {e}", exc_info=True)
165225
return None, None, [], [], []
166226

167227
boxes_list = []
168228
scores_list = []
169229
masks_polygon = []
170230
masks_tensor = None
171231

172-
# Extract masks
173-
if hasattr(result, 'masks') and result.masks is not None:
174-
masks_tensor = result.masks.data # Get mask tensors [N, H, W]
232+
# Process masks
233+
if masks is not None and len(masks) > 0:
234+
masks_tensor = masks # Already tensor format from inference_features
175235
# Convert masks to polygon format
176236
masks_polygon = masks_to_polygon_data(masks_tensor)
177237
logger.info(f"Extracted {len(masks_polygon)} mask(s)")
178238
else:
179239
logger.warning("No masks found in results")
180240

181-
# Extract boxes and scores
182-
if hasattr(result, 'boxes') and result.boxes is not None:
183-
boxes_list = result.boxes.xyxy.cpu().tolist() # [x1, y1, x2, y2] format
184-
if hasattr(result.boxes, 'conf') and result.boxes.conf is not None:
185-
scores_list = result.boxes.conf.cpu().tolist()
186-
else:
187-
# Default confidence if not available
188-
scores_list = [1.0] * len(boxes_list)
189-
logger.info(f"Extracted {len(boxes_list)} box(es) with confidences: {scores_list}")
241+
# Process boxes - inference_features returns boxes as tensor
242+
if boxes is not None and len(boxes) > 0:
243+
boxes_list = boxes.cpu().tolist() if torch.is_tensor(boxes) else boxes.tolist()
244+
# inference_features doesn't return confidence scores directly
245+
# Use default confidence based on threshold
246+
scores_list = [conf_threshold if conf_threshold else settings.SAM3_DEFAULT_THRESHOLD] * len(boxes_list)
247+
logger.info(f"Extracted {len(boxes_list)} box(es)")
248+
249+
# Log confidence for each detection
250+
for idx, (box, score) in enumerate(zip(boxes_list, scores_list)):
251+
logger.info(f" Detection #{idx+1}: bbox={box}, confidence={score:.4f}")
190252
else:
191253
logger.warning("No boxes found in results")
192254

193-
return result, masks_tensor, boxes_list, scores_list, masks_polygon
255+
return None, masks_tensor, boxes_list, scores_list, masks_polygon
194256

195257

196258
async def inference_text(

apps/api-inference/src/app/integrations/sam3/inference.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,20 +143,28 @@ async def inference_text(
143143
outputs = self.model(**inputs)
144144

145145
# Post-process
146+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
146147
results = self.processor.post_process_instance_segmentation(
147148
outputs,
148-
threshold=threshold,
149+
threshold=0.0,
149150
mask_threshold=mask_threshold,
150151
target_sizes=inputs.get("original_sizes").tolist(),
151152
)[0]
152153

154+
# Filter by threshold manually
155+
keep = results["scores"] > threshold
156+
157+
scores = results["scores"][keep]
158+
boxes = results["boxes"][keep]
159+
masks = results["masks"][keep]
160+
153161
# Prepare response
154-
num_objects = len(results["scores"])
155-
boxes_list = results["boxes"].cpu().tolist()
156-
scores_list = results["scores"].cpu().tolist()
162+
num_objects = len(scores)
163+
boxes_list = boxes.cpu().tolist()
164+
scores_list = scores.cpu().tolist()
157165

158166
# Convert masks to polygon coordinates
159-
masks_polygon = masks_to_polygon_data(results["masks"])
167+
masks_polygon = masks_to_polygon_data(masks)
160168

161169
processing_time_ms = (time.perf_counter() - start_time) * 1000
162170

@@ -234,20 +242,28 @@ async def inference_bbox(
234242
outputs = self.model(**inputs)
235243

236244
# Post-process
245+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
237246
results = self.processor.post_process_instance_segmentation(
238247
outputs,
239-
threshold=threshold,
248+
threshold=0.0,
240249
mask_threshold=mask_threshold,
241250
target_sizes=inputs.get("original_sizes").tolist(),
242251
)[0]
243252

253+
# Filter by threshold manually
254+
keep = results["scores"] > threshold
255+
256+
scores = results["scores"][keep]
257+
boxes = results["boxes"][keep]
258+
masks = results["masks"][keep]
259+
244260
# Prepare response
245-
num_objects = len(results["scores"])
246-
boxes_list = results["boxes"].cpu().tolist()
247-
scores_list = results["scores"].cpu().tolist()
261+
num_objects = len(scores)
262+
boxes_list = boxes.cpu().tolist()
263+
scores_list = scores.cpu().tolist()
248264

249265
# Convert masks to polygon coordinates
250-
masks_polygon = masks_to_polygon_data(results["masks"])
266+
masks_polygon = masks_to_polygon_data(masks)
251267

252268
processing_time_ms = (time.perf_counter() - start_time) * 1000
253269

@@ -323,22 +339,30 @@ async def inference_batch(
323339
outputs = self.model(**inputs)
324340

325341
# Post-process
342+
# Use 0.0 threshold to get raw scores, then filter manually to ensure accuracy
326343
results = self.processor.post_process_instance_segmentation(
327344
outputs,
328-
threshold=threshold,
345+
threshold=0.0,
329346
mask_threshold=mask_threshold,
330347
target_sizes=inputs.get("original_sizes").tolist(),
331348
)
332349

333350
# Process each result
334351
batch_results = []
335352
for idx, (image, result) in enumerate(zip(images, results)):
336-
num_objects = len(result["scores"])
337-
boxes_list = result["boxes"].cpu().tolist()
338-
scores_list = result["scores"].cpu().tolist()
353+
# Filter by threshold manually
354+
keep = result["scores"] > threshold
355+
356+
scores = result["scores"][keep]
357+
boxes = result["boxes"][keep]
358+
masks = result["masks"][keep]
359+
360+
num_objects = len(scores)
361+
boxes_list = boxes.cpu().tolist()
362+
scores_list = scores.cpu().tolist()
339363

340364
# Convert masks to polygon coordinates
341-
masks_polygon = masks_to_polygon_data(result["masks"])
365+
masks_polygon = masks_to_polygon_data(masks)
342366

343367
result_item = {
344368
"image_index": idx,

apps/requirements_linux.txt

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
annotated-doc==0.0.4
2+
annotated-types==0.7.0
3+
anyio==4.12.1
4+
certifi==2026.1.4
5+
charset-normalizer==3.4.4
6+
click==8.3.1
7+
clip @ git+https://github.com/ultralytics/CLIP.git@88ade288431a46233f1556d1e141901b3ef0a36b
8+
contourpy==1.3.3
9+
cuda-bindings==12.9.4
10+
cuda-pathfinder==1.3.3
11+
cycler==0.12.1
12+
fastapi==0.128.0
13+
filelock==3.20.3
14+
fonttools==4.61.1
15+
fsspec==2026.1.0
16+
ftfy==6.3.1
17+
h11==0.16.0
18+
hf-xet==1.2.0
19+
httpcore==1.0.9
20+
httpx==0.28.1
21+
huggingface_hub==1.3.7
22+
idna==3.11
23+
Jinja2==3.1.6
24+
kiwisolver==1.4.9
25+
MarkupSafe==3.0.3
26+
matplotlib==3.10.8
27+
mpmath==1.3.0
28+
networkx==3.6.1
29+
numpy==2.2.6
30+
nvidia-cublas-cu12==12.8.4.1
31+
nvidia-cuda-cupti-cu12==12.8.90
32+
nvidia-cuda-nvrtc-cu12==12.8.93
33+
nvidia-cuda-runtime-cu12==12.8.90
34+
nvidia-cudnn-cu12==9.10.2.21
35+
nvidia-cufft-cu12==11.3.3.83
36+
nvidia-cufile-cu12==1.13.1.3
37+
nvidia-curand-cu12==10.3.9.90
38+
nvidia-cusolver-cu12==11.7.3.90
39+
nvidia-cusparse-cu12==12.5.8.93
40+
nvidia-cusparselt-cu12==0.7.1
41+
nvidia-nccl-cu12==2.27.5
42+
nvidia-nvjitlink-cu12==12.8.93
43+
nvidia-nvshmem-cu12==3.4.5
44+
nvidia-nvtx-cu12==12.8.90
45+
opencv-python==4.13.0.90
46+
opencv-python-headless==4.12.0.88
47+
packaging==26.0
48+
pillow==12.1.0
49+
polars==1.37.1
50+
polars-runtime-32==1.37.1
51+
psutil==7.2.2
52+
pydantic==2.12.5
53+
pydantic-settings==2.12.0
54+
pydantic_core==2.41.5
55+
pyparsing==3.3.2
56+
python-dateutil==2.9.0.post0
57+
python-dotenv==1.2.1
58+
python-multipart==0.0.22
59+
PyYAML==6.0.3
60+
regex==2026.1.15
61+
requests==2.32.5
62+
safetensors==0.7.0
63+
sam3-app==0.1.0
64+
-e git+https://github.com/Rajkisan/annotate-anu.git@4ac6f9fba72207235c41cc01d00ae8845d688bea#egg=sam3_yolo_app&subdirectory=apps/api-inference-yolo
65+
scipy==1.17.0
66+
setuptools==80.10.2
67+
shellingham==1.5.4
68+
six==1.17.0
69+
starlette==0.50.0
70+
sympy==1.14.0
71+
timm==1.0.24
72+
tokenizers==0.22.2
73+
torch==2.10.0
74+
torchvision==0.25.0
75+
tqdm==4.67.3
76+
transformers==5.0.0
77+
triton==3.6.0
78+
typer-slim==0.21.1
79+
typing-inspection==0.4.2
80+
typing_extensions==4.15.0
81+
ultralytics==8.4.11
82+
ultralytics-thop==2.0.18
83+
urllib3==2.6.3
84+
uvicorn==0.40.0
85+
wcwidth==0.5.3
86+
wheel==0.46.3

0 commit comments

Comments
 (0)