Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
5935e2c
docs: tutorials for all current e2e models
TranHuuNhatHuy Feb 16, 2026
821eb03
docs: basic tutorial structure
TranHuuNhatHuy Feb 16, 2026
e0bf1a2
docs: done I.1: Introducing Autoware VisionPilot and its End-to-End M…
TranHuuNhatHuy Feb 16, 2026
cf2da6a
docs: done I.2: Download Autoware VisionPilot
TranHuuNhatHuy Feb 16, 2026
cc24591
docs: done I.3: Environment Setup
TranHuuNhatHuy Feb 16, 2026
82cd3ca
refactor the tutorial to match Zain ideas
TranHuuNhatHuy Feb 23, 2026
c6bcbff
done II.1.a EgoLanes brief introduction
TranHuuNhatHuy Feb 23, 2026
16b0d00
massive refactoring of tutorial narratives
TranHuuNhatHuy Feb 25, 2026
0c9de13
finalize model downloading and env installments
TranHuuNhatHuy Feb 25, 2026
482dfbf
update requirements.txt to accommodate for the newer Python versions
TranHuuNhatHuy Feb 26, 2026
26a5aca
determined the path convention of this tutorial => need to do proper …
TranHuuNhatHuy Feb 26, 2026
28a5227
add init to Models/inference to expose all model inference classes
TranHuuNhatHuy Feb 26, 2026
98c7316
add all inits to Models/visualizations and each of model subdirs
TranHuuNhatHuy Feb 26, 2026
8d326c6
remove all init temporarily
TranHuuNhatHuy Feb 27, 2026
be5c39c
resolve all possible path conflicts and ill-logics
TranHuuNhatHuy Feb 27, 2026
00281b4
add test images for egolanes as tutorial assets (totalling 3.0 MB sho…
TranHuuNhatHuy Feb 27, 2026
e68944e
more python package fix for requirements.txt
TranHuuNhatHuy Feb 27, 2026
f601fca
done Image Inference
TranHuuNhatHuy Feb 27, 2026
de64c8e
refactor assets, add 42 MB video
TranHuuNhatHuy Feb 27, 2026
6c95bc5
make sure the dirpath is automatically created during video gen in vi…
TranHuuNhatHuy Feb 27, 2026
089cfc1
finish EgoLanes Quick Inference
TranHuuNhatHuy Feb 27, 2026
0bfc3da
finish Model Training - how to prepare 5 opensauce datasets
TranHuuNhatHuy Feb 27, 2026
5d8d4a3
finish Model Training - pretrain or vanilla load
TranHuuNhatHuy Feb 27, 2026
d9f4978
finish Model Training - how to load data
TranHuuNhatHuy Feb 27, 2026
75d6c8b
finish Model Training - how to run training
TranHuuNhatHuy Feb 27, 2026
48fe107
finish Model Training - how to visualize results
TranHuuNhatHuy Feb 27, 2026
c4e4e5f
finalizing EgoLAnes tutorials
TranHuuNhatHuy Feb 27, 2026
8a37393
update autospeed image visualization script to allow for batch proces…
TranHuuNhatHuy Mar 1, 2026
f56cdc6
revamp autospeed video vis
TranHuuNhatHuy Mar 1, 2026
4306ebc
init AutoSpeed tutorial notebook
TranHuuNhatHuy Mar 1, 2026
58ea770
init structure for AutoSpeed tutorial notebook
TranHuuNhatHuy Mar 1, 2026
8e09abd
init SceneSeg tutorial notebook
TranHuuNhatHuy Mar 1, 2026
4cd92db
init Scene3D tutorial notebook
TranHuuNhatHuy Mar 1, 2026
2d09ed4
init DomainSeg tutorial notebook
TranHuuNhatHuy Mar 1, 2026
ad9ee69
done autospeed I.1. Model intro
TranHuuNhatHuy Mar 1, 2026
2700caf
done autospeed I.2. Env setup
TranHuuNhatHuy Mar 1, 2026
9a77457
done autospeed I.3. Model download
TranHuuNhatHuy Mar 1, 2026
802c2ae
edit autospeed inference pipeline to avoid hard-coded best.pt weight …
TranHuuNhatHuy Mar 1, 2026
d1e7d3e
done II. Quick Inference
TranHuuNhatHuy Mar 1, 2026
718ed6a
done III thus wrapping up AutoSpeed
TranHuuNhatHuy Mar 1, 2026
42accdf
added DomainSeg model intro
TranHuuNhatHuy Mar 1, 2026
ec4b058
done getting started for domainseg
TranHuuNhatHuy Mar 2, 2026
5f85a66
rough revamp of image vis, DomainSeg
TranHuuNhatHuy Mar 2, 2026
26b88c5
final revamp of DomainSeg img vis
TranHuuNhatHuy Mar 2, 2026
8579f44
finished revamp video vis DomainSeg
TranHuuNhatHuy Mar 2, 2026
7c61763
done DomainSeg quick inference part
TranHuuNhatHuy Mar 2, 2026
b4f8c6f
finished DomainSeg tutorial
TranHuuNhatHuy Mar 2, 2026
c861742
done scene3D getting started
TranHuuNhatHuy Mar 2, 2026
4889084
done revamping scene3D img vis
TranHuuNhatHuy Mar 2, 2026
1f6af32
done revamping scene3D img vis
TranHuuNhatHuy Mar 2, 2026
82ff5d3
done quick inference for scene 3D tutuorials
TranHuuNhatHuy Mar 2, 2026
1d2300e
done Scene3D tutorials
TranHuuNhatHuy Mar 2, 2026
9371a87
done sceneseg intro
TranHuuNhatHuy Mar 2, 2026
b70a7ce
revamped sceneseg img and video vis scripts
TranHuuNhatHuy Mar 2, 2026
a72a519
finish quick inference of SceneSeg
TranHuuNhatHuy Mar 2, 2026
9101cb8
finished SceneSeg toturials
TranHuuNhatHuy Mar 2, 2026
063103d
beautify README
TranHuuNhatHuy Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -219,4 +219,5 @@ __marimo__/
# VisionPilot specific
3rdparty/
trt_cache/
assets/
Tutorials/E2E_Models/autoware_vision_pilot/
Tutorials/E2E_Models/weights/
2 changes: 1 addition & 1 deletion Models/inference/auto_speed_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# print(f'Using {self.device} for inference')

# Load model
self.model = torch.load(checkpoint_path + "/best.pt", map_location="cpu", weights_only=False)['model']
self.model = torch.load(checkpoint_path, map_location="cpu", weights_only=False)['model']
self.model = self.model.to(self.device).eval()

def resize_letterbox(self, img: Image.Image):
Expand Down Expand Up @@ -42,7 +42,7 @@
tensor = transforms.ToTensor()(img).to(self.device).half()
return tensor.unsqueeze(0), scale, pad_x, pad_y

def xywh2xyxy(self, x):

Check warning on line 45 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xyxy)

Check warning on line 45 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xywh)
"""Convert [cx, cy, w, h] to [x1, y1, x2, y2]"""
y = x.clone()
y[:, 0] = x[:, 0] - x[:, 2] / 2 # x1
Expand Down Expand Up @@ -71,11 +71,11 @@
if mask.sum() == 0:
return torch.empty(0, 6)

# --- convert to xyxy before NMS ---

Check warning on line 74 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xyxy)
boxes_xyxy = self.xywh2xyxy(boxes[mask])

Check warning on line 75 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xyxy)

Check warning on line 75 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xywh)

Check warning on line 75 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xyxy)

combined = torch.cat([
boxes_xyxy,

Check warning on line 78 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (xyxy)
scores[mask].unsqueeze(1),
class_ids[mask].float().unsqueeze(1)
], dim=1)
Expand All @@ -97,7 +97,7 @@
if predictions.numel() == 0:
return []

# --- adjust from letterboxed to original coords ---

Check warning on line 100 in Models/inference/auto_speed_infer.py

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (letterboxed)
predictions[:, [0, 2]] = (predictions[:, [0, 2]] - pad_x) / scale
predictions[:, [1, 3]] = (predictions[:, [1, 3]] - pad_y) / scale

Expand Down
33 changes: 16 additions & 17 deletions Models/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
albumentations==1.4.18
cmapy==0.6.6
matplotlib==3.5.3
numpy==2.2.5
onnx==1.17.0
onnxruntime==1.21.0
opencv_contrib_python==4.10.0.84
opencv_python==4.10.0.84
opencv_python_headless==4.11.0.86
Pillow==11.3.0
pytorch_model_summary==0.1.2
thop==0.1.1.post2209072238
torch==2.7.0
torchvision==0.22.0
tensorboard==2.20.0
tensorboard-data-server==0.7.2

albumentations>=1.4.18

Check warning on line 1 in Models/requirements.txt

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (albumentations)
cmapy>=0.6.6

Check warning on line 2 in Models/requirements.txt

View workflow job for this annotation

GitHub Actions / spell-check-differential

Unknown word (cmapy)
matplotlib>=3.8.0
numpy>=1.21.0,<2.0.0
onnx>=1.17.0
onnxruntime>=1.21.0
opencv-contrib-python>=4.10.0.84
opencv-python>=4.10.0.84
opencv-python-headless>=4.11.0.86
Pillow>=11.3.0
pytorch-model-summary>=0.1.2
thop>=0.1.1.post2209072238
torch>=2.7.0
torchvision>=0.22.0
tensorboard>=2.20.0
tensorboard-data-server>=0.7.2
97 changes: 79 additions & 18 deletions Models/visualizations/AutoSpeed/image_visualization.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from argparse import ArgumentParser
import os
import cv2
from PIL import Image
from argparse import ArgumentParser

from Models.inference.auto_speed_infer import AutoSpeedNetworkInfer

color_map = {
1: (0, 0, 255), # red
2: (0, 255, 255), # yellow
3: (255, 255, 0) # cyan
color_map = { # BGR
1: (0, 0, 255), # Red
2: (0, 255, 255), # Yellow
3: (255, 255, 0) # Cyan
}


def make_visualization(prediction, input_image_filepath):

img_cv = cv2.imread(input_image_filepath)
for pred in prediction:
x1, y1, x2, y2, conf, cls = pred
Expand All @@ -19,27 +22,85 @@ def make_visualization(prediction, input_image_filepath):
color = color_map.get(int(cls), (255, 255, 255))

x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
# cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.rectangle(img_cv, (x1, y1), (x2, y2), color, 2)

# Uncomment this if wanna show classes
# label = f"Class: {int(cls)} | Score: {conf:.2f}"
# cv2.putText(img_cv, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

cv2.imshow('Prediction Objects', img_cv)
cv2.waitKey(0)
# Tran: let's not show imgs, instead saving em in batch.
# cv2.imshow("Prediction Objects", img_cv)
# cv2.waitKey(0)
return Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))


if __name__ == "__main__":
def main():

parser = ArgumentParser()
parser.add_argument("-p", "--model_checkpoint_path", dest="model_checkpoint_path",
help="path to pytorch checkpoint file to load model dict")
parser.add_argument("-i", "--input_image_filepath", dest="input_image_filepath",
help="path to input image which will be processed by DomainSeg")

parser.add_argument(
"-p",
"--model_checkpoint_path",
dest = "model_checkpoint_path",
help = "Path to Pytorch checkpoint file to load model dict"
)
parser.add_argument(
"-i",
"--input_image_dirpath",
dest = "input_image_dirpath",
help = "Path to input image directory which will be processed by AutoSpeed"
)
parser.add_argument(
"-o",
"--output_image_dirpath",
dest = "output_image_dirpath",
help = "Path to output image directory where visualizations will be saved",
required = True
)

args = parser.parse_args()
model_checkpoint_path = args.model_checkpoint_path
input_image_filepath = args.input_image_filepath

# Arranging I/O dirs
input_image_dirpath = args.input_image_dirpath
output_image_dirpath = args.output_image_dirpath
if (not os.path.exists(output_image_dirpath)):
os.makedirs(output_image_dirpath)

# Model checkpoint path
model_checkpoint_path = args.model_checkpoint_path
model = AutoSpeedNetworkInfer(model_checkpoint_path)
img = Image.open(input_image_filepath).convert("RGB")

prediction = model.inference(img)
make_visualization(prediction, input_image_filepath)
# Process through input image dir
for filename in sorted(os.listdir(input_image_dirpath)):
if (filename.endswith((".png", ".jpg", ".jpeg"))):

# Fetch image
input_image_filepath = os.path.join(
input_image_dirpath, filename
)
img_id = filename.split(".")[0].zfill(3)
print(f"Reading Image: {input_image_filepath}")

# Inference
img = Image.open(input_image_filepath).convert("RGB")
prediction = model.inference(img)

# Visualization
vis_image = make_visualization(
prediction,
input_image_filepath
)

output_image_filepath = os.path.join(
output_image_dirpath,
f"{img_id}_data.png"
)
vis_image.save(output_image_filepath)

else:
            print(f"Skipping non-image file: {filename}")
continue


if __name__ == "__main__":
main()
86 changes: 57 additions & 29 deletions Models/visualizations/AutoSpeed/video_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
sys.path.append('../../..')
from Models.inference.auto_speed_infer import AutoSpeedNetworkInfer

color_map = {
1: (0, 0, 255), # red
2: (0, 255, 255), # yellow
3: (255, 255, 0) # cyan
color_map = { # Colors (BGR)
1: (0, 0, 255), # Red
2: (0, 255, 255), # Yellow
3: (255, 255, 0) # Cyan
}


Expand Down Expand Up @@ -129,28 +129,48 @@ def make_visualization(prediction, image):


def main():

parser = ArgumentParser()
parser.add_argument("-p", "--model_checkpoint_path", dest="model_checkpoint_path",
help="path to pytorch checkpoint (.pt) or ONNX model (.onnx)")
parser.add_argument("-i", "--video_filepath", dest="video_filepath",
help="path to input video which will be processed by AutoSpeed")
parser.add_argument("-o", "--output_file", dest="output_file",
help="path to output video visualization file, must include output file name")
parser.add_argument('-v', "--vis", action='store_true', default=False,
help="flag for whether to show frame by frame visualization while processing is occuring")

parser.add_argument(
"-p",
"--model_checkpoint_path",
dest = "model_checkpoint_path",
help = "Path to Pytorch checkpoint (.pth) or ONNX model (.onnx)."
)
parser.add_argument(
"-i",
"--video_filepath",
dest = "video_filepath",
help = "Path to input video which will be processed by AutoSpeed.")
parser.add_argument(
"-o",
"--output_file",
dest = "output_file",
help = "Path to output video visualization file, must include output file name.")
parser.add_argument(
"-v",
"--vis",
action = "store_true",
default = False,
        help = "Flag for whether to show frame by frame visualization while processing is occurring."
)
args = parser.parse_args()

# Detect model type and load
model_path = args.model_checkpoint_path

if model_path.endswith('.onnx'):
print('Loading ONNX model...')
model = AutoSpeedONNXInfer(onnx_path=model_path)
print('ONNX Model Loaded')
elif model_path.endswith('.pt') or os.path.isdir(model_path):
print('Loading PyTorch model...')
model = AutoSpeedNetworkInfer(checkpoint_path=model_path)
print('PyTorch Model Loaded')
if model_path.endswith(".onnx"):
print("Loading ONNX model...")
model = AutoSpeedONNXInfer(onnx_path = model_path)
print("ONNX model loaded.")
    elif (
        (model_path.endswith((".pt", ".pth"))) or
        (os.path.isdir(model_path))
    ):
print("Loading PyTorch model...")
model = AutoSpeedNetworkInfer(checkpoint_path = model_path)
print("PyTorch model loaded.")
else:
raise ValueError(f"Unsupported model format: {model_path}. Use .pt or .onnx")

Expand All @@ -160,15 +180,19 @@ def main():
cap = cv2.VideoCapture(video_filepath)

# Output filepath
output_filepath_obj = args.output_file + '.avi'
output_filepath_obj = args.output_file + ".avi"

fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Video writer object
writer_obj = cv2.VideoWriter(output_filepath_obj,
cv2.VideoWriter_fourcc(*"MJPG"), fps, (frame_width, frame_height))
writer_obj = cv2.VideoWriter(
output_filepath_obj,
cv2.VideoWriter_fourcc(*"MJPG"),
fps,
(frame_width, frame_height)
)

    # Check if video capture opened successfully
if (cap.isOpened() == False):
Expand All @@ -177,8 +201,9 @@ def main():
print('Reading video frames')

# Read until video is completed
print('Processing started')
print("Processing started...")
while (cap.isOpened()):

# Capture frame-by-frame
ret, frame = cap.read()
if ret == True:
Expand All @@ -197,15 +222,18 @@ def main():
display_w = 960
h, w, _ = vis_obj.shape
display_h = int(h * (display_w / w))
vis_display = cv2.resize(vis_obj, (display_w, display_h))
cv2.imshow('Prediction Objects', vis_display)
vis_display = cv2.resize(
vis_obj,
(display_w, display_h)
)
cv2.imshow("Prediction Objects", vis_display)
cv2.waitKey(10)

# Writing to video frame
writer_obj.write(vis_obj)

else:
print('Frame not read - ending processing')
print("Frame not read - ending processing...")
break

# When everything done, release the video capture and writer objects
Expand All @@ -214,9 +242,9 @@ def main():

# Closes all the frames
cv2.destroyAllWindows()
print('Completed')
print("Completed.")


if __name__ == '__main__':
if __name__ == "__main__":
main()
# %%
Loading
Loading