
Commit 010ae9a

Align vdo-larod model input shapes (#421)

Authored by johan-hultberg-work, danielmyh (Daniel Myhrman), and isak-jakobsson

Co-authored-by: Daniel Myhrman <[email protected]>
Co-authored-by: Isak Jakobsson <[email protected]>

1 parent 5054787 commit 010ae9a

9 files changed: +89 additions, -64 deletions

tensorflow-to-larod-artpec8/README.md

Lines changed: 2 additions & 2 deletions

@@ -71,7 +71,7 @@ If your machine doesn't have the hardware requisites, like not enough GPU to tra

 ### The example model

-In this tutorial, we'll train a simple model with one input and two outputs. The input to the model is a FP32 RGB image scaled to the [0, 1] range and of shape `(480, 270, 3)`.
+In this tutorial, we'll train a simple model with one input and two outputs. The input to the model is a FP32 RGB image scaled to the [0, 1] range and of shape `(256, 256, 3)`.
 The output of the model are two separate tensors of shape `(1,)`, representing the model's confidences for the presence of `person` and `car`. The outputs are configured as such, and not as one tensor with a SoftMax activation, in order to demonstrate how to use multiple outputs.
 However, the general process of making a camera-compatible model is the same irrespective of the dimensions or number of inputs or outputs.

@@ -84,7 +84,7 @@ In order to produce a model with BatchNormalization layers that are fused with t

 Specifically, the convolutional layers need to not use bias, e.g., for Keras Conv2D layers have the `use_bias=False` parameter set, and the layer order needs to be: `convolutional layer -> batch normalization -> relu`.
 This will "fold", or "fuse", the batch normalization, which increases performance.

-The pre-trained model is trained on the MS COCO 2017 **training** dataset, which is significantly larger than the supplied MS COCO 2017 **validation** dataset. After training it for 8 epochs and fine-tuning the model with quantization for 4 epochs, it achieves around 85% validation accuracy on both the people output and the car output with 6.6 million parameters. This model is saved in the frozen graph format in the `/env/output_models` directory.
+To replicate the model training used for the model in [vdo-larod](../vdo-larod/), utilize the MS COCO 2017 **training** dataset, which is significantly larger than the provided MS COCO 2017 **validation** dataset. After training for 12 epochs and fine-tuning the model with quantization for 1 epoch, it achieves good accuracy on both people and cars.

 ### Model training and quantization
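
The README changes above describe the model contract (one `(256, 256, 3)` FP32 input, two `(1,)` confidence outputs) and the foldable-BatchNormalization constraint (`use_bias=False` convolutions in `conv -> batch normalization -> relu` order). As a hedged illustration only, not the repository's actual `model.py`, a minimal Keras sketch of that pattern could look like this:

```python
# Minimal sketch (not part of this commit): a bias-free Conv2D -> BatchNorm -> ReLU
# backbone with a (256, 256, 3) input and two independent (1,) sigmoid outputs,
# matching the structure the README excerpt above describes.
import tensorflow as tf
from tensorflow.keras import layers


def sketch_model(input_shape=(256, 256, 3)):
    inputs = layers.Input(shape=input_shape)
    x = inputs
    for n_filters in (16, 32, 64):
        # use_bias=False plus the conv -> BN -> ReLU order lets the batch
        # normalization be folded ("fused") into the convolution weights.
        x = layers.Conv2D(n_filters, 3, strides=2, padding='same',
                          use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
    x = layers.GlobalAveragePooling2D()(x)
    # Two separate single-value outputs instead of one softmax tensor.
    person = layers.Dense(1, activation='sigmoid', name='person')(x)
    car = layers.Dense(1, activation='sigmoid', name='car')(x)
    return tf.keras.Model(inputs=inputs, outputs=[person, car])


if __name__ == '__main__':
    sketch_model().summary()
```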

tensorflow-to-larod-artpec8/env/training/model.py

Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ def _residual_block(x, n_filters, strides):
     return x


-def create_model(n_blocks=4, n_filters=16, input_shape=(480, 270, 3)):
+def create_model(n_blocks=4, n_filters=16, input_shape=(256, 256, 3)):
     """ Defines and instantiates a model.

     Args:
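
The only change in this hunk is the default `input_shape`. A hedged usage sketch (assuming `create_model` returns a `tf.keras.Model`, which this excerpt does not show):

```python
# Hypothetical usage, not part of the commit: with the updated default, the
# model expects square 256x256 RGB inputs instead of 480x270.
from model import create_model  # assumes the /env/training scripts are importable

model = create_model()    # input_shape defaults to (256, 256, 3) after this change
print(model.input_shape)  # expected: (None, 256, 256, 3)
```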

tensorflow-to-larod-artpec8/env/training/train.py

Lines changed: 3 additions & 3 deletions

@@ -147,9 +147,9 @@ def train_model(data_generator, trained_model_path, model_configuration, train_e
     parser.add_argument('-a', '--annotations', type=str, required=True,
                         help='path to the .json-file containing COCO instance \
                               annotations')
-    parser.add_argument('--input-width', type=int, default=480,
+    parser.add_argument('--input-width', type=int, default=256,
                         help='The width of the model\'s input image')
-    parser.add_argument('--input-height', type=int, default=270,
+    parser.add_argument('--input-height', type=int, default=256,
                         help='The height of the model\'s input image')
     parser.add_argument('-e', '--training-epochs', type=int, default=8,
                         help='number of training epochs')

@@ -158,7 +158,7 @@ def train_model(data_generator, trained_model_path, model_configuration, train_e
     args = parser.parse_args()

     print('Using TensorFlow version: {}'.format(tf.__version__))
-    data_generator = DataGenerator(args.images, args.annotations, batch_size=8,
+    data_generator = DataGenerator(args.images, args.annotations, batch_size=16,
                                    width=args.input_width, height=args.input_height)

     trained_model_path = '/env/models/fp32_model/model'

tensorflow-to-larod-artpec8/env/training/utils.py

Lines changed: 2 additions & 6 deletions

@@ -33,8 +33,8 @@ class SimpleCOCODataGenerator(Sequence):
     reprocesses it to simply output whether a certain class exists in
     a given image.
     """
-    def __init__(self, samples_dir, annotation_path, width=480, height=270,
-                 batch_size=2, shuffle=True, balance=True):
+    def __init__(self, samples_dir, annotation_path, width=256, height=256,
+                 batch_size=16, shuffle=True, balance=True):
         """ Initializes the data generator.

         Args:

@@ -169,10 +169,6 @@ def _generate_batch(self, batch_annotations):
             img_path = os.path.join(self.samples_dir, annotation['file_name'])
             img = Image.open(img_path).resize((self.width, self.height))

-            # Horizontal flipping with p=0.5
-            if np.random.random() >= 0.5:
-                img = img.transpose(Image.FLIP_LEFT_RIGHT)
-
             X[i, ] = np.array(img)
             y_person[i, ] = annotation['has_person']
             y_car[i, ] = annotation['has_car']

vdo-larod/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ ARG CHIP
 # Download the pretrained model
 ARG MODEL_BUCKET=https://acap-ml-models.s3.amazonaws.com/tensorflow_to_larod_resnet
 RUN if [ "$CHIP" = artpec8 ] || [ "$CHIP" = artpec9 ] || [ "$CHIP" = cpu ] ; then \
-        curl -o model.tflite $MODEL_BUCKET/custom_resnet_artpec8_car_human_480x270.tflite ; \
+        curl -o model.tflite $MODEL_BUCKET/custom_resnet_artpec8_car_human_256.tflite ; \
     elif [ "$CHIP" = edgetpu ]; then \
         curl -o model.tflite $MODEL_BUCKET/custom_resnet_edgetpu_car_human_256.tflite ; \
     elif [ "$CHIP" = cv25 ]; then \

vdo-larod/README.md

Lines changed: 77 additions & 48 deletions

@@ -33,8 +33,8 @@ See the manifest.json.* files to change the configuration on chip, image size, n

 ## Which backends and models are supported?

-Unless you modify the app to your own needs you should only use our pretrained model that takes 480x270 (256x256 for Ambarella CV25 and Google TPU) RGB (interleaved or planar) images as input,
-and that outputs an array of 2 confidence scores of person and car in the format of `float32`.
+Unless you modify the app to your own needs you should only use our pretrained model that takes 256x256 RGB (interleaved or planar) images as input,
+and that outputs an array of 2 confidence scores of person and car in the format of `uint8`.

 You can run the example with any inference backend as long as you can provide it with a model as described above.
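
The updated paragraph above pins the model's I/O contract to a single 256x256 RGB input and two `uint8` confidence outputs (person and car). As a rough sanity check (not part of the commit), a downloaded `.tflite` file can be inspected with the TensorFlow Lite Python interpreter; the file name below is illustrative:

```python
# Hedged sketch: confirm a .tflite model matches the contract described above
# (one 256x256x3 input, two single-value confidence outputs).
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='model.tflite')
interpreter.allocate_tensors()

for detail in interpreter.get_input_details():
    print('input :', detail['shape'], detail['dtype'])   # expect [1 256 256 3]
for detail in interpreter.get_output_details():
    print('output:', detail['shape'], detail['dtype'])   # expect two [1 1] tensors
```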

@@ -280,67 +280,96 @@ In previous larod versions, the chip was referred to as a number instead of a st
 ```sh
 ----- Contents of SYSTEM_LOG for 'vdo_larod' -----

-vdo_larod[584171]: Starting /usr/local/packages/vdo_larod/vdo_larod
-vdo_larod[584171]: chooseStreamResolution: We select stream w/h=480 x 270 based on VDO channel info.
-vdo_larod[584171]: Creating VDO image provider and creating stream 480 x 270
-vdo_larod[584171]: Setting up larod connection with chip axis-a8-dlpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
-vdo_larod[584171]: Loading the model... This might take up to 5 minutes depending on your device model.
-vdo_larod[584171]: Model loaded successfully
-vdo_larod[584171]: Created mmaped model output 0 with size 1
-vdo_larod[584171]: Created mmaped model output 1 with size 1
-vdo_larod[584171]: Start fetching video frames from VDO
-
-vdo_larod[584171]: Ran pre-processing for 2 ms
-vdo_larod[584171]: Ran inference for 16 ms
-vdo_larod[584171]: Person detected: 65.14% - Car detected: 11.92%
-
-vdo_larod[4165]: Exit /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[141742]: Starting /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[141742]: choose_stream_resolution: We select stream w/h=480 x 270 based on VDO channel info.
+vdo_larod[141742]: Creating VDO image provider and creating stream 480 x 270
+vdo_larod[141742]: 'buffer.count'-----: <uint32 2>
+vdo_larod[141742]: 'dynamic.framerate': <true>
+vdo_larod[141742]: 'format'-----------: <uint32 3>
+vdo_larod[141742]: 'framerate'--------: <30.0>
+vdo_larod[141742]: 'height'-----------: <uint32 270>
+vdo_larod[141742]: 'input'------------: <uint32 1>
+vdo_larod[141742]: 'socket.blocking'--: <false>
+vdo_larod[141742]: 'width'------------: <uint32 480>
+vdo_larod[141742]: Dump of vdo stream settings map =====
+vdo_larod[141742]: Setting up larod connection with chip axis-a8-dlpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
+vdo_larod[141742]: Loading the model... This might take up to 5 minutes depending on your device model.
+vdo_larod[141742]: Model loaded successfully
+vdo_larod[141742]: Calculate crop image
+vdo_larod[141742]: Crop input image X=105 Y=0 (270 x 270)
+vdo_larod[141742]: Created mmaped model output 0 with size 1
+vdo_larod[141742]: Created mmaped model output 1 with size 1
+
+vdo_larod[141742]: Ran pre-processing for 3 ms
+vdo_larod[141742]: Ran inference for 14 ms
+vdo_larod[141742]: Person detected: 100.00% - Car detected: 3.14%
+
+vdo_larod[141742]: Exit /usr/local/packages/vdo_larod/vdo_larod
 ```

 #### Output - ARTPEC-9 with TensorFlow Lite

 ```sh
 ----- Contents of SYSTEM_LOG for 'vdo_larod' -----

-vdo_larod[584171]: Starting /usr/local/packages/vdo_larod/vdo_larod
-vdo_larod[584171]: chooseStreamResolution: We select stream w/h=480 x 270 based on VDO channel info.
-vdo_larod[584171]: Creating VDO image provider and creating stream 480 x 270
-vdo_larod[584171]: Setting up larod connection with chip a9-dlpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
-vdo_larod[584171]: Loading the model... This might take up to 5 minutes depending on your device model.
-vdo_larod[584171]: Model loaded successfully
-vdo_larod[584171]: Created mmaped model output 0 with size 1
-vdo_larod[584171]: Created mmaped model output 1 with size 1
-vdo_larod[584171]: Start fetching video frames from VDO
-
-vdo_larod[584171]: Ran pre-processing for 2 ms
-vdo_larod[584171]: Ran inference for 7 ms
-vdo_larod[584171]: Person detected: 65.14% - Car detected: 11.92%

-vdo_larod[4165]: Exit /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[3991067]: Starting /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[3991067]: choose_stream_resolution: We select stream w/h=480 x 360 based on VDO channel info.
+vdo_larod[3991067]: Creating VDO image provider and creating stream 480 x 360
+vdo_larod[3991067]: 'buffer.count'-----: <uint32 2>
+vdo_larod[3991067]: 'dynamic.framerate': <true>
+vdo_larod[3991067]: 'format'-----------: <uint32 3>
+vdo_larod[3991067]: 'framerate'--------: <30.0>
+vdo_larod[3991067]: 'height'-----------: <uint32 360>
+vdo_larod[3991067]: 'input'------------: <uint32 1>
+vdo_larod[3991067]: 'socket.blocking'--: <false>
+vdo_larod[3991067]: 'width'------------: <uint32 480>
+vdo_larod[3991067]: Dump of vdo stream settings map =====
+vdo_larod[3991067]: Setting up larod connection with chip a9-dlpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
+vdo_larod[3991067]: Loading the model... This might take up to 5 minutes depending on your device model.
+vdo_larod[3991067]: Model loaded successfully
+vdo_larod[3991067]: Calculate crop image
+vdo_larod[3991067]: Crop input image X=0 Y=60 (360 x 360)
+vdo_larod[3991067]: Created mmaped model output 0 with size 1
+vdo_larod[3991067]: Created mmaped model output 1 with size 1
+vdo_larod[3991067]: Start fetching video frames from VDO
+vdo_larod[3991067]: Ran pre-processing for 13 ms
+vdo_larod[3991067]: Ran inference for 5 ms
+vdo_larod[3991067]: Person detected: 100.00% - Car detected: 3.14%
+
+vdo_larod[3991067]: Exit /usr/local/packages/vdo_larod/vdo_larod
 ```

 #### Output - CPU with TensorFlow Lite

 ```sh
 ----- Contents of SYSTEM_LOG for 'vdo_larod' -----

-vdo_larod[584171]: Starting /usr/local/packages/vdo_larod/vdo_larod
-vdo_larod[584171]: chooseStreamResolution: We select stream w/h=480 x 270 based on VDO channel info.
-vdo_larod[584171]: Creating VDO image provider and creating stream 480 x 270
-vdo_larod[584171]: Setting up larod connection with chip cpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
-vdo_larod[584171]: Loading the model... This might take up to 5 minutes depending on your device model.
-vdo_larod[584171]: Model loaded successfully
-vdo_larod[584171]: Created mmaped model output 0 with size 1
-vdo_larod[584171]: Created mmaped model output 1 with size 1
-vdo_larod[584171]: Start fetching video frames from VDO
-
-vdo_larod[584171]: Ran pre-processing for 3 ms
-vdo_larod[584171]: Ran inference for 2594 ms
-vdo_larod[584171]: Change VDO stream framerate to 1.000000 because of too long inference time
-vdo_larod[584171]: Person detected: 65.14% - Car detected: 11.92%
-
-vdo_larod[4165]: Exit /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[145071]: Starting /usr/local/packages/vdo_larod/vdo_larod
+vdo_larod[145071]: choose_stream_resolution: We select stream w/h=480 x 270 based on VDO channel info.
+vdo_larod[145071]: Creating VDO image provider and creating stream 480 x 270
+vdo_larod[145071]: Dump of vdo stream settings map =====
+vdo_larod[145071]: 'buffer.count'-----: <uint32 2>
+vdo_larod[145071]: 'dynamic.framerate': <true>
+vdo_larod[145071]: 'format'-----------: <uint32 3>
+vdo_larod[145071]: 'framerate'--------: <30.0>
+vdo_larod[145071]: 'height'-----------: <uint32 270>
+vdo_larod[145071]: 'input'------------: <uint32 1>
+vdo_larod[145071]: 'socket.blocking'--: <false>
+vdo_larod[145071]: 'width'------------: <uint32 480>
+vdo_larod[145071]: Setting up larod connection with chip cpu-tflite and model file /usr/local/packages/vdo_larod/model/model.tflite
+vdo_larod[145071]: Loading the model... This might take up to 5 minutes depending on your device model.
+vdo_larod[145071]: Model loaded successfully
+vdo_larod[145071]: Calculate crop image
+vdo_larod[145071]: Crop input image X=105 Y=0 (270 x 270)
+vdo_larod[145071]: Created mmaped model output 0 with size 1
+vdo_larod[145071]: Created mmaped model output 1 with size 1
+vdo_larod[145071]: Start fetching video frames from VDO
+vdo_larod[145071]: Ran pre-processing for 3 ms
+vdo_larod[145071]: Ran inference for 545 ms
+vdo_larod[145071]: Person detected: 100.00% - Car detected: 3.14%
+
+vdo_larod[145071]: Exit /usr/local/packages/vdo_larod/vdo_larod
 ```

 #### Output - Google TPU

vdo-larod/app/manifest.json.artpec8

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "appName": "vdo_larod",
     "vendor": "Axis Communications",
     "embeddedSdkVersion": "3.0",
-    "runOptions": "axis-a8-dlpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 480 270",
+    "runOptions": "axis-a8-dlpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 256 256",
     "vendorUrl": "https://www.axis.com",
     "runMode": "never",
     "version": "1.0.0"

vdo-larod/app/manifest.json.artpec9

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "appName": "vdo_larod",
     "vendor": "Axis Communications",
    "embeddedSdkVersion": "3.0",
-    "runOptions": "a9-dlpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 480 270",
+    "runOptions": "a9-dlpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 256 256",
     "vendorUrl": "https://www.axis.com",
     "runMode": "never",
     "version": "1.0.0"

vdo-larod/app/manifest.json.cpu

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "appName": "vdo_larod",
     "vendor": "Axis Communications",
     "embeddedSdkVersion": "3.0",
-    "runOptions": "cpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 480 270",
+    "runOptions": "cpu-tflite /usr/local/packages/vdo_larod/model/model.tflite 256 256",
     "vendorUrl": "https://www.axis.com",
     "runMode": "never",
     "version": "1.0.0"
