[refactor] intconv commit that supports JDE YOLO with integer convolutional layers

chyomin06 · fracape · commit 19b7ca8b2c74 · 2025-07-09T23:49:27.000-07:00
diff --git a/cfgs/vision_model/default.yaml b/cfgs/vision_model/default.yaml
@@ -36,6 +36,7 @@ jde_1088x608:
   model_path_prefix: ${..model_root_path}
   cfg: "models/Towards-Realtime-MOT/cfg/yolov3_1088x608.cfg"
   weights: "weights/jde/jde.1088x608.uncertainty.pt"
+  integer_conv_weight: False
   iou_thres: 0.5
   conf_thres: 0.5
   nms_thres: 0.4
diff --git a/compressai_vision/model_wrappers/intconv_wrapper.py b/compressai_vision/model_wrappers/intconv_wrapper.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2022-2024, InterDigital Communications, Inc
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the disclaimer
+# below) provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of InterDigital Communications, Inc nor the names of its
+#   contributors may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import logging
+
+import numpy as np
+import torch
+from torch import nn
+
+
+class IntConv2dWrapper(nn.Conv2d):
+    """``nn.Conv2d`` that can switch to integer-valued weights and inputs.
+
+    Acts as a plain ``nn.Conv2d`` until :meth:`quantize_weights` is called.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            device,
+            dtype,
+        )
+        # False: behave as nn.Conv2d. True: forward() takes the integer path.
+        self.initified_weight_mode = False
+
+    def quantize_weights(self):
+        """Round the weights to integers in place and enable the integer path.
+
+        Call after the trained weights are loaded; repeated calls are ignored
+        because rescaling already-rounded weights would corrupt them. ``fw``,
+        ``w_sum`` and ``float_bias`` are registered as non-persistent buffers
+        so they follow ``.to(device)`` without entering ``state_dict()``.
+        """
+        if self.initified_weight_mode:
+            return
+        self.initified_weight_mode = True
+
+        # Keep the real bias aside; forward() re-applies it in float.
+        if self.bias is None:
+            float_bias = torch.zeros(self.out_channels, device=self.weight.device)
+        else:
+            float_bias = self.bias.detach().clone()
+        self.register_buffer("float_bias", float_bias, persistent=False)
+
+        # 2 ** (mantissa bits + 1) for the weight dtype.
+        if self.weight.dtype == torch.float32:
+            _precision = 2 ** (23 + 1)
+        elif self.weight.dtype == torch.float64:
+            _precision = 2 ** (52 + 1)
+        else:
+            logging.warning(
+                f"Unsupported dtype {self.weight.dtype}. Behaviour may lead unexpected results."
+            )
+            _precision = 2 ** (23 + 1)
+
+        ###### REFERENCE FROM VCMRMS ######
+        sf_const = 48
+        # N: number of weights feeding one output channel.
+        N = np.prod(self.weight.shape[1:])
+        self.N = N
+        self.factor = np.sqrt(_precision)
+        self.sf = np.sqrt(sf_const / N)  # precision bits allocation factor
+
+        # Sum on the CPU to stabilize the calculation.
+        w_abs = self.weight.detach().cpu().abs()
+        w_sum = w_abs.sum(axis=[1, 2, 3]).to(self.weight.device)
+        w_sum[w_sum == 0] = 1  # prevent divide by 0
+        self.register_buffer("w_sum", w_sum, persistent=False)
+        fw = (self.factor / self.sf - np.sqrt(N / 12) * 5) / w_sum
+        self.register_buffer("fw", fw, persistent=False)
+
+        # intify weights
+        self.weight.requires_grad = False  # Just make sure
+        self.weight.copy_(
+            torch.round(self.weight.detach().clone() * self.fw.view(-1, 1, 1, 1))
+        )
+
+        # set bias to 0
+        if self.bias is not None:
+            self.bias.requires_grad = False  # Just make sure
+            self.bias.zero_()
+        ###### END OF REFERENCE FROM VCMRMS ######
+
+    def forward(self, x: torch.Tensor):
+        """Convolve ``x``, using the integer path once weights are quantized.
+
+        cuDNN is disabled for that path and restored even if it raises.
+        """
+        if not self.initified_weight_mode:
+            return super().forward(x)
+
+        _dtype = x.dtype
+        _cudnn_enabled = torch.backends.cudnn.enabled
+        torch.backends.cudnn.enabled = False
+        try:
+            ###### REFERENCE FROM VCMRMS ######
+            # Scale factor for x; stays 1 when x is all zeros.
+            fx = 1
+            x_max = x.abs().max()
+            if x_max > 0:
+                fx = (self.factor * self.sf - 0.5) / x_max
+            # intify x
+            x = super().forward(torch.round(fx * x))
+            # x should be all integers; undo the input and weight scaling
+            x /= fx * self.fw.view(-1, 1, 1)
+            x = x.float()
+
+            # apply bias in float format (broadcast over the channel axis)
+            x = (x + self.float_bias.view(1, -1, 1, 1)).contiguous()
+            ###### END OF REFERENCE FROM VCMRMS ######
+        finally:
+            torch.backends.cudnn.enabled = _cudnn_enabled
+
+        return x.to(_dtype)
diff --git a/compressai_vision/model_wrappers/jde.py b/compressai_vision/model_wrappers/jde.py
@@ -43,12 +43,16 @@
 )
 from jde.utils.kalman_filter import KalmanFilter
 from jde.utils.utils import non_max_suppression, scale_coords
-from torch import Tensor
 
 from compressai_vision.registry import register_vision_model
 
 from .base_wrapper import BaseWrapper
 
+# Replace jde.models.create_modules with the local variant that builds IntConv2dWrapper convolutions
+from .jde_lowlevel import create_modules
+
+jde.models.create_modules = create_modules
+
 __all__ = [
     "jde_1088x608",
 ]
@@ -84,6 +88,8 @@ def __init__(self, device: str, **kwargs):
             self.model_configs["frame_rate"] / 30.0 * self.model_configs["track_buffer"]
         )
 
+        integer_conv_weight = bool(kwargs["integer_conv_weight"])
+
         assert "splits" in kwargs, "Split layer ids must be provided"
         self.split_layer_list = kwargs["splits"]
         self.features_at_splits = dict(
@@ -96,6 +102,12 @@ def __init__(self, device: str, **kwargs):
             strict=False,
         )
         self.darknet.to(device).eval()
+        for param in self.darknet.parameters():
+            param.requires_grad = False
+
+        # must be called after loading weights to a model
+        if integer_conv_weight:
+            self.darknet = self.quantize_weights(self.darknet)
 
         self.kalman_filter = KalmanFilter()
 
@@ -114,6 +126,17 @@ def reset(self):
 
         self.frame_id = 0
 
+    @staticmethod
+    def quantize_weights(model):
+        """Quantize every IntConv2dWrapper in convolutional blocks; return ``model``."""
+        for module_def, block in zip(model.module_defs, model.module_list):
+            if module_def["type"] != "convolutional":
+                continue
+            for layer in block:
+                if type(layer).__name__ == "IntConv2dWrapper":
+                    layer.quantize_weights()
+        return model
+
     def input_to_features(self, x, device: str) -> Dict:
         """Computes deep features at the intermediate layer(s) all the way from the input"""
         self.darknet = self.darknet.to(device).eval()
diff --git a/compressai_vision/model_wrappers/jde_lowlevel.py b/compressai_vision/model_wrappers/jde_lowlevel.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2022-2023, InterDigital Communications, Inc
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the disclaimer
+# below) provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of InterDigital Communications, Inc nor the names of its
+#   contributors may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch.nn as nn
+from jde.models import EmptyLayer, Upsample, YOLOLayer
+
+from .intconv_wrapper import IntConv2dWrapper
+
+try:
+    from jde.utils.syncbn import SyncBN
+
+    batch_norm = SyncBN  # nn.BatchNorm2d
+except ImportError:
+    batch_norm = nn.BatchNorm2d
+
+
+def create_modules(module_defs, device: str):
+    """Build the ``nn.ModuleList`` of layer blocks described by ``module_defs``.
+
+    The first entry (the hyper-parameters) is popped, so ``module_defs`` is
+    modified in place. Convolutions are created as ``IntConv2dWrapper``.
+    Returns ``(hyperparams, module_list)``.
+    """
+    hyperparams = module_defs.pop(0)
+    output_filters = [int(hyperparams["channels"])]
+    filters = output_filters[-1]  # kept by blocks that set no count themselves
+    module_list = nn.ModuleList()
+    yolo_layer_count = 0
+    for i, module_def in enumerate(module_defs):
+        modules = nn.Sequential()
+
+        if module_def["type"] == "convolutional":
+            bn = int(module_def["batch_normalize"])
+            filters = int(module_def["filters"])
+            kernel_size = int(module_def["size"])
+            pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0
+            modules.add_module(
+                "conv_%d" % i,
+                IntConv2dWrapper(
+                    in_channels=output_filters[-1],
+                    out_channels=filters,
+                    kernel_size=kernel_size,
+                    stride=int(module_def["stride"]),
+                    padding=pad,
+                    bias=not bn,
+                ),
+            )
+            if bn:
+                after_bn = batch_norm(filters)
+                modules.add_module("batch_norm_%d" % i, after_bn)
+                # PyTorch > 1.2.0 initializes BN weights to 1; uniform init (the
+                # 1.0.1 default) was found to converge faster, so it is kept here.
+                nn.init.uniform_(after_bn.weight)
+                nn.init.zeros_(after_bn.bias)
+            if module_def["activation"] == "leaky":
+                modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1))
+
+        elif module_def["type"] == "maxpool":
+            kernel_size = int(module_def["size"])
+            stride = int(module_def["stride"])
+            if kernel_size == 2 and stride == 1:
+                modules.add_module("_debug_padding_%d" % i, nn.ZeroPad2d((0, 1, 0, 1)))
+            maxpool = nn.MaxPool2d(kernel_size, stride, padding=(kernel_size - 1) // 2)
+            modules.add_module("maxpool_%d" % i, maxpool)
+
+        elif module_def["type"] == "upsample":
+            upsample = Upsample(scale_factor=int(module_def["stride"]))
+            modules.add_module("upsample_%d" % i, upsample)
+
+        elif module_def["type"] == "route":
+            layers = [int(x) for x in module_def["layers"].split(",")]
+            # output_filters[0] is the input, so absolute layer k (incl. 0) is at k + 1.
+            filters = sum(output_filters[k + 1 if k >= 0 else k] for k in layers)
+            modules.add_module("route_%d" % i, EmptyLayer())
+
+        elif module_def["type"] == "shortcut":
+            filters = output_filters[int(module_def["from"])]
+            modules.add_module("shortcut_%d" % i, EmptyLayer())
+
+        elif module_def["type"] == "yolo":
+            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
+            # Extract anchors as (first, second) pairs, then keep the masked ones
+            values = [float(x) for x in module_def["anchors"].split(",")]
+            pairs = [(values[k], values[k + 1]) for k in range(0, len(values), 2)]
+            anchors = [pairs[k] for k in anchor_idxs]
+            nC = int(module_def["classes"])  # number of classes
+            img_size = (int(hyperparams["width"]), int(hyperparams["height"]))
+            # Define detection layer
+            yolo_layer = YOLOLayer(
+                anchors,
+                nC,
+                int(hyperparams["nID"]),
+                int(hyperparams["embedding_dim"]),
+                img_size,
+                yolo_layer_count,
+                device,
+            )
+            modules.add_module("yolo_%d" % i, yolo_layer)
+            yolo_layer_count += 1
+
+        # Register module list and number of output filters
+        module_list.append(modules)
+        output_filters.append(filters)
+
+    return hyperparams, module_list