[feat] support split squeeze natural bottleneck case for YOLOX Darknet53 at l13

chyomin06 · fracape · commit 7924471fb241 · 2025-03-03T22:34:40.000Z
diff --git a/cfgs/vision_model/default.yaml b/cfgs/vision_model/default.yaml
@@ -46,4 +46,5 @@ yolox_darknet53:
   conf_thres: 0.001
   nms_thres: 0.65
   weights: "weights/yolox/darknet53/yolox_darknet.pth"
-  splits: "l13" #"l37"
+  splits: "l13" #"l37"
+  squeeze_at_split: False
diff --git a/compressai_vision/model_wrappers/split_squeezes/squeeze_base.py b/compressai_vision/model_wrappers/split_squeezes/squeeze_base.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2025, InterDigital Communications, Inc
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the disclaimer
+# below) provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of InterDigital Communications, Inc nor the names of its
+#   contributors may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import torch.nn as nn
+
+
+class squeeze_base(nn.Module):  # Base class for squeeze/expand modules used at a split point.
+    def __init__(self, *args, **kwargs):  # NOTE: args/kwargs are accepted but not used here.
+        super().__init__()
+
+        self.squeeze_ftensor = None  # callable invoked by squeeze_(); None by default
+        self.expand_ftensor = None  # callable invoked by expand_(); None by default
+
+    @property
+    def address(self):  # Placeholder string; meant to be overridden with the weights URL.
+        return "PROVIDE URL"
+
+    def squeeze_(self, x):
+        # Default: calls self.squeeze_ftensor (None in this base class); override in subclasses.
+        return self.squeeze_ftensor(x)
+
+    def expand_(self, x):
+        # Default: calls self.expand_ftensor (None in this base class); override in subclasses.
+        return self.expand_ftensor(x)
diff --git a/compressai_vision/model_wrappers/split_squeezes/squeeze_yolox.py b/compressai_vision/model_wrappers/split_squeezes/squeeze_yolox.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2025, InterDigital Communications, Inc
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the disclaimer
+# below) provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of InterDigital Communications, Inc nor the names of its
+#   contributors may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import torch.nn as nn
+
+from .squeeze_base import squeeze_base
+
+
+# for YOLOX-Darknet53
+class three_convs_at_l13(squeeze_base):  # Three-conv squeeze (fw_block) / expand (bw_block) pair.
+    def __init__(self, C0, C1, C2, C3):  # channel counts: C0 -> C1 -> C2 -> C3 in fw_block, reversed in bw_block
+        super().__init__(C0, C1, C2, C3)
+
+        self.fw_block = nn.Sequential(  # squeeze path: C0 channels in, C3 channels out
+            nn.Conv2d(
+                in_channels=C0, out_channels=C1, kernel_size=3, padding=1, stride=1
+            ),
+            nn.PReLU(),
+            nn.Conv2d(
+                in_channels=C1, out_channels=C2, kernel_size=3, padding=1, stride=2  # only strided conv: roughly halves H and W
+            ),
+            nn.PReLU(),
+            nn.Conv2d(
+                in_channels=C2, out_channels=C3, kernel_size=1, padding=0, stride=1
+            ),
+            nn.SiLU(inplace=True),
+        )
+
+        self.bw_block = nn.Sequential(  # expand path: C3 channels in, C0 channels out
+            nn.Conv2d(
+                in_channels=C3, out_channels=C2, kernel_size=3, padding=1, stride=1
+            ),
+            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),  # mirrors the stride-2 conv; NOTE(review): odd H/W would come back one pixel larger - confirm inputs are even
+            nn.PReLU(),
+            nn.Conv2d(
+                in_channels=C2, out_channels=C1, kernel_size=3, padding=1, stride=1
+            ),
+            nn.PReLU(),
+            nn.Conv2d(
+                in_channels=C1, out_channels=C0, kernel_size=1, padding=0, stride=1
+            ),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+        )
+
+    @property
+    def address(self):  # URL of the .pth weights file for this module
+        return "https://dspub.blob.core.windows.net/compressai-vision/split_squeezes/yolox_darknet53/three_convs_squeeze_at_l13_of_yolox_darknet53-f78179c1.pth"
+
+    def squeeze_(self, x):  # overrides the base default: run the squeeze path only
+        return self.fw_block(x)
+
+    def expand_(self, x):  # overrides the base default: run the expand path only
+        return self.bw_block(x)
+
+    def forward(self, x):  # squeeze then expand; returns the reconstructed estimate of x
+        y = self.fw_block(x)
+        est_x = self.bw_block(y)
+        return est_x
diff --git a/compressai_vision/model_wrappers/yolox.py b/compressai_vision/model_wrappers/yolox.py
@@ -28,7 +28,6 @@
 # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-import configparser
 from enum import Enum
 from pathlib import Path
 from typing import Dict, List
@@ -40,6 +39,7 @@
 from compressai_vision.registry import register_vision_model
 
 from .base_wrapper import BaseWrapper
+from .split_squeezes import squeeze_yolox
 
 __all__ = [
     "yolox_darknet53",
@@ -76,7 +76,7 @@ def __init__(self, device: str, **kwargs):
         self.conf_thres = kwargs["conf_thres"]
         self.nms_thres = kwargs["nms_thres"]
 
-        self.supported_split_points = Split_Points
+        self.squeeze_at_split_enabled = False
 
         exp = get_exp(exp_file=None, exp_name="yolov3")
 
@@ -85,9 +85,10 @@ def __init__(self, device: str, **kwargs):
 
         assert "splits" in kwargs, "Split layer ids must be provided"
         self.split_id = str(kwargs["splits"]).lower()
-        if self.split_id == str(self.supported_split_points.Layer13_Single):
+
+        if self.split_id == str(Split_Points.Layer13_Single):
             self.split_layer_list = ["l13"]
-        elif self.split_id == str(self.supported_split_points.Layer37_Single):
+        elif self.split_id == str(Split_Points.Layer37_Single):
             self.split_layer_list = ["l37"]
         else:
             raise NotImplementedError
@@ -100,8 +101,12 @@ def __init__(self, device: str, **kwargs):
             torch.load(self.model_info["weights"], map_location="cpu")["model"],
             strict=False,
         )
+
         self.model.to(device).eval()
 
+        if bool(kwargs["squeeze_at_split"]):
+            self.enable_squeeze_at_split(self.split_id)
+
         self.yolo_fpn = self.model.backbone
         self.backbone = self.yolo_fpn.backbone
         self.head = self.model.head
@@ -112,11 +117,38 @@ def __init__(self, device: str, **kwargs):
 
     @property
     def SPLIT_L13(self):
-        return str(self.supported_split_points.Layer13_Single)
+        return str(Split_Points.Layer13_Single)
 
     @property
     def SPLIT_L37(self):
-        return str(self.supported_split_points.Layer37_Single)
+        return str(Split_Points.Layer37_Single)
+
+    def enable_squeeze_at_split(self, split_id):  # Build and load the squeeze model for split_id, or warn and keep squeezing disabled.
+        from torch.hub import load_state_dict_from_url  # function-scope import, used only by this method
+
+        LIST_OF_SQUEEZE_SUPPORT_SPLITS = [str(Split_Points.Layer13_Single)]  # split ids that have a squeeze model
+
+        if split_id in LIST_OF_SQUEEZE_SUPPORT_SPLITS:
+            self.squeeze_at_split_enabled = True
+            self.squeeze_model = squeeze_yolox.three_convs_at_l13(
+                C0=256, C1=256, C2=128, C3=128  # NOTE(review): presumably matches the l13 feature channels and the pretrained weights - confirm
+            )
+
+            state_dict = load_state_dict_from_url(
+                self.squeeze_model.address,
+                progress=True,
+                check_hash=True,  # torch.hub checks the SHA256 prefix embedded in the file name
+                map_location=self.device,
+            )
+
+            self.squeeze_model.load_state_dict(state_dict)
+            self.squeeze_model.to(self.device).eval()  # inference mode on the wrapper's device
+
+        else:
+            self.logger.warning(
+                f"Squeeze is not available at {split_id}. Currently only available at {LIST_OF_SQUEEZE_SUPPORT_SPLITS}"
+            )
+            self.squeeze_at_split_enabled = False  # unsupported split: fall back to the unsqueezed feature path
 
     def input_to_features(self, x, device: str) -> Dict:
         """Computes deep features at the intermediate layer(s) all the way from the input"""
@@ -126,9 +158,9 @@ def input_to_features(self, x, device: str) -> Dict:
         input_size = tuple(img.shape[2:])
 
         if self.split_id == self.SPLIT_L13:
-            output = self._input_to_feature_at_l13(img)
+            output = self._input_to_feature_at_l13(img, device)
         elif self.split_id == self.SPLIT_L37:
-            output = self._input_to_feature_at_l37(img)
+            output = self._input_to_feature_at_l37(img, device)
         else:
             self.logger.error(f"Not supported split point {self.split_id}")
             raise NotImplementedError
@@ -143,29 +175,36 @@ def features_to_output(self, x: Dict, device: str):
 
         if self.split_id == self.SPLIT_L13:
             return self._feature_at_l13_to_output(
-                x["data"], x["org_input_size"], x["input_size"]
+                x["data"], x["org_input_size"], x["input_size"], device
             )
         elif self.split_id == self.SPLIT_L37:
             return self._feature_at_l37_to_output(
-                x["data"], x["org_input_size"], x["input_size"]
+                x["data"], x["org_input_size"], x["input_size"], device
             )
         else:
             self.logger.error(f"Not supported split points {self.split_id}")
 
         raise NotImplementedError
 
     @torch.no_grad()
-    def _input_to_feature_at_l13(self, x):
+    def _input_to_feature_at_l13(self, x, device):
         """Computes and return feature at layer 13 with leaky relu all the way from the input"""
 
         y = self.backbone.stem(x)
         y = self.backbone.dark2(y)
-        self.features_at_splits[self.SPLIT_L13] = self.backbone.dark3[0](y)
+        y = self.backbone.dark3[0](y)
 
+        if not self.squeeze_at_split_enabled:
+            self.features_at_splits[self.SPLIT_L13] = y
+            return {"data": self.features_at_splits}
+
+        # Further squeeze the layer-13 feature with the squeeze model
+        smodel = self.squeeze_model.to(device)
+        self.features_at_splits[self.SPLIT_L13] = smodel.squeeze_(y)
         return {"data": self.features_at_splits}
 
     @torch.no_grad()
-    def _input_to_feature_at_l37(self, x):
+    def _input_to_feature_at_l37(self, x, device):
         """Computes and return feature at layer 37 with 11th residual layer output all the way from the input"""
 
         y = self.backbone.stem(x)
@@ -177,7 +216,7 @@ def _input_to_feature_at_l37(self, x):
 
     @torch.no_grad()
     def _feature_at_l13_to_output(
-        self, x: Dict, org_img_size: Dict, input_img_size: List
+        self, x: Dict, org_img_size: Dict, input_img_size: List, device
     ):
         """
         performs  downstream task using the features from layer 13
@@ -191,8 +230,13 @@ def _feature_at_l13_to_output(
         <https://github.com/Megvii-BaseDetection/YOLOX?tab=Apache-2.0-1-ov-file#readme>
 
         """
-
         y = x[self.SPLIT_L13]
+
+        # Recovery step: expand the squeezed feature back to its original dimensions
+        if self.squeeze_at_split_enabled:
+            smodel = self.squeeze_model.to(device)
+            y = smodel.expand_(y)
+
         for proc_module in self.backbone.dark3[1:]:
             y = proc_module(y)
 
@@ -220,7 +264,7 @@ def _feature_at_l13_to_output(
 
     @torch.no_grad()
     def _feature_at_l37_to_output(
-        self, x: Dict, org_img_size: Dict, input_img_size: List
+        self, x: Dict, org_img_size: Dict, input_img_size: List, device
     ):
         """
         performs  downstream task using the features from layer 37