diff --git a/models/experimental/panoptic_deeplab/README.md b/models/experimental/panoptic_deeplab/README.md
new file mode 100644
index 000000000000..dfc4a39aada3
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/README.md
@@ -0,0 +1,159 @@
+# Panoptic-DeepLab (TT-NN)
+
+**Platforms:** Wormhole (n150)
+**Supported Input Resolution:** `(512, 1024)` = (Height, Width)
+
+## Introduction
+Panoptic-DeepLab is a state-of-the-art bottom-up method for panoptic segmentation: it assigns a semantic label (e.g., person, dog, cat) to every pixel in the input image, as well as an instance id (e.g., 1, 2, 3) to pixels belonging to thing classes.
+
+This repository provides:
+- A **reference PyTorch model** for correctness.
+- A **TT-NN implementation** for Tenstorrent hardware (Wormhole).
+- A **demo pipeline**, **tests**, and **resources** (weights + sample assets).
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Repository Layout](#repository-layout)
+- [Weights](#weights)
+- [Quickstart](#quickstart)
+  - [Run Tests](#run-tests)
+  - [Run the Demo](#run-the-demo)
+  - [Custom Images](#custom-images)
+- [Performance](#performance)
+- [Configuration Notes](#configuration-notes)
+
+## Prerequisites
+- Clone the **tt-metal** repository (source code & toolchains).
+- Install **TT-Metalium™ / TT-NN™** by following the official instructions.
+- (Optional, for profiling) Build with the profiler enabled:
+  ```bash
+  ./build_metal.sh --enable-profiler
+  ```
+
+## Repository Layout
+```
+models/
+└── experimental/
+    └── panoptic_deeplab/
+        ├── resources/
+        │   ├── test_inputs/
+        │   │   └── input_torch_input.pt   # generated and stored at runtime
+        │   ├── input.png
+        │   ├── Panoptic_Deeplab_R52.pkl   # downloaded at runtime if not present
+        │   └── panoptic_deeplab_weights_download.sh
+        ├── reference/
+        │   ├── aspp.py
+        │   ├── decoder.py
+        │   ├── head.py
+        │   ├── panoptic_deeplab.py        # TorchPanopticDeepLab (reference)
+        │   ├── res_block.py
+        │   ├── resnet52_backbone.py
+        │   ├── resnet52_bottleneck.py
+        │   ├── resnet52_stem.py
+        │   └── utils.py
+        ├── tt/
+        │   ├── aspp.py
+        │   ├── backbone.py
+        │   ├── bottleneck.py
+        │   ├── custom_preprocessing.py
+        │   ├── decoder.py
+        │   ├── head.py
+        │   ├── panoptic_deeplab.py
+        │   ├── res_block.py
+        │   ├── stem.py
+        │   └── utils.py
+        ├── runner/
+        │   └── runner.py
+        ├── common.py
+        ├── README.md
+        ├── demo/
+        │   ├── config.py
+        │   ├── post_processing.py
+        │   └── panoptic_deeplab_demo.py   # CLI demo
+        └── tests/
+            ├── perf/
+            │   └── test_perf.py
+            └── pcc/
+                ├── test_panoptic_deeplab.py   # end-to-end pytest
+                ├── test_aspp.py
+                ├── test_decoder.py
+                ├── test_head.py
+                ├── test_residual_block.py
+                ├── test_resnet52_backbone.py
+                ├── test_resnet52_bottleneck.py
+                └── test_resnet52_stem.py
+```
+
+## Weights
+The default model expects `Panoptic_Deeplab_R52.pkl` in:
+
+```
+models/experimental/panoptic_deeplab/resources/Panoptic_Deeplab_R52.pkl
+```
+If missing, the code will attempt to run:
+```
+models/experimental/panoptic_deeplab/resources/panoptic_deeplab_weights_download.sh
+```
+Note: The weights are for Cityscapes panoptic segmentation with an R-52 backbone.
+
+## Quickstart
+### Run Tests
+```
+pytest models/experimental/panoptic_deeplab/tests/pcc/test_panoptic_deeplab.py
+```
+This runs an end-to-end flow that:
+
+- Loads the Torch reference,
+
+- Runs the TT-NN graph,
+
+- Post-processes outputs,
+
+- Optionally compares results and saves artifacts (a minimal programmatic sketch of the reference path follows below).
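+
+For orientation, here is a minimal sketch of the Torch reference path that the test exercises. It assumes the repo root is the working directory (so the weights can be found or auto-downloaded); the head order shown follows the demo's post-processing and is worth verifying against `reference/panoptic_deeplab.py`:
+
+```python
+import torch
+
+from models.experimental.panoptic_deeplab.common import load_torch_model_state
+from models.experimental.panoptic_deeplab.reference.panoptic_deeplab import TorchPanopticDeepLab
+
+# Load the reference model with the Cityscapes R-52 weights
+model = load_torch_model_state(TorchPanopticDeepLab().eval(), "panoptic_deeplab")
+
+# One BCHW input at the supported (512, 1024) resolution
+x = torch.randn(1, 3, 512, 1024)
+with torch.no_grad():
+    semantic_logits, offsets, centers = model(x)  # the three prediction heads
+```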
+
+### Run the Demo
+```
+python models/experimental/panoptic_deeplab/demo/panoptic_deeplab_demo.py \
+    --input <path/to/image.png> \
+    --output <output_dir>
+```
+### Custom Images
+You can place your image(s) under:
+```
+models/experimental/panoptic_deeplab/resources/
+```
+Then re-run the demo:
+```
+python models/experimental/panoptic_deeplab/demo/panoptic_deeplab_demo.py \
+    -i models/experimental/panoptic_deeplab/resources/input.png \
+    -o models/experimental/panoptic_deeplab/resources
+```
+Note: The bundled input image comes from the Cityscapes dataset, and post-processing assumes Cityscapes classes accordingly.
+
+To visualize a side-by-side comparison of the PyTorch and TT-NN head outputs, enable `save_comparison` in `demo/config.py`.
+
+
+## Performance
+### Single Device (BS=1)
+
+- End-to-end performance is `12.81` FPS.
+
+To run the perf test:
+```
+pytest models/experimental/panoptic_deeplab/tests/perf/test_perf.py
+```
+
+To collect perf reports with the profiler, build with `--enable-profiler` (see [Prerequisites](#prerequisites)).
+
+## Configuration Notes
+
+- Resolution: (H, W) = (512, 1024) is supported end-to-end.
+
+- Device: The demo opens a Wormhole device (default id 0). To change it, adjust `DemoConfig` or the device-open call in the demo.
+
+- Batch Size: The demo and tests are written for BS=1. For larger batch sizes you'll need to verify memory layouts and tile alignment.
+
+- Memory Layouts: The TT-NN path uses `ROW_MAJOR` layout for resize ops and may pad channels to multiples of 32 to satisfy kernel/tile alignment.
+
+- Weights: The loader maps Detectron/PDL checkpoint keys to internal module keys, and auto-downloads the weights via the included script if they are missing.
diff --git a/models/experimental/panoptic_deeplab/common.py b/models/experimental/panoptic_deeplab/common.py
new file mode 100644
index 000000000000..842f334b8ae3
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/common.py
@@ -0,0 +1,238 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+# SPDX-License-Identifier: Apache-2.0 + +import os +import pickle +import ttnn +import torch +import numpy as np +import torchvision.transforms as transforms + +from PIL import Image +from loguru import logger +from typing import Tuple, Optional, Any +from ttnn.model_preprocessing import infer_ttnn_module_args + +from models.experimental.panoptic_deeplab.reference.resnet52_backbone import ResNet52BackBone as TorchBackbone +from models.experimental.panoptic_deeplab.reference.resnet52_stem import DeepLabStem +from models.experimental.panoptic_deeplab.reference.aspp import ASPPModel +from models.experimental.panoptic_deeplab.reference.decoder import DecoderModel +from models.experimental.panoptic_deeplab.reference.res_block import ResModel +from models.experimental.panoptic_deeplab.reference.head import HeadModel +from models.experimental.panoptic_deeplab.reference.panoptic_deeplab import TorchPanopticDeepLab +from models.experimental.panoptic_deeplab.reference.resnet52_bottleneck import Bottleneck + + +# --------------------------- +# Key mapping & model loading +# --------------------------- + +key_mappings = { + # Semantic head mappings + "sem_seg_head.": "semantic_decoder.", + ".predictor.": ".head_1.predictor.", + ".head.pointwise.": ".head_1.conv2.", + ".head.depthwise.": ".head_1.conv1.", + # Instance head mappings + "ins_embed_head.": "instance_decoder.", + ".center_head.0.": ".head_2.conv1.", + ".center_head.1.": ".head_2.conv2.", + ".center_predictor.": ".head_2.predictor.", + ".offset_head.depthwise.": ".head_1.conv1.", + ".offset_head.pointwise.": ".head_1.conv2.", + ".offset_predictor.": ".head_1.predictor.", + # ASPP mappings (res5 -> aspp) + "decoder.res5.project_conv": "aspp", + # Decoder res3 mappings + ".decoder.res3.": ".res3.", + # Decoder res2 mappings + ".decoder.res2.": ".res2.", +} + + +def map_single_key(checkpoint_key): + for key, value in key_mappings.items(): + checkpoint_key = checkpoint_key.replace(key, value) + return checkpoint_key + + +def load_partial_state(torch_model: torch.nn.Module, state_dict, layer_name: str = ""): + partial_state_dict = {} + layer_prefix = layer_name + "." 
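+    # Strip the "<layer_name>." prefix so the remaining keys match the submodule's own parameter names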
+    for k, v in state_dict.items():
+        if k.startswith(layer_prefix):
+            partial_state_dict[k[len(layer_prefix) :]] = v
+    torch_model.load_state_dict(partial_state_dict, strict=True)
+    logger.info("Successfully loaded all mapped weights with strict=True")
+    return torch_model
+
+
+def load_torch_model_state(torch_model: torch.nn.Module = None, layer_name: str = "", model_location_generator=None):
+    if model_location_generator is None or "TT_GH_CI_INFRA" not in os.environ:
+        model_path = "models"
+    else:
+        model_path = model_location_generator("vision-models/panoptic_deeplab", model_subdir="", download_if_ci_v2=True)
+    if model_path == "models":
+        # Download Panoptic_Deeplab_R52.pkl via the bundled script if it is not already present
+        if not os.path.exists("models/experimental/panoptic_deeplab/resources/Panoptic_Deeplab_R52.pkl"):
+            os.system("bash models/experimental/panoptic_deeplab/resources/panoptic_deeplab_weights_download.sh")
+        weights_path = "models/experimental/panoptic_deeplab/resources/Panoptic_Deeplab_R52.pkl"
+    else:
+        weights_path = os.path.join(model_path, "Panoptic_Deeplab_R52.pkl")
+
+    # Load checkpoint
+    with open(weights_path, "rb") as f:
+        checkpoint = pickle.load(f, encoding="latin1")
+    state_dict = checkpoint["model"]
+
+    # Convert numpy arrays to torch tensors in place
+    converted_count = 0
+    for k, v in state_dict.items():
+        if isinstance(v, np.ndarray):
+            state_dict[k] = torch.from_numpy(v)
+            converted_count += 1
+    logger.debug(f"Converted {converted_count} numpy arrays to torch tensors")
+
+    # Map checkpoint keys to internal module keys
+    logger.info("Mapping keys...")
+    key_mapping = {checkpoint_key: map_single_key(checkpoint_key) for checkpoint_key in state_dict.keys()}
+
+    # Apply mappings
+    mapped_state_dict = {}
+    for checkpoint_key, model_key in key_mapping.items():
+        mapped_state_dict[model_key] = state_dict[checkpoint_key]
+    del mapped_state_dict["pixel_mean"]
+    del mapped_state_dict["pixel_std"]
+    logger.debug(f"Mapped {len(mapped_state_dict)} weights")
+
+    if isinstance(
+        torch_model,
+        (
+            DeepLabStem,
+            Bottleneck,
+            TorchBackbone,
+            ASPPModel,
+            ResModel,
+            HeadModel,
+            DecoderModel,
+        ),
+    ):
+        torch_model = load_partial_state(torch_model, mapped_state_dict, layer_name)
+    elif isinstance(torch_model, TorchPanopticDeepLab):
+        torch_model.load_state_dict(mapped_state_dict, strict=True)
+    else:
+        raise NotImplementedError("Unknown torch model. 
Weight loading not implemented") + + return torch_model.eval() + + +def _infer_and_set(module, params_holder, attr_name, run_fn): + """Infer conv args for a TTNN module and set them if present in parameters.""" + if hasattr(params_holder, attr_name): + args = infer_ttnn_module_args(model=module, run_model=run_fn, device=None) + getattr(params_holder, attr_name).conv_args = args + + +def _populate_decoder(torch_dec: torch.nn.Module = None, params_dec: dict = None): + """Warm up a single decoder (semantic or instance) to populate conv_args.""" + if not (torch_dec and params_dec): + return + + # Synthetic tensors that match typical Panoptic-DeepLab strides + input_tensor = torch.randn(1, 2048, 32, 64) + res3_tensor = torch.randn(1, 512, 64, 128) + res2_tensor = torch.randn(1, 256, 128, 256) + + # ASPP + _infer_and_set(torch_dec.aspp, params_dec, "aspp", lambda m: m(input_tensor)) + aspp_out = torch_dec.aspp(input_tensor) + + # res3 + _infer_and_set(torch_dec.res3, params_dec, "res3", lambda m: m(aspp_out, res3_tensor)) + res3_out = torch_dec.res3(aspp_out, res3_tensor) + + # res2 + _infer_and_set(torch_dec.res2, params_dec, "res2", lambda m: m(res3_out, res2_tensor)) + res2_out = torch_dec.res2(res3_out, res2_tensor) + + # heads (one or two, if present) + if hasattr(torch_dec, "head_1"): + _infer_and_set(torch_dec.head_1, params_dec, "head_1", lambda m: m(res2_out)) + if hasattr(torch_dec, "head_2"): + _infer_and_set(torch_dec.head_2, params_dec, "head_2", lambda m: m(res2_out)) + + +def _populate_all_decoders(torch_model: torch.nn.Module = None, parameters: dict = None): + if hasattr(parameters, "semantic_decoder"): + _populate_decoder(torch_model.semantic_decoder, parameters.semantic_decoder) + if hasattr(parameters, "instance_decoder"): + _populate_decoder(torch_model.instance_decoder, parameters.instance_decoder) + + +def preprocess_image( + image_path: str, input_width: int, input_height: int, ttnn_device: ttnn.Device, inputs_mesh_mapper: Optional[Any] +) -> Tuple[torch.Tensor, ttnn.Tensor, np.ndarray, Tuple[int, int]]: + """Preprocess image for both PyTorch and TTNN""" + # Load image + image = Image.open(image_path).convert("RGB") + original_size = image.size # (width, height) + original_array = np.array(image) + preprocess = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])] + ) + + # Resize to model input size + target_size = (input_width, input_height) # PIL expects (width, height) + image_resized = image.resize(target_size) + + # PyTorch preprocessing + torch_tensor = preprocess(image_resized).unsqueeze(0) # Add batch dimension + torch_tensor = torch_tensor.to(torch.float) + + # TTNN preprocessing + ttnn_tensor = None + ttnn_tensor = ttnn.from_torch( + torch_tensor.permute(0, 2, 3, 1), # BCHW -> BHWC + dtype=ttnn.bfloat16, + device=ttnn_device, + mesh_mapper=inputs_mesh_mapper, + ) + + if ttnn_tensor is not None: + _ = ttnn.to_torch(ttnn_tensor) + + return torch_tensor, ttnn_tensor, original_array, original_size + + +def save_preprocessed_inputs(torch_input: torch.Tensor, save_dir: str, filename: str): + """Save preprocessed inputs for testing purposes""" + + # Create directory for test inputs + test_inputs_dir = os.path.join(save_dir, "test_inputs") + os.makedirs(test_inputs_dir, exist_ok=True) + + # Save torch input tensor + torch_input_path = os.path.join(test_inputs_dir, f"{filename}_torch_input.pt") + torch.save( + { + "tensor": torch_input, + "shape": torch_input.shape, + "dtype": torch_input.dtype, + 
"mean": torch_input.mean().item(), + "std": torch_input.std().item(), + "min": torch_input.min().item(), + "max": torch_input.max().item(), + }, + torch_input_path, + ) + + logger.info(f"Saved preprocessed torch input to: {torch_input_path}") + + return torch_input_path diff --git a/models/experimental/panoptic_deeplab/demo/config.py b/models/experimental/panoptic_deeplab/demo/config.py new file mode 100644 index 000000000000..d1a8fbedf03b --- /dev/null +++ b/models/experimental/panoptic_deeplab/demo/config.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import List, Optional +import numpy as np + + +@dataclass +class DemoConfig: + """Configuration class for demo parameters""" + + # Model configuration + model_type: str = "PanopticDeepLab" + backbone: str = "ResNet-52" + num_classes: int = 19 + weights_path: Optional[str] = None + + # Input configuration + input_height: int = 512 + input_width: int = 1024 + crop_enabled: bool = False + normalize_enabled: bool = True + mean: List[float] = None + std: List[float] = None + + # thing instances + center_threshold: float = 0.01 + nms_kernel: int = 7 + top_k_instances: int = 400 + instance_score_threshold: float = 0.1 + + # stuff classes + stuff_area_threshold: int = 100 + min_stuff_area: int = 100 + + # For all detections + label_divisor: int = 256 + min_instance_area: int = 300 + max_distance: int = 150 + + # Device configuration + device_id: int = 0 + math_fidelity: str = "LoFi" + weights_dtype: str = "bfloat8_b" + activations_dtype: str = "bfloat8_b" + + # Output configuration + save_results: bool = True + save_comparison: bool = False # for different heads visualization + dual_pipeline: bool = False # for saving results in different pipelines + + # Dataset configuration (Cityscapes default) + thing_classes: List[int] = None + stuff_classes: List[int] = None + class_names: List[str] = None + + def __post_init__(self): + """Initialize default values after dataclass creation""" + if self.mean is None: + self.mean = [0.485, 0.456, 0.406] + if self.std is None: + self.std = [0.229, 0.224, 0.225] + if self.thing_classes is None: + self.thing_classes = [11, 12, 13, 14, 15, 16, 17, 18] # Cityscapes things + if self.stuff_classes is None: + self.stuff_classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # Cityscapes stuff + if self.class_names is None: + self.class_names = [ + "road", + "sidewalk", + "building", + "wall", + "fence", + "pole", + "traffic_light", + "traffic_sign", + "vegetation", + "terrain", + "sky", + "person", + "rider", + "car", + "truck", + "bus", + "train", + "motorcycle", + "bicycle", + ] + + def _get_cityscapes_colors(self) -> np.ndarray: + """Get Cityscapes color palette with enhanced visibility""" + return np.array( + [ + [128, 64, 128], # road (stuff) + [244, 35, 232], # sidewalk (stuff) + [70, 70, 70], # building (stuff) + [102, 102, 156], # wall + [190, 153, 153], # fence + [153, 153, 153], # pole + [250, 170, 30], # traffic light + [220, 220, 0], # traffic sign + [107, 142, 35], # vegetation + [152, 251, 152], # terrain + [70, 130, 180], # sky + [220, 20, 60], # person (thing) + [255, 0, 0], # rider (thing) + [0, 0, 142], # car (thing) + [0, 0, 70], # truck (thing) + [0, 60, 100], # bus (thing) + [0, 80, 100], # train (thing) + [0, 0, 230], # motorcycle (thing) + [119, 11, 32], # bicycle (thing) + ], + dtype=np.uint8, + ) diff --git a/models/experimental/panoptic_deeplab/demo/panoptic_deeplab_demo.py 
b/models/experimental/panoptic_deeplab/demo/panoptic_deeplab_demo.py
new file mode 100644
index 000000000000..6d66eb33bbd6
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/demo/panoptic_deeplab_demo.py
@@ -0,0 +1,385 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import json
+import os
+import time
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import ttnn
+from PIL import Image
+from loguru import logger
+
+from models.experimental.panoptic_deeplab.tt.panoptic_deeplab import TTPanopticDeepLab
+from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor
+from models.experimental.panoptic_deeplab.reference.panoptic_deeplab import TorchPanopticDeepLab
+from models.experimental.panoptic_deeplab.demo.post_processing import PostProcessing, PanopticVisualizer
+from models.experimental.panoptic_deeplab.demo.config import DemoConfig
+from models.experimental.panoptic_deeplab.common import (
+    _populate_all_decoders,
+    preprocess_image,
+    save_preprocessed_inputs,
+    load_torch_model_state,
+)
+
+
+class Demo:
+    """Panoptic-DeepLab demo supporting both PyTorch and TTNN pipelines."""
+
+    def __init__(self, config: DemoConfig) -> None:
+        self.config = config
+        self.torch_model: Optional[TorchPanopticDeepLab] = None
+        self.ttnn_model: Optional[TTPanopticDeepLab] = None
+        self.ttnn_device: Optional[Any] = None
+
+        # Visualization palette (Cityscapes)
+        self.colors = self.config._get_cityscapes_colors()
+        self.visualizer = PanopticVisualizer(self.config, alpha=0.4)
+
+        # Mesh mappers for TTNN
+        self.inputs_mesh_mapper = None
+        self.weights_mesh_mapper = None
+        self.output_mesh_composer = None
+
+    # ---------------------------------------------------------------------
+    # Initialization
+    # ---------------------------------------------------------------------
+
+    def initialize_torch_model(self) -> None:
+        """Initialize the PyTorch model and load weights."""
+        logger.info("Initializing PyTorch Panoptic-DeepLab model…")
+        model = TorchPanopticDeepLab().eval()
+        self.torch_model = load_torch_model_state(model, "panoptic_deeplab")
+        logger.info("PyTorch model ready.")
+
+    def initialize_ttnn_model(self) -> None:
+        """Initialize the TTNN model, preprocess parameters, and build the runtime graph."""
+        logger.info("Initializing TTNN Panoptic-DeepLab model…")
+
+        # Initialize TT device
+        self.ttnn_device = ttnn.open_device(device_id=self.config.device_id, l1_small_size=16384)
+
+        # Setup mesh mappers
+        self._setup_mesh_mappers()
+
+        # Create reference torch model to extract parameters
+        reference_model = self.torch_model or load_torch_model_state(TorchPanopticDeepLab().eval(), "panoptic_deeplab")
+
+        # Preprocess model parameters
+        from ttnn.model_preprocessing import preprocess_model_parameters
+
+        logger.info("Preprocessing model parameters for TTNN…")
+        parameters = preprocess_model_parameters(
+            initialize_model=lambda: reference_model,
+            custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper),
+            device=None,
+        )
+        _populate_all_decoders(reference_model, parameters)
+
+        model_config = {
+            "MATH_FIDELITY": ttnn.MathFidelity.LoFi,
+            "WEIGHTS_DTYPE": ttnn.bfloat8_b,
+            "ACTIVATIONS_DTYPE": ttnn.bfloat8_b,
+        }
+
+        # Create TTNN model
+        self.ttnn_model = TTPanopticDeepLab(parameters=parameters, model_config=model_config)
+        logger.info("TTNN model ready.")
+
+    def _setup_mesh_mappers(self) -> None:
+        """Setup 
mesh mappers for multi-device support.""" + if self.ttnn_device.get_num_devices() != 1: + self.inputs_mesh_mapper = ttnn.ShardTensorToMesh(self.ttnn_device, dim=0) + self.weights_mesh_mapper = None + self.output_mesh_composer = ttnn.ConcatMeshToTensor(self.ttnn_device, dim=0) + else: + self.inputs_mesh_mapper = None + self.weights_mesh_mapper = None + self.output_mesh_composer = None + + # --------------------------------------------------------------------- + # Inference + # --------------------------------------------------------------------- + + def run_torch_inference(self, input_tensor: torch.Tensor): + """Run PyTorch inference.""" + if self.torch_model is None: + raise RuntimeError("Torch model not initialized.") + logger.info("Running PyTorch inference…") + start = time.time() + with torch.no_grad(): + outputs = self.torch_model(input_tensor) # expected tuple of three heads + logger.info("PyTorch inference completed in {:.4f}s", time.time() - start) + return outputs # (outputs, outputs_2, outputs_3) + + def run_ttnn_inference(self, input_tensor: ttnn.Tensor): + """Run TTNN inference.""" + if self.ttnn_model is None or self.ttnn_device is None: + raise RuntimeError("TTNN model/device not initialized.") + logger.info("Running TTNN inference…") + start = time.time() + outputs = self.ttnn_model(input_tensor, self.ttnn_device) # expected tuple of three heads + logger.info("TTNN inference completed in {:.4f}s", time.time() - start) + return outputs # (ttnn_outputs, ttnn_outputs_2, ttnn_outputs_3) + + # --------------------------------------------------------------------- + # Visualization heads / IO + # --------------------------------------------------------------------- + def _create_head_visualization( + self, original_image: np.ndarray, results: Dict[str, Dict[str, np.ndarray]], save_path: str + ) -> None: + """visualization method for heads comparison.""" + from mpl_toolkits.axes_grid1 import make_axes_locatable + + has_torch = "torch" in results and results["torch"] + has_ttnn = "ttnn" in results and results["ttnn"] + + if has_torch and has_ttnn: + fig = plt.figure(figsize=(16, 20)) + gs = fig.add_gridspec(4, 4, height_ratios=[0.8, 1, 1, 1], hspace=0.3, wspace=0.2) + pipelines = ["torch", "ttnn"] + else: + fig = plt.figure(figsize=(16, 12)) + gs = fig.add_gridspec(3, 4, height_ratios=[0.8, 1, 1], hspace=0.3, wspace=0.2) + pipelines = ["torch"] if has_torch else ["ttnn"] + + # Row 0: Original image + ax_orig = fig.add_subplot(gs[0, 1:3]) + ax_orig.imshow(original_image) + ax_orig.set_title("Original Image", fontsize=14, fontweight="bold") + ax_orig.axis("off") + + for i in [0, 3]: + ax = fig.add_subplot(gs[0, i]) + ax.axis("off") + + # Rows 1-2: Pipeline outputs + for i, pipeline in enumerate(pipelines): + if pipeline not in results: + continue + pipeline_results = results[pipeline] + row = i + 1 + + if self.config.save_comparison: + # Semantic segmentation + ax_sem = fig.add_subplot(gs[row, 0]) + if "semantic_pred" in pipeline_results: + semantic_colored = self._colorize_segmentation(pipeline_results["semantic_pred"]) + ax_sem.imshow(semantic_colored) + ax_sem.set_title(f"{pipeline.upper()} Semantic", fontsize=11) + ax_sem.axis("off") + + # Centers + ax_center = fig.add_subplot(gs[row, 1]) + if "center_heatmap" in pipeline_results: + center = pipeline_results["center_heatmap"] + cmin, cmax = np.min(center), np.max(center) + norm = (center - cmin) / (cmax - cmin + 1e-8) + ax_center.imshow(original_image, alpha=0.5) + ax_center.imshow(norm, cmap="hot", alpha=0.5, vmin=0, vmax=1) + 
ax_center.set_title(f"{pipeline.upper()} Centers", fontsize=11) + ax_center.axis("off") + + # Offset magnitude + ax_offset = fig.add_subplot(gs[row, 2]) + if "offset_map" in pipeline_results: + offset = pipeline_results["offset_map"] + if offset.ndim == 3 and offset.shape[0] == 2: + mag = np.sqrt(offset[0] ** 2 + offset[1] ** 2) + else: + mag = np.asarray(offset) + vmax = float(np.max(mag)) if np.max(mag) > 0 else 1.0 + im = ax_offset.imshow(mag, cmap="viridis", vmin=0, vmax=vmax) + ax_offset.set_title(f"{pipeline.upper()} Offset", fontsize=11) + + divider = make_axes_locatable(ax_offset) + cax = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(im, cax=cax) + ax_offset.axis("off") + + fig.add_subplot(gs[row, 3]).axis("off") + + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + plt.savefig(save_path, dpi=150, bbox_inches="tight") + plt.close(fig) + + # --------------------------------------------------------------------- + # Colorize segmentation + # --------------------------------------------------------------------- + def _colorize_segmentation(self, segmentation: np.ndarray) -> np.ndarray: + """segmentation color.""" + h, w = segmentation.shape[-2], segmentation.shape[-1] + colored = np.zeros((h, w, 3), dtype=np.uint8) + + # ENSURE proper color mapping for each class + for class_id in range(min(self.config.num_classes, len(self.colors))): + mask = segmentation == class_id + if mask.any(): # Only apply color if class exists + colored[mask] = self.colors[class_id] + + return colored + + # --------------------------------------------------------------------- + # Save results + # --------------------------------------------------------------------- + def save_results( + self, results: Dict[str, Dict[str, np.ndarray]], original_image: np.ndarray, output_dir: str, filename: str + ) -> None: + """Save panoptic predictions to output directory.""" + Path(output_dir).mkdir(parents=True, exist_ok=True) + + for pipeline, pipeline_results in results.items(): + # choose destination dir + if self.config.dual_pipeline: + pipeline_dir = os.path.join(output_dir, pipeline) + else: + pipeline_dir = "models/experimental/panoptic_deeplab/resources" + Path(pipeline_dir).mkdir(parents=True, exist_ok=True) + + if "panoptic_pred" in pipeline_results: + transparent_overlay = self.visualizer.create_transparent_overlay( + original_image, pipeline_results["panoptic_pred"], alpha=0.7 + ) + + # Then add labels to the transparent overlay + labeled_image = self.visualizer.add_labels_to_panoptic( + transparent_overlay, # Use transparent overlay + pipeline_results["panoptic_pred"], + ) + + # Save labeled version + labeled_path = os.path.join(pipeline_dir, f"{filename}_panoptic.png") + Image.fromarray(labeled_image).save(labeled_path) + logger.info(f"Panoptic Segmentation Predictions Saved: {labeled_path}") + + # --------------------------------------------------------------------- + # Save metadata + # --------------------------------------------------------------------- + def _save_metadata( + self, image_path: str, results: Dict[str, Dict[str, Any]], output_dir: str, filename: str + ) -> None: + """Save metadata and comparison manifest for downstream analysis.""" + meta = { + "image_path": image_path, + "config": asdict(self.config), + "results": {"pipelines_run": list(results.keys())}, + "output_files": { + "panoptic": f"{filename}_panoptic.png", + "visualization": f"{filename}_comparison.png", + "original": f"{filename}.png", + }, + } + + path = os.path.join(output_dir, 
f"{filename}_metadata.json") + with open(path, "w") as f: + json.dump(meta, f, indent=2) + logger.info("Metadata saved: {}", path) + + # --------------------------------------------------------------------- + # Run demo + # --------------------------------------------------------------------- + def run_demo(self, image_path: str, output_dir: str) -> None: + """Run the full demo pipeline end-to-end.""" + logger.info("Starting demo for image: {}", image_path) + + # Initialize models (Torch + TTNN) + self.initialize_torch_model() + self.initialize_ttnn_model() + + # Preprocess image + torch_input, ttnn_input, original_image, original_size = preprocess_image( + image_path, self.config.input_width, self.config.input_height, self.ttnn_device, self.inputs_mesh_mapper + ) + + base_name = Path(image_path).stem + # Save preprocessed inputs (torch tensor + stats) for reproducibility + _ = save_preprocessed_inputs(torch_input, output_dir, base_name) + + # Run inference + torch_outputs = self.run_torch_inference(torch_input) # (o1, o2, o3) + ttnn_outputs = self.run_ttnn_inference(ttnn_input) # (o1, o2, o3) + + # Postprocess to comparable outputs + results = PostProcessing(self.config).postprocess_outputs( + *torch_outputs, + *ttnn_outputs, + original_size, + self.ttnn_device, + self.output_mesh_composer, + ) + + # Save results + if self.config.save_results: + self.save_results(results, original_image, output_dir, base_name) + + if self.config.save_comparison: + viz_path = os.path.join(output_dir, f"{base_name}_comparison.png") + self._create_head_visualization(original_image, results, viz_path) + self._save_metadata(image_path, results, output_dir, base_name) + + logger.info("Demo completed. Output dir: {}", output_dir) + + def cleanup(self) -> None: + """Release device resources.""" + if self.ttnn_device is not None: + try: + ttnn.close_device(self.ttnn_device) + logger.info("TTNN device closed.") + finally: + self.ttnn_device = None + + +# --------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------- +def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="TT Panoptic-DeepLab Demo") + parser.add_argument("--input", "-i", required=True, help="Path to input image") + parser.add_argument( + "--output", + "-o", + default="models/experimental/panoptic_deeplab/resources/outputs", + help="Output directory for results", + ) + return parser.parse_args(argv) + + +def main(argv: Optional[list[str]] = None) -> int: + args = _parse_args(argv) + + # Validate input file + if not args.input or not os.path.exists(args.input): + logger.error("Input image not found: {}", args.input) + return 1 + + # Prepare output directory + out_dir = args.output or "models/experimental/panoptic_deeplab/resources/outputs" + Path(out_dir).mkdir(parents=True, exist_ok=True) + + config = DemoConfig() + demo: Optional[Demo] = None + + logger.info("=== Panoptic-DeepLab Demo ===") + try: + demo = Demo(config) + demo.run_demo(args.input, out_dir) + return 0 + except Exception as e: + logger.exception("Demo failed: {}", e) + return 1 + finally: + if demo is not None: + try: + demo.cleanup() + except Exception: + pass + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/models/experimental/panoptic_deeplab/demo/post_processing.py b/models/experimental/panoptic_deeplab/demo/post_processing.py new file mode 100644 index 000000000000..1da34f2e7595 --- /dev/null +++ 
b/models/experimental/panoptic_deeplab/demo/post_processing.py
new file mode 100644
index 000000000000..1da34f2e7595
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/demo/post_processing.py
@@ -0,0 +1,1094 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.nn.functional as F
+from typing import Tuple, Optional, Dict, Any
+import numpy as np
+import cv2
+import logging
+from collections import Counter
+from models.experimental.panoptic_deeplab.demo.config import DemoConfig
+from PIL import Image, ImageDraw, ImageFont
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------
+# Post-processing class
+# ---------------------------------------------------------------------
+class PostProcessing:
+    """
+    Post-processing for Panoptic-DeepLab.
+    """
+
+    def __init__(self, config: DemoConfig):
+        self.config = config
+        self.center_threshold = self.config.center_threshold
+        self.nms_kernel = self.config.nms_kernel
+        self.top_k_instance = self.config.top_k_instances
+        self.thing_classes = self.config.thing_classes
+        self.stuff_classes = self.config.stuff_classes
+        self.stuff_area_threshold = self.config.stuff_area_threshold
+        self.label_divisor = self.config.label_divisor
+        self.instance_score_threshold = self.config.instance_score_threshold
+
+        # Create lookup sets for faster checking
+        self.thing_set = set(self.thing_classes)
+        self.stuff_set = set(self.stuff_classes)
+
+    def find_instance_center(self, center_heatmap, threshold=0.1, nms_kernel=7, top_k=200):
+        """
+        Find the center points from the center heatmap.
+        """
+        # Normalize shape to [N, C, H, W] (N usually 1, C usually 1)
+        if center_heatmap.dim() == 2:
+            center_heatmap = center_heatmap.unsqueeze(0).unsqueeze(0)  # [1,1,H,W]
+        elif center_heatmap.dim() == 3:
+            # could be [C,H,W] or [1,H,W]; make it [1,C,H,W]
+            center_heatmap = center_heatmap.unsqueeze(0)
+        elif center_heatmap.dim() == 4:
+            pass
+        else:
+            raise ValueError(f"Unsupported center_heatmap dim: {center_heatmap.dim()}")
+
+        # Apply threshold; values below the threshold become -1
+        center_heatmap = F.threshold(center_heatmap, threshold, -1.0)
+
+        # NMS via max pooling: only local maxima survive
+        nms_padding = (nms_kernel - 1) // 2
+        center_heatmap_max_pooled = F.max_pool2d(center_heatmap, kernel_size=nms_kernel, stride=1, padding=nms_padding)
+
+        # Keep only strong local maxima
+        center_heatmap[center_heatmap != center_heatmap_max_pooled] = -1.0
+
+        # Additional filtering: drop peaks below 30% of the global maximum
+        max_val = center_heatmap.max()
+        min_peak_value = max_val * 0.3
+        center_heatmap[center_heatmap < min_peak_value] = -1.0
+
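+        # Collapse back to a 2-D [H, W] map before extracting peak coordinates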
+ center_heatmap = center_heatmap.squeeze() + if center_heatmap.dim() != 2: + center_heatmap = center_heatmap[0] if center_heatmap.dim() > 2 else center_heatmap + + all_centers = torch.nonzero(center_heatmap > 0.0) + if all_centers.size(0) == 0: + return all_centers + + if top_k is not None and all_centers.size(0) > top_k: + scores = center_heatmap[all_centers[:, 0], all_centers[:, 1]] + _, top_indices = torch.topk(scores, min(top_k, scores.size(0))) + return all_centers[top_indices] + + return all_centers + + def group_pixels(self, center_points, offsets): + """Group pixels with stricter distance threshold.""" + height, width = offsets.size()[1:] + + # Generate coordinate map + y_coord, x_coord = torch.meshgrid( + torch.arange(height, dtype=offsets.dtype, device=offsets.device), + torch.arange(width, dtype=offsets.dtype, device=offsets.device), + indexing="ij", + ) + coord = torch.cat((y_coord.unsqueeze(0), x_coord.unsqueeze(0)), dim=0) + + # Predicted centers for each pixel + center_loc = coord + offsets + center_loc = center_loc.flatten(1).T.unsqueeze_(0) + center_points = center_points.unsqueeze(1).float() + + # Compute distances + distance = torch.norm(center_points - center_loc, dim=-1) + MAX_DISTANCE = ( + self.config.max_distance + ) # Any pixel farther than this from all centers is not assigned to any instance. + min_dist, instance_id = torch.min(distance, dim=0) + instance_id = instance_id.reshape((1, height, width)) + 1 + instance_id[min_dist.reshape((1, height, width)) > MAX_DISTANCE] = 0 + + return instance_id + + def get_instance_segmentation( + self, sem_seg, center_heatmap, offsets, thing_seg, thing_ids, threshold=0.1, nms_kernel=3, top_k=None + ): + """ + Post-processing for instance segmentation, gets class agnostic instance id. + """ + center_points = self.find_instance_center( + center_heatmap, threshold=threshold, nms_kernel=nms_kernel, top_k=top_k + ) + logger.info(f"Found {center_points.size(0)} instance centers") + if center_points.size(0) == 0: + return torch.zeros_like(sem_seg), center_points.unsqueeze(0) + ins_seg = self.group_pixels(center_points, offsets) + return thing_seg * ins_seg, center_points.unsqueeze(0) + + def merge_semantic_and_instance( + self, sem_seg, ins_seg, semantic_thing_seg, label_divisor, thing_ids, stuff_area, void_label + ): + """ + Post-processing for panoptic segmentation, by merging semantic segmentation + label and class agnostic instance segmentation label. + """ + # In case thing mask does not align with semantic prediction. + pan_seg = torch.zeros_like(sem_seg) + void_label + is_thing = (ins_seg > 0) & (semantic_thing_seg > 0) + + # Keep track of instance id for each class. + class_id_tracker = Counter() + + # Paste thing by majority voting. + instance_ids = torch.unique(ins_seg) + for ins_id in instance_ids: + if ins_id == 0: + continue + # Make sure only do majority voting within `semantic_thing_seg`. 
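+            # Each instance adopts the most frequent semantic label among its pixels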
+ thing_mask = (ins_seg == ins_id) & is_thing + if torch.nonzero(thing_mask).size(0) == 0: + continue + class_id, _ = torch.mode(sem_seg[thing_mask].view(-1)) + class_id_tracker[class_id.item()] += 1 + new_ins_id = class_id_tracker[class_id.item()] + pan_seg[thing_mask] = class_id * label_divisor + new_ins_id + + unique_pan = torch.unique(pan_seg) + thing_instances = sum(1 for p in unique_pan if (p % label_divisor) > 0 and (p // label_divisor) in thing_ids) + logger.info(f"Created {thing_instances} thing instances") + + class_ids = torch.unique(sem_seg) + for class_id in class_ids: + if class_id.item() in thing_ids: + continue + stuff_mask = (sem_seg == class_id) & (ins_seg == 0) + if stuff_mask.sum().item() >= stuff_area: + pan_seg[stuff_mask] = class_id * label_divisor + # Clamp all values to be >= 0 (removes any negative/void labels) + pan_seg = torch.clamp(pan_seg, min=0) + + road_class_id = 0 + if road_class_id in self.stuff_set: + # Find all pixels semantically classified as road + road_semantic_mask = sem_seg == road_class_id + + if road_semantic_mask.sum() > 1000: # If significant road area exists + # Find road pixels not already assigned to panoptic segments + unassigned_road_mask = road_semantic_mask & (pan_seg == void_label) + + if unassigned_road_mask.sum() > 500: # If significant unassigned road area + logger.info(f"Force-assigning {unassigned_road_mask.sum()} road pixels to panoptic") + # Assign road class ID (0) with label_divisor to create panoptic ID + pan_seg[unassigned_road_mask] = road_class_id * label_divisor + + # Also force-assign any road pixels that might be void + all_road_mask = road_semantic_mask & (pan_seg <= void_label) + if all_road_mask.sum() > 500: + logger.info(f"Force-assigning additional {all_road_mask.sum()} void road pixels") + pan_seg[all_road_mask] = road_class_id * label_divisor + + # Debug: Check final road assignment + final_road_panoptic = (pan_seg // label_divisor) == road_class_id + logger.info( + f"Final road panoptic assignment: {final_road_panoptic.sum()} pixels with ID {road_class_id * label_divisor}" + ) + + return pan_seg + + def get_panoptic_segmentation( + self, + sem_seg, + center_heatmap, + offsets, + thing_ids=[11, 12, 13, 14, 15, 16, 17, 18], + label_divisor=256, + stuff_area=2048, + void_label=0, + threshold=0.05, + nms_kernel=7, + top_k=200, + foreground_mask=None, + ): + """ + Post-processing for panoptic segmentation. + """ + # --- normalize sem_seg to [1,H,W] + if sem_seg.dim() == 4 and sem_seg.shape[0] == 1: + sem_seg = torch.argmax(sem_seg, dim=1) + elif sem_seg.dim() == 4 and sem_seg.shape[0] != 1: + # If batched, pick first item + sem_seg = torch.argmax(sem_seg, dim=1)[0].unsqueeze(0) + elif sem_seg.dim() == 3 and sem_seg.shape[0] == 1: + # already [1,H,W] + pass + elif sem_seg.dim() == 3: + # [C,H,W] ? 
we expected [1,H,W] - take first batch + sem_seg = sem_seg[0].unsqueeze(0) + elif sem_seg.dim() == 2: + sem_seg = sem_seg.unsqueeze(0) + else: + raise ValueError(f"Unexpected sem_seg shape: {sem_seg.shape}") + + # --- normalize center_heatmap to [1,1,H,W] + if center_heatmap.dim() == 2: + center_heatmap = center_heatmap.unsqueeze(0).unsqueeze(0) + elif center_heatmap.dim() == 3: + # if [1,H,W] -> [1,1,H,W], if [C,H,W] -> [1,C,H,W] + if center_heatmap.shape[0] == 1: + center_heatmap = center_heatmap.unsqueeze(1) + else: + center_heatmap = center_heatmap.unsqueeze(0) + elif center_heatmap.dim() == 4: + # keep as-is (N,C,H,W) + pass + else: + raise ValueError(f"Unexpected center_heatmap shape: {center_heatmap.shape}") + + # --- normalize offsets to [2,H,W] (instance offsets expected channel-first) + if offsets.dim() == 4: + # [N,C,H,W] -> squeeze batch if single + if offsets.shape[0] == 1: + offsets = offsets.squeeze(0) # [C,H,W] + else: + offsets = offsets[0] + elif offsets.dim() == 3: + # keep as [C,H,W] + pass + elif offsets.dim() == 2: + # [H,W] -> not valid for offsets but we will unsqueeze channel + offsets = offsets.unsqueeze(0) + else: + raise ValueError(f"Unexpected offsets shape: {offsets.shape}") + # Create thing mask + thing_seg = torch.zeros_like(sem_seg) + for thing_class in thing_ids: + thing_seg[sem_seg == thing_class] = 1 + + # Get instances + instance, center = self.get_instance_segmentation( + sem_seg, + center_heatmap, + offsets, + thing_seg, + thing_ids, + threshold=threshold, + nms_kernel=nms_kernel, + top_k=top_k, + ) + + # Merge semantic and instance + panoptic = self.merge_semantic_and_instance( + sem_seg, instance, thing_seg, label_divisor, thing_ids, stuff_area, void_label + ) + + return panoptic, center + + def postprocess_outputs( + self, + torch_outputs: torch.Tensor, + torch_outputs_2: torch.Tensor, + torch_outputs_3: torch.Tensor, + ttnn_outputs, + ttnn_outputs_2, + ttnn_outputs_3, + original_size: Tuple[int, int], + ttnn_device, + output_mesh_composer: Optional[Any], + ) -> Dict[str, Dict]: + """Process outputs from both PyTorch and TTNN pipelines.""" + results = {"torch": {}, "ttnn": {}} + + # Process PyTorch outputs + if torch_outputs is not None: + logger.info("Processing PyTorch outputs...") + try: + # Run panoptic fusion + panoptic_pred, _ = self.get_panoptic_segmentation(torch_outputs, torch_outputs_3, torch_outputs_2) + void_pixels = (panoptic_pred == -1).sum() + total_pixels = panoptic_pred.numel() + logger.info(f"Void pixels: {void_pixels}/{total_pixels} ({100*void_pixels/total_pixels:.1f}%)") + + # Extract individual outputs + semantic_pred = torch.argmax(torch_outputs, dim=1) + + # Convert to numpy and resize + results["torch"]["semantic_pred"] = cv2.resize( + semantic_pred[0].cpu().numpy().astype(np.uint8), original_size, interpolation=cv2.INTER_NEAREST + ) + + results["torch"]["panoptic_pred"] = cv2.resize( + panoptic_pred[0].cpu().numpy().astype(np.int32), original_size, interpolation=cv2.INTER_NEAREST + ) + + # save center heatmap + center_np = torch_outputs_3[0, 0].cpu().numpy() + results["torch"]["center_heatmap"] = cv2.resize( + center_np, original_size, interpolation=cv2.INTER_LINEAR + ) + + # save offset map + offset_np = torch_outputs_2[0].cpu().numpy() + results["torch"]["offset_map"] = np.stack( + [ + cv2.resize(offset_np[0], original_size, interpolation=cv2.INTER_LINEAR), + cv2.resize(offset_np[1], original_size, interpolation=cv2.INTER_LINEAR), + ] + ) + + except Exception as e: + logger.error(f"Error processing PyTorch outputs: {e}") 
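+                # Keep going so the TTNN branch can still produce results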
+ import traceback + + traceback.print_exc() + + # Process TTNN outputs (similar structure) + if ttnn_outputs is not None: + logger.info("Processing TTNN outputs...") + import ttnn + + try: + # Convert to PyTorch tensors + semantic_logits = ttnn.to_torch(ttnn_outputs, device=ttnn_device, mesh_composer=output_mesh_composer) + offset_map = ttnn.to_torch(ttnn_outputs_2, device=ttnn_device, mesh_composer=output_mesh_composer) + center_heatmap = ttnn.to_torch(ttnn_outputs_3, device=ttnn_device, mesh_composer=output_mesh_composer) + + # Reshape if needed (handle TTNN's NHWC format) + semantic_logits = self._reshape_ttnn_output(semantic_logits, "semantic_logits") + offset_map = self._reshape_ttnn_output(offset_map, "offset_map") + center_heatmap = self._reshape_ttnn_output(center_heatmap, "center_heatmap") + + # Run panoptic fusion + panoptic_pred, _ = self.get_panoptic_segmentation(semantic_logits, center_heatmap, offset_map) + + # Extract outputs + semantic_pred = torch.argmax(semantic_logits, dim=1) + + # Convert and resize + results["ttnn"]["semantic_pred"] = cv2.resize( + semantic_pred[0].cpu().numpy().astype(np.uint8), original_size, interpolation=cv2.INTER_NEAREST + ) + + results["ttnn"]["panoptic_pred"] = cv2.resize( + panoptic_pred[0].cpu().numpy().astype(np.int64), original_size, interpolation=cv2.INTER_NEAREST + ) + + # outputs + center_np = center_heatmap[0, 0].cpu().float().numpy() + results["ttnn"]["center_heatmap"] = cv2.resize(center_np, original_size, interpolation=cv2.INTER_LINEAR) + + offset_np = offset_map[0].cpu().float().numpy() + results["ttnn"]["offset_map"] = np.stack( + [ + cv2.resize(offset_np[0], original_size, interpolation=cv2.INTER_LINEAR), + cv2.resize(offset_np[1], original_size, interpolation=cv2.INTER_LINEAR), + ] + ) + + except Exception as e: + logger.error(f"Error processing TTNN outputs: {e}") + import traceback + + traceback.print_exc() + + return results + + def _reshape_ttnn_output(self, tensor: torch.Tensor, key: str) -> torch.Tensor: + """Handle TTNN's NHWC to NCHW conversion.""" + if not isinstance(tensor, torch.Tensor): + return tensor + + if tensor.dim() != 4: + return tensor + + N, d1, d2, d3 = tensor.shape + + if key == "semantic_logits": + expected_c = 19 # set this to dataset's num_classes + if d1 == expected_c: + return tensor # assume already NCHW + if d3 == expected_c: + return tensor.permute(0, 3, 1, 2) + + elif key == "center_heatmap": + # Center head is single channel + if d1 == 1: + return tensor + if d3 == 1: + return tensor.permute(0, 3, 1, 2) + + elif key == "offset_map": + if d1 == 2: + return tensor + if d3 == 2: + return tensor.permute(0, 3, 1, 2) + + return tensor + + +# --------------------------------------------------------------------- +# Panoptic visualizer class +# --------------------------------------------------------------------- +class PanopticVisualizer: + """Visualizer that adds labels to panoptic segmentation results.""" + + def __init__(self, config, alpha: float = 0.5): + self.config = config + self.colors = config._get_cityscapes_colors() + self.class_names = config.class_names + self.thing_classes = set(config.thing_classes) + self.stuff_classes = set(config.stuff_classes) + self.label_divisor = config.label_divisor + self.alpha = alpha # transparency level (0.5 = 50% transparent) + + # --------------------------------------------------------------------- + # Create transparent overlay + # --------------------------------------------------------------------- + def create_transparent_overlay( + self, original_image: 
np.ndarray, panoptic_pred: np.ndarray, alpha: Optional[float] = None + ) -> np.ndarray: + """Create a transparent colored overlay on the original image.""" + + if alpha is None: + alpha = self.alpha + + if original_image.shape[-1] != 3: + raise ValueError("Original image must be RGB") + + overlay = original_image.copy() + output = original_image.copy() + + unique_ids = np.unique(panoptic_pred) + + road_pixels_found = False + for pan_id in unique_ids: + if pan_id > 0: + semantic_class = int(pan_id // self.label_divisor) + if semantic_class == 0: + road_pixels_found = True + mask = panoptic_pred == pan_id + + # Process all panoptic IDs + for pan_id in unique_ids: + if pan_id == 0: # Skip void/background + continue + + semantic_class = int(pan_id // self.label_divisor) + instance_id = int(pan_id % self.label_divisor) + mask = panoptic_pred == pan_id + + if semantic_class < len(self.colors): + base_color = self.colors[semantic_class].copy() + + # Special handling for road (class 0) + if semantic_class == 0: + road_overlay_color = np.array([128, 64, 128], dtype=np.uint8) + overlay[mask] = road_overlay_color + continue + + # For thing instances, add color variation + if instance_id > 0 and semantic_class in self.thing_classes: + rng = np.random.default_rng(int(pan_id)) + jitter = rng.integers(-30, 31, size=3) + base_color = np.clip(base_color.astype(int) + jitter, 0, 255) + + overlay[mask] = base_color + + # If no road was found in panoptic, try to overlay road from semantic segmentation directly + if not road_pixels_found: + # Create a mask for any pixels that semantically should be road + potential_road_mask = ((panoptic_pred // self.label_divisor) == 0) & (panoptic_pred >= 0) + if potential_road_mask.any(): + road_overlay_color = np.array([128, 64, 128], dtype=np.uint8) + overlay[potential_road_mask] = road_overlay_color + + # Blend overlay with original image + cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output) + + return output + + # --------------------------------------------------------------------- + # Add text labels and instance IDs to panoptic segmentation results + # --------------------------------------------------------------------- + def add_labels_to_panoptic( + self, + image: np.ndarray, + panoptic_pred: np.ndarray, + ) -> np.ndarray: + """Add labels to panoptic segmentation results.""" + + labeled_image = image.copy() + h, w = labeled_image.shape[:2] + + font = ImageFont.load_default() + + pil_image = Image.fromarray(labeled_image) + draw = ImageDraw.Draw(pil_image) + + labeled_regions = [] + labeled_classes = set() + + # Get unique panoptic IDs and group by class + unique_ids = np.unique(panoptic_pred) + class_segments = {} + + for pan_id in unique_ids: + if pan_id > 0: + semantic_class = int(pan_id // self.label_divisor) + area = (panoptic_pred == pan_id).sum() + + if semantic_class not in class_segments: + class_segments[semantic_class] = [] + class_segments[semantic_class].append((pan_id, area)) + + # Process cars (class 13) + if 13 in class_segments: + car_segments = class_segments[13] + # Filter out very small car segments + car_segments = [(pid, area) for pid, area in car_segments if area > 300] + + # Group nearby car segments + consolidated_cars = self._consolidate_car_segments(panoptic_pred, car_segments) + + # Filter out very small consolidated cars + valid_cars = [ + (merged_mask, total_area) for merged_mask, total_area in consolidated_cars if total_area >= 800 + ] + + # Label cars with simple sequential numbering based on actual count + for car_num, 
(merged_mask, total_area) in enumerate(valid_cars, 1): + coords = np.column_stack(np.where(merged_mask)) + if len(coords) == 0: + continue + + # Find best position for this car cluster + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + # Try upper area for better visibility + upper_coords = coords[coords[:, 0] < np.percentile(coords[:, 0], 50)] + if len(upper_coords) > 20: + alt_y, alt_x = upper_coords.mean(axis=0).astype(int) + # Choose the position that's more central + if upper_coords.shape[0] > coords.shape[0] * 0.3: + centroid_y, centroid_x = alt_y, alt_x + + # Sequential numbering: car#1, car#2, car#3, car#4 (for multiple cars) + if len(valid_cars) > 1: + label = f"car#{car_num}" + else: + label = "car" + + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + # Position label + label_x = max(5, min(centroid_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(centroid_y - text_height // 2, h - text_height - 5)) + + # Check overlap with existing labels + overlap = False + for lx, ly, lw, lh in labeled_regions: + center_dist = np.sqrt( + (label_x + text_width / 2 - lx - lw / 2) ** 2 + (label_y + text_height / 2 - ly - lh / 2) ** 2 + ) + if center_dist < 100: # overlap distance + overlap = True + break + + if not overlap: + # Draw car label + padding = 4 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 240), + ) + + # Add shadow + draw.text((label_x + 1, label_y + 1), label, fill=(0, 0, 0, 180), font=font) + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + (label_x - padding, label_y - padding, text_width + 2 * padding, text_height + 2 * padding) + ) + + # Mark cars as processed + labeled_classes.add(13) + + # Process other vehicle classes + for vehicle_class in [14, 15, 16, 17, 18]: # truck, bus, train, motorcycle, bicycle + if vehicle_class in class_segments: + segments = class_segments[vehicle_class] + # Filter by size and sort by area (largest first) + valid_segments = [(pid, area) for pid, area in segments if area > 1000] + + if valid_segments: + valid_segments.sort(key=lambda x: x[1], reverse=True) + + # Label each instance with sequential numbering + for instance_num, (pan_id, area) in enumerate(valid_segments, 1): + mask = panoptic_pred == pan_id + coords = np.column_stack(np.where(mask)) + + if len(coords) > 0: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + if vehicle_class < len(self.class_names): + base_name = self.class_names[vehicle_class] + else: + base_name = f"vehicle_{vehicle_class}" + + # Add sequential numbering if multiple instances + if len(valid_segments) > 1: + label = f"{base_name}#{instance_num}" + else: + label = base_name + + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + label_x = max(5, min(centroid_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(centroid_y - text_height // 2, h - text_height - 5)) + + # Check overlap + overlap = False + for lx, ly, lw, lh in labeled_regions: + if not ( + label_x + text_width < lx + or label_x > lx + lw + or label_y + text_height < ly + or label_y > ly + lh + ): + overlap = True + break + + if not overlap: + padding = 3 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 
220), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + ( + label_x - padding, + label_y - padding, + text_width + 2 * padding, + text_height + 2 * padding, + ) + ) + + labeled_classes.add(vehicle_class) + + # Process people (class 11) + if 11 in class_segments: + person_segments = class_segments[11] + # Filter by size and sort by area + valid_persons = [(pid, area) for pid, area in person_segments if area > 800] + + if valid_persons: + valid_persons.sort(key=lambda x: x[1], reverse=True) + + # Label each person with sequential numbering + for person_num, (pan_id, area) in enumerate(valid_persons, 1): + mask = panoptic_pred == pan_id + coords = np.column_stack(np.where(mask)) + + if len(coords) > 0: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + # Add sequential numbering if multiple persons + if len(valid_persons) > 1: + label = f"person#{person_num}" + else: + label = "person" + + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + label_x = max(5, min(centroid_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(centroid_y - text_height // 2, h - text_height - 5)) + + # Check overlap + overlap = False + for lx, ly, lw, lh in labeled_regions: + center_dist = np.sqrt( + (label_x + text_width / 2 - lx - lw / 2) ** 2 + + (label_y + text_height / 2 - ly - lh / 2) ** 2 + ) + if center_dist < 80: + overlap = True + break + + if not overlap: + padding = 3 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 220), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + ( + label_x - padding, + label_y - padding, + text_width + 2 * padding, + text_height + 2 * padding, + ) + ) + + labeled_classes.add(11) + + # Process rider (class 12) + if 12 in class_segments: + rider_segments = class_segments[12] + # Filter by size and sort by area + valid_riders = [(pid, area) for pid, area in rider_segments if area > 600] + + if valid_riders: + valid_riders.sort(key=lambda x: x[1], reverse=True) + + # Label each rider with sequential numbering + for rider_num, (pan_id, area) in enumerate(valid_riders, 1): + mask = panoptic_pred == pan_id + coords = np.column_stack(np.where(mask)) + + if len(coords) > 0: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + # Add sequential numbering if multiple riders + if len(valid_riders) > 1: + label = f"rider#{rider_num}" + else: + label = "rider" + + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + label_x = max(5, min(centroid_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(centroid_y - text_height // 2, h - text_height - 5)) + + # Check overlap + overlap = False + for lx, ly, lw, lh in labeled_regions: + center_dist = np.sqrt( + (label_x + text_width / 2 - lx - lw / 2) ** 2 + + (label_y + text_height / 2 - ly - lh / 2) ** 2 + ) + if center_dist < 80: + overlap = True + break + + if not overlap: + padding = 3 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 220), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + ( + label_x - padding, + label_y - padding, + text_width + 2 * padding, + text_height + 2 * padding, + ) + ) + 
+ labeled_classes.add(12) + + # Process stuff classes (road, sidewalk, building, etc.) - one label per class + for stuff_class in self.stuff_classes: + if stuff_class in labeled_classes: + continue + + if stuff_class in class_segments: + segments = class_segments[stuff_class] + total_area = sum(area for _, area in segments) + + # Lower area threshold for fence, pole, and other small stuff classes + min_area_threshold = 1000 if stuff_class in [4, 5] else 5000 # fence=4, pole=5 + + if total_area > min_area_threshold: + # Create combined mask for all segments of this class + combined_mask = np.zeros_like(panoptic_pred, dtype=bool) + for pan_id, _ in segments: + combined_mask |= panoptic_pred == pan_id + + coords = np.column_stack(np.where(combined_mask)) + if len(coords) > 0: + # Smart positioning based on class + if stuff_class == 2: # building - upper area + upper_coords = coords[coords[:, 0] < np.percentile(coords[:, 0], 30)] + if len(upper_coords) > 0: + centroid_y, centroid_x = upper_coords.mean(axis=0).astype(int) + else: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + elif stuff_class == 0: # road - lower area + lower_coords = coords[coords[:, 0] > np.percentile(coords[:, 0], 70)] + if len(lower_coords) > 0: + centroid_y, centroid_x = lower_coords.mean(axis=0).astype(int) + else: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + elif stuff_class == 4: # fence - try right side area + right_coords = coords[coords[:, 1] > w * 0.7] + if len(right_coords) > 10: + centroid_y, centroid_x = right_coords.mean(axis=0).astype(int) + else: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + else: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + if stuff_class < len(self.class_names): + label = self.class_names[stuff_class] + else: + label = f"class_{stuff_class}" + + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + # Try multiple positions for better placement + positions_to_try = [ + (centroid_y, centroid_x), # Primary position + ] + + # Add alternative positions for fence and pole + if stuff_class in [4, 5]: # fence, pole + positions_to_try.extend( + [ + (centroid_y - 30, centroid_x), + (centroid_y + 30, centroid_x), + (centroid_y, centroid_x + 40), + (centroid_y, centroid_x - 40), + ] + ) + + label_placed = False + + for try_y, try_x in positions_to_try: + label_x = max(5, min(try_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(try_y - text_height // 2, h - text_height - 5)) + + # Check overlap - be more lenient for small classes + overlap = False + min_distance = 40 if stuff_class in [4, 5] else 60 + + for lx, ly, lw, lh in labeled_regions: + center_dist = np.sqrt( + (label_x + text_width / 2 - lx - lw / 2) ** 2 + + (label_y + text_height / 2 - ly - lh / 2) ** 2 + ) + if center_dist < min_distance: + overlap = True + break + + if not overlap: + padding = 2 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 200), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + ( + label_x - padding, + label_y - padding, + text_width + 2 * padding, + text_height + 2 * padding, + ) + ) + + labeled_classes.add(stuff_class) + label_placed = True + break + + else: + logger.debug(f"No coords found for stuff class {stuff_class}") + else: + logger.debug( + f"Insufficient area for stuff class {stuff_class}: {total_area} < 
{min_area_threshold}" + ) + else: + # For road (class 0), try semantic detection + if stuff_class == 0: + logger.debug("Road not in class_segments, trying semantic detection...") + road_semantic_mask = ((panoptic_pred // self.label_divisor) == 0) & (panoptic_pred >= 0) + + if road_semantic_mask.sum() > 10000: + coords = np.column_stack(np.where(road_semantic_mask)) + + if len(coords) > 0: + lower_coords = coords[coords[:, 0] > h * 0.6] + if len(lower_coords) > 100: + centroid_y = int(lower_coords[:, 0].mean()) + centroid_x = int(lower_coords[:, 1].mean()) + else: + centroid_y, centroid_x = coords.mean(axis=0).astype(int) + + label = "road" + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + label_x = max(5, min(centroid_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(centroid_y - text_height // 2, h - text_height - 5)) + + # Check overlap + overlap = False + for lx, ly, lw, lh in labeled_regions: + if not ( + label_x + text_width < lx + or label_x > lx + lw + or label_y + text_height < ly + or label_y > ly + lh + ): + overlap = True + break + + if not overlap: + padding = 3 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 220), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + + labeled_regions.append( + ( + label_x - padding, + label_y - padding, + text_width + 2 * padding, + text_height + 2 * padding, + ) + ) + + labeled_classes.add(0) + logger.debug("Successfully labeled road using semantic detection") + else: + # Fallback position for road; compute the label geometry here so this + # branch does not rely on variables set in the semantic-detection branch + label = "road" + bbox = draw.textbbox((0, 0), label, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + alt_x, alt_y = w // 2, int(h * 0.7) + label_x = max(5, min(alt_x - text_width // 2, w - text_width - 5)) + label_y = max(5, min(alt_y - text_height // 2, h - text_height - 5)) + + padding = 3 + draw.rectangle( + [ + label_x - padding, + label_y - padding, + label_x + text_width + padding, + label_y + text_height + padding, + ], + fill=(255, 255, 255, 240), + ) + + draw.text((label_x, label_y), label, fill=(0, 0, 0), font=font) + labeled_classes.add(0) + logger.debug("Road label placed at fallback position") + + # Only mark non-road classes as processed if they're not found + if stuff_class != 0: + logger.debug(f"Stuff class {stuff_class} not found in segments") + + return np.array(pil_image) + + # --------------------------------------------------------------------- + # Consolidate nearby car segments into single labels + # --------------------------------------------------------------------- + def _consolidate_car_segments(self, panoptic_pred, car_segments): + """Consolidate nearby car segments into single labels.""" + if not car_segments: + return [] + + h, w = panoptic_pred.shape + consolidated = [] + used_segments = set() + + # Sort by area (largest first) + car_segments.sort(key=lambda x: x[1], reverse=True) + + for pan_id, area in car_segments: + if pan_id in used_segments: + continue + + # Get mask for this car segment + main_mask = panoptic_pred == pan_id + main_coords = np.column_stack(np.where(main_mask)) + + if len(main_coords) == 0: + continue + + # Find nearby car segments to merge + merged_mask = main_mask.copy() + total_area = area + used_segments.add(pan_id) + + # Calculate main segment centroid + main_centroid = main_coords.mean(axis=0) + + # Look for other car segments within reasonable distance + for other_id, other_area in car_segments: + if other_id in used_segments or other_id == pan_id: + continue + + 
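# Compare the candidate's centroid to the main segment's; close segments are merged below +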
other_mask = panoptic_pred == other_id + other_coords = np.column_stack(np.where(other_mask)) + + if len(other_coords) == 0: + continue + + other_centroid = other_coords.mean(axis=0) + + # Calculate distance between segments + distance = np.linalg.norm(main_centroid - other_centroid) + + # Merge if segments are close enough (within ~80 pixels) + if distance < 80: + merged_mask |= other_mask + total_area += other_area + used_segments.add(other_id) + + consolidated.append((merged_mask, total_area)) + + return consolidated diff --git a/models/experimental/panoptic_deeplab/reference/aspp.py b/models/experimental/panoptic_deeplab/reference/aspp.py new file mode 100644 index 000000000000..2f80a3a5d08d --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/aspp.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.nn as nn + +from torch import Tensor +from copy import deepcopy +from models.experimental.panoptic_deeplab.reference.utils import Conv2d, DepthwiseSeparableConv2d + + +class ASPPModel(torch.nn.Module): + """ + ASPP MODULE + The input and output of `forward()` method must be NCHW tensors. + """ + + def __init__(self) -> None: + super().__init__() + dilations = [6, 12, 18] + in_channels = 2048 + out_channels = 256 + pool_kernel_size = (32, 64) + norm = nn.BatchNorm2d + activation = nn.ReLU() + use_bias = norm == "" + self.convs = nn.ModuleList() + + # conv 1x1 + self.convs.append( + Conv2d( + in_channels, + out_channels, + kernel_size=1, + bias=use_bias, + norm=nn.BatchNorm2d(out_channels), + activation=deepcopy(activation), + ) + ) + # atrous convs + for dilation in dilations: + self.convs.append( + DepthwiseSeparableConv2d( + in_channels, + out_channels, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm1=nn.BatchNorm2d(in_channels), + activation1=deepcopy(activation), + norm2=nn.BatchNorm2d(out_channels), + activation2=deepcopy(activation), + ) + ) + + # image pooling + # We do not add BatchNorm because the spatial resolution is 1x1, + # the original TF implementation has BatchNorm. + image_pooling = nn.Sequential( + nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1), + Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)), + ) + self.convs.append(image_pooling) + + self.project = Conv2d( + 5 * out_channels, + out_channels, + kernel_size=1, + bias=use_bias, + norm=nn.BatchNorm2d(out_channels), + activation=deepcopy(activation), + ) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass of ASPP Module. + + Args: + x: Input tensor of shape [N, C, H, W] + + Returns: + out: ASPP output + """ + size = x.shape[-2:] + res = [] + for conv in self.convs: + res.append(conv(x)) + res[-1] = nn.functional.interpolate(res[-1], size=size, mode="bilinear", align_corners=False) + res = torch.cat(res, dim=1) + res = self.project(res) + return res diff --git a/models/experimental/panoptic_deeplab/reference/decoder.py b/models/experimental/panoptic_deeplab/reference/decoder.py new file mode 100644 index 000000000000..1d77cfb56ae9 --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/decoder.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import torch +from torch import Tensor +from typing import Tuple + +from models.experimental.panoptic_deeplab.reference.aspp import ASPPModel +from models.experimental.panoptic_deeplab.reference.head import HeadModel +from models.experimental.panoptic_deeplab.reference.res_block import ResModel + + +class DecoderModel(torch.nn.Module): + """ + Modular decoder architecture. + The input and output of `forward()` method must be NCHW tensors. + + Args: + name (string): name of segmentation head + """ + + def __init__(self, name) -> None: + super().__init__() + self.name = name + self.aspp = ASPPModel() + if name == "semantic_decoder": + self.res3 = ResModel(512, 320, 256) + self.res2 = ResModel(256, 288, 256) + self.head_1 = HeadModel(256, 256, 19) + else: + self.res3 = ResModel(512, 320, 128) + self.res2 = ResModel(256, 160, 128) + self.head_1 = HeadModel(128, 32, 2) + self.head_2 = HeadModel(128, 32, 1) + + def forward(self, x: Tensor, res3: Tensor, res2: Tensor) -> Tuple[Tensor, Tensor]: + """ + Forward pass of Decoder Module. + + Args: + x: Input tensor of shape [N, C, H, W] + + Returns: + out: + out_2: + """ + out = self.aspp(x) + out = self.res3(out, res3) + out_ = self.res2(out, res2) + out = self.head_1(out_) + + if self.name == "instance_decoder": + out_2 = self.head_2(out_) + else: + out_2 = None + return out, out_2 diff --git a/models/experimental/panoptic_deeplab/reference/head.py b/models/experimental/panoptic_deeplab/reference/head.py new file mode 100644 index 000000000000..70773e038cb9 --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/head.py @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch.nn as nn +import torch +from torch import Tensor +from models.experimental.panoptic_deeplab.reference.utils import Conv2d + + +class HeadModel(torch.nn.Module): + """ + Decoder Head Module. + The input and output of `forward()` method must be NCHW tensors. + + Args: + in_channels (int): input channel length + intermediate_channels (int): intermediate channel length + out_channels (int): output channel length + """ + + def __init__(self, in_channels, intermediate_channels, out_channels) -> None: + super().__init__() + + if out_channels == 1: # instance center head + self.conv1 = Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=nn.BatchNorm2d(in_channels), + activation=nn.ReLU(), + ) + self.conv2 = Conv2d( + in_channels, + intermediate_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=nn.BatchNorm2d(intermediate_channels), + activation=nn.ReLU(), + ) + else: # instance offset head and semantics head + self.conv1 = Conv2d( + in_channels, + in_channels, + kernel_size=5, + stride=1, + padding=2, + groups=in_channels, + bias=False, + norm=nn.BatchNorm2d(in_channels), + activation=nn.ReLU(), + ) + self.conv2 = Conv2d( + in_channels, + intermediate_channels, + kernel_size=1, + stride=1, + bias=False, + norm=nn.BatchNorm2d(intermediate_channels), + activation=nn.ReLU(), + ) + self.predictor = Conv2d(intermediate_channels, out_channels, kernel_size=1, stride=1) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass of Head Module. 
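+ Applies conv1 and conv2, projects to out_channels with the predictor, then upsamples the logits 4x with bilinear interpolation (e.g., a [1, 256, 128, 256] semantic-head input yields [1, 19, 512, 1024] logits).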
+ + Args: + x: Input tensor of shape [N, C, H, W] + + Returns: + out: Segmentation Head output + """ + out = self.conv1(x) + out = self.conv2(out) + out = self.predictor(out) + out = nn.functional.interpolate(out, scale_factor=4, mode="bilinear") + return out diff --git a/models/experimental/panoptic_deeplab/reference/panoptic_deeplab.py b/models/experimental/panoptic_deeplab/reference/panoptic_deeplab.py new file mode 100644 index 000000000000..56f27bdc69bc --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/panoptic_deeplab.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.nn as nn +from typing import Tuple + +from models.experimental.panoptic_deeplab.reference.resnet52_backbone import ResNet52BackBone +from models.experimental.panoptic_deeplab.reference.decoder import DecoderModel + + +class TorchPanopticDeepLab(nn.Module): + """ + Panoptic DeepLab model using modular decoder architecture. + Combines semantic segmentation and instance segmentation with panoptic fusion. + """ + + def __init__( + self, + ) -> None: + super().__init__() + + # Backbone + self.backbone = ResNet52BackBone() + + # Semantic segmentation decoder + self.semantic_decoder = DecoderModel( + name="semantic_decoder", + ) + + # Instance segmentation decoders + self.instance_decoder = DecoderModel( + name="instance_decoder", + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Forward pass of Panoptic DeepLab. + + Args: + x: Input tensor of shape [B, C, H, W] + + Returns: + semantic_logits: Semantic segmentation logits + instance_offset_head_logits: Instance segmentation logits - offset head + instance_center_head_logits: Instance segmentation logits - center head + """ + + # Extract features from backbone + features = self.backbone(x) + + # Extract specific feature maps + backbone_features = features["res_5"] + res3_features = features["res_3"] + res2_features = features["res_2"] + + # Semantic segmentation branch + semantic_logits, _ = self.semantic_decoder(backbone_features, res3_features, res2_features) + + # Instance segmentation branch + instance_offset_head_logits, instance_center_head_logits = self.instance_decoder( + backbone_features, res3_features, res2_features + ) + + return semantic_logits, instance_offset_head_logits, instance_center_head_logits diff --git a/models/experimental/panoptic_deeplab/reference/res_block.py b/models/experimental/panoptic_deeplab/reference/res_block.py new file mode 100644 index 000000000000..b2f2f218210e --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/res_block.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch.nn as nn +import torch +from torch import Tensor +from models.experimental.panoptic_deeplab.reference.utils import Conv2d, DepthwiseSeparableConv2d + + +class ResModel(torch.nn.Module): + """ + Decoder Res Module. + The input and output of `forward()` method must be NCHW tensors. 
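+ Projects the skip feature with a 1x1 conv, concatenates it with the 2x bilinearly upsampled input, and fuses the result with a 5x5 depthwise separable conv (e.g., the semantic res3 block maps x of [N, 256, 32, 64] and a [N, 512, 64, 128] skip to [N, 256, 64, 128]).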
+ + Args: + in_channels (int): input channel length + intermediate_channels (int): intermediate channel length + out_channels (int): output channel length + """ + + def __init__(self, in_channels, intermediate_channels, out_channels) -> None: + super().__init__() + self.project_conv = Conv2d( + in_channels, + in_channels // 8, + kernel_size=1, + stride=1, + bias=False, + norm=nn.BatchNorm2d(in_channels // 8), + activation=nn.ReLU(), + ) + self.fuse_conv = DepthwiseSeparableConv2d( + intermediate_channels, + out_channels, + kernel_size=5, + padding=2, + dilation=1, + norm1=nn.BatchNorm2d(intermediate_channels), + activation1=nn.ReLU(), + norm2=nn.BatchNorm2d(out_channels), + activation2=nn.ReLU(), + ) + + def forward(self, x: Tensor, res2: Tensor) -> Tensor: + """ + Forward pass of Res Module. + + Args: + x: Input tensor of shape [N, C, H, W] + res2: Residual Input tensor of shape [N, C, H, W] + + Returns: + out: fused output feature map + """ + out = nn.functional.interpolate(x, scale_factor=2, mode="bilinear") + out_ = self.project_conv(res2) + out = torch.cat((out_, out), dim=1) + out = self.fuse_conv(out) + return out diff --git a/models/experimental/panoptic_deeplab/reference/resnet52_backbone.py b/models/experimental/panoptic_deeplab/reference/resnet52_backbone.py new file mode 100644 index 000000000000..f05c53e4fc1a --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/resnet52_backbone.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch.nn as nn +from typing import Callable, List, Optional +from torch import Tensor +from models.experimental.panoptic_deeplab.reference.resnet52_stem import DeepLabStem +from models.experimental.panoptic_deeplab.reference.utils import Conv2d +from models.experimental.panoptic_deeplab.reference.resnet52_bottleneck import Bottleneck + + +class ResNet52BackBone(nn.Module): + def __init__( + self, + block=Bottleneck, + layers=[3, 4, 6, 3], + groups: int = 1, + width_per_group: int = 64, + dialate_layer_config: Optional[List[List[int]]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 128 + if dialate_layer_config is None: + # each element in the list gives the dilation used + # in the 3x3 convs of the corresponding layer + dialate_layer_config = [[1] * num_layers for num_layers in layers] + if len(dialate_layer_config) != len(layers): + raise ValueError( + f"dialate_layer_config should be None or a {len(layers)}-element list, got {dialate_layer_config}" + ) + self.groups = groups + self.base_width = width_per_group + self.stem = DeepLabStem(in_channels=3, out_channels=self.inplanes, stride=1) + self.res2 = self._make_layer(block, 64, layers[0], stride=1, dialate_config=dialate_layer_config[0]) + self.res3 = self._make_layer(block, 128, layers[1], stride=2, dialate_config=dialate_layer_config[1]) + self.res4 = self._make_layer(block, 256, layers[2], stride=2, dialate_config=dialate_layer_config[2]) + self.res5 = self._make_layer(block, 512, layers[3], stride=1, dialate_config=[2, 4, 8]) + + def _make_layer( + self, + block: Bottleneck, + planes: int, + blocks: int, + stride: int = 1, + dialate_config: Optional[List[int]] = None, + ) -> nn.Sequential: + norm_layer = self._norm_layer + if dialate_config is None: + dialate_config = [1] * blocks + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + 
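# 1x1 strided projection so the identity branch matches the block's output shape +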
downsample = Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + norm=norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, planes, stride, downsample, self.groups, self.base_width, dialate_config[0], norm_layer + ) + ) + self.inplanes = planes * block.expansion + for block_index in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=dialate_config[block_index], + norm_layer=norm_layer, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + + res_2 = self.res2(x) + res_3 = self.res3(res_2) + res_4 = self.res4(res_3) + res_5 = self.res5(res_4) + out = {"res_2": res_2, "res_3": res_3, "res_5": res_5} + + return out diff --git a/models/experimental/panoptic_deeplab/reference/resnet52_bottleneck.py b/models/experimental/panoptic_deeplab/reference/resnet52_bottleneck.py new file mode 100644 index 000000000000..fd53e7d59c2e --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/resnet52_bottleneck.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +from torch import nn +from torch import Tensor +from models.experimental.panoptic_deeplab.reference.utils import Conv2d + + +class Bottleneck(nn.Module): + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample=None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer=None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.relu = nn.ReLU(inplace=True) + + width = int(planes * (base_width / 64.0)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = Conv2d( + inplanes, width, kernel_size=1, stride=1, bias=False, norm=norm_layer(width), activation=self.relu + ) + self.conv2 = Conv2d( + width, + width, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + norm=norm_layer(width), + activation=self.relu, + ) + self.conv3 = Conv2d( + width, + planes * self.expansion, + kernel_size=1, + stride=1, + bias=False, + norm=norm_layer(planes * self.expansion), + ) + self.shortcut = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + + out = self.conv2(out) + + out = self.conv3(out) + + if self.shortcut is not None: + identity = self.shortcut(x) + + out += identity + out = self.relu(out) + + return out diff --git a/models/experimental/panoptic_deeplab/reference/resnet52_stem.py b/models/experimental/panoptic_deeplab/reference/resnet52_stem.py new file mode 100644 index 000000000000..d0d87b141f6a --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/resnet52_stem.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +from torch import nn +from torch import Tensor +from models.experimental.panoptic_deeplab.reference.utils import Conv2d + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NCHW tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. 
+ + Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride) -> None: + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + +class DeepLabStem(CNNBlockBase): + """ + DeepLab ResNet stem module that processes the input image before the first residual block. + + This stem consists of three 3x3 convolution layers with intermediate Batch Normalization and ReLU, + followed by a 3x3 max pooling operation. The design reduces spatial resolution while increasing + the number of channels to prepare features for deeper layers. + + Attributes: + conv1 (nn.Conv2d): First convolution layer with stride 2 and out_channels // 2 filters. + bn1 (nn.BatchNorm2d): Batch normalization applied after conv1. + conv2 (nn.Conv2d): Second convolution layer maintaining channel size. + bn2 (nn.BatchNorm2d): Batch normalization applied after conv2. + conv3 (nn.Conv2d): Third convolution layer increasing to final out_channels. + bn3 (nn.BatchNorm2d): Batch normalization applied after conv3. + relu (nn.ReLU): In-place ReLU activation function. + maxpool (nn.MaxPool2d): 3x3 max pooling with stride 2 and padding 1. + + Args: + in_channels (int): Number of input channels. Default is 3 for RGB images. + out_channels (int): Number of output channels. Default is 128. + stride (int): Stride used for conv2 and conv3. Default is 1. + + Returns: + torch.Tensor: Output feature map after the stem block with reduced spatial resolution + and increased channel depth. + """ + + def __init__(self, in_channels=3, out_channels=128, stride=1) -> None: + """ + Initialize the DeepLabStem module. + + Args: + in_channels (int): Number of input channels. Default is 3 (e.g., RGB image). + out_channels (int): Number of output channels after the stem. Default is 128. + stride (int): Stride value for the second and third convolutions. Default is 1. + """ + super().__init__(in_channels, out_channels, 1) + self.in_channels = in_channels + self.conv1 = Conv2d( + in_channels, + out_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False, + norm=nn.BatchNorm2d(out_channels // 2), + activation=nn.ReLU(inplace=True), + ) + self.conv2 = Conv2d( + out_channels // 2, + out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + norm=nn.BatchNorm2d(out_channels // 2), + activation=nn.ReLU(inplace=True), + ) + self.conv3 = Conv2d( + out_channels // 2, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + norm=nn.BatchNorm2d(out_channels), + activation=nn.ReLU(inplace=True), + ) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.maxpool(x) + return x diff --git a/models/experimental/panoptic_deeplab/reference/utils.py b/models/experimental/panoptic_deeplab/reference/utils.py new file mode 100644 index 000000000000..915cc6e27637 --- /dev/null +++ b/models/experimental/panoptic_deeplab/reference/utils.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.nn.functional as F + + +# --------------------------- +# Torch utility modules +# --------------------------- + + +class Conv2d(torch.nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + + It assumes that norm layer is used before activation. + """ + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +class DepthwiseSeparableConv2d(torch.nn.Module): + """ + A kxk depthwise convolution + a 1x1 convolution. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + padding=1, + dilation=1, + *, + norm1=None, + activation1=None, + norm2=None, + activation2=None, + ): + """ + Args: + norm1, norm2 (str or callable): normalization for the two conv layers. + activation1, activation2 (callable(Tensor) -> Tensor): activation + function for the two conv layers. + """ + super().__init__() + self.depthwise = Conv2d( + in_channels, + in_channels, + kernel_size=kernel_size, + padding=padding, + dilation=dilation, + groups=in_channels, + bias=not norm1, + norm=norm1, + activation=activation1, + ) + self.pointwise = Conv2d( + in_channels, + out_channels, + kernel_size=1, + bias=not norm2, + norm=norm2, + activation=activation2, + ) + + def forward(self, x): + return self.pointwise(self.depthwise(x)) diff --git a/models/experimental/panoptic_deeplab/resources/input.png b/models/experimental/panoptic_deeplab/resources/input.png new file mode 100644 index 000000000000..f6330e36dd48 Binary files /dev/null and b/models/experimental/panoptic_deeplab/resources/input.png differ diff --git a/models/experimental/panoptic_deeplab/resources/input_panoptic.png b/models/experimental/panoptic_deeplab/resources/input_panoptic.png new file mode 100644 index 000000000000..21df6022a370 Binary files /dev/null and b/models/experimental/panoptic_deeplab/resources/input_panoptic.png differ diff --git a/models/experimental/panoptic_deeplab/resources/panoptic_deeplab_weights_download.sh b/models/experimental/panoptic_deeplab/resources/panoptic_deeplab_weights_download.sh new file mode 100755 index 000000000000..d02ded81a081 --- /dev/null +++ b/models/experimental/panoptic_deeplab/resources/panoptic_deeplab_weights_download.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Download your pretrained model: +wget https://dl.fbaipublicfiles.com/detectron2/PanopticDeepLab/Cityscapes-PanopticSegmentation/panoptic_deeplab_R_52_os16_mg124_poly_90k_bs32_crop_512_1024_dsconv/model_final_23d03a.pkl -O models/experimental/panoptic_deeplab/resources/Panoptic_Deeplab_R52.pkl diff --git a/models/experimental/panoptic_deeplab/runner/runner.py b/models/experimental/panoptic_deeplab/runner/runner.py new file mode 100644 index 000000000000..4bebeff752fa --- /dev/null +++ b/models/experimental/panoptic_deeplab/runner/runner.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from models.experimental.panoptic_deeplab.tt.panoptic_deeplab import TTPanopticDeepLab + + +class PanopticDeeplabRunner: + def __init__(self, parameters, model_config): + self.model = TTPanopticDeepLab(parameters, model_config) + + def run(self, input): + ttnn_output_tensor = self.model(input) + return ttnn_output_tensor diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_aspp.py b/models/experimental/panoptic_deeplab/tests/pcc/test_aspp.py new file mode 100644 index 000000000000..73699cca6139 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_aspp.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +from loguru import logger +import ttnn + +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.aspp import ASPPModel +from models.experimental.panoptic_deeplab.tt.aspp import TTASPP +from models.experimental.panoptic_deeplab.common import load_torch_model_state +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor + + +class ASPPTestInfra: + def __init__(self, device, batch_size, input_channels, height, width, model_config, name): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + self._model_initialized = True + + # Initialize core config + self.device = device + self.batch_size = batch_size + self.input_channels = input_channels + self.height = height + self.width = width + self.model_config = model_config + self.name = name + self.num_devices = device.get_num_devices() + + # Mesh mappers + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + + logger.info(f"Initializing ASPP test for module: {name}") + + # Torch model + torch_model = ASPPModel() + torch_model = load_torch_model_state(torch_model, name) + + # Create synthetic input + self.torch_input_tensor = self._create_input_tensor() + + # Run torch model + self.torch_output_tensor = torch_model(self.torch_input_tensor) + + # Preprocess model + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Initialize TTNN model + self.ttnn_model = TTASPP(parameters, model_config) + + # Prepare TTNN input + logger.info("Converting input to TTNN tensor...") + + # Run model and validate + + logger.info(f"Running TTNN model") + + # Re-convert input tensor (TTNN may deallocate buffers) + tt_host_tensor = ttnn.from_torch( + self.torch_input_tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat8_b, + device=self.device, + mesh_mapper=self.inputs_mesh_mapper, + ) + self.input_tensor = ttnn.to_device(tt_host_tensor, self.device, memory_config=ttnn.L1_MEMORY_CONFIG) + + # Optional: Re-instantiate model if it's not stateless + self.ttnn_model = TTASPP(parameters, self.model_config) + + self.run() + self.validate() + + def _create_input_tensor(self): + shape = (self.batch_size * self.num_devices, self.input_channels, self.height, self.width) + logger.info(f"Generating synthetic input tensor of shape {shape}") + return torch.randn(shape, dtype=torch.float32) + + @classmethod + def get_mesh_mappers(cls, device): + if 
device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), # inputs + None, # weights + ttnn.ConcatMeshToTensor(device, dim=0), # outputs + ) + return None, None, None + + def run(self): + logger.info("Running TTNN ASPP model...") + self.output_tensor = self.ttnn_model(self.input_tensor, self.device) + return self.output_tensor + + def _tt_to_torch_nchw(self, tt_tensor, expected_shape): + torch_tensor = ttnn.to_torch(tt_tensor, device=self.device, mesh_composer=self.output_mesh_composer) + torch_tensor = torch.reshape( + torch_tensor, + (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]), + ) + return torch.permute(torch_tensor, (0, 3, 1, 2)) + + def validate(self): + logger.info("Validating TTNN output against PyTorch...") + tt_output_tensor_torch = self._tt_to_torch_nchw(self.output_tensor, self.torch_output_tensor.shape) + + # Deallocate to save memory + ttnn.deallocate(self.output_tensor) + + pcc_threshold = 0.99 + passed, msg = check_with_pcc(self.torch_output_tensor, tt_output_tensor_torch, pcc=pcc_threshold) + assert passed, logger.error(f"ASPP PCC check failed: {msg}") + + logger.info( + f"ASPP layer `{self.name}` passed: " + f"batch_size={self.batch_size}, " + f"act_dtype={self.model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={self.model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={self.model_config['MATH_FIDELITY']}, " + f"PCC={msg}" + ) + + return True, msg + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, input_channels, height, width", + [ + (1, 2048, 32, 64), + ], +) +@pytest.mark.parametrize("name", ["semantic_decoder.aspp", "instance_decoder.aspp"]) +def test_aspp(device, batch_size, input_channels, height, width, name): + ASPPTestInfra( + device=device, + batch_size=batch_size, + input_channels=input_channels, + height=height, + width=width, + model_config=model_config, + name=name, + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_decoder.py b/models/experimental/panoptic_deeplab/tests/pcc/test_decoder.py new file mode 100644 index 000000000000..8b5f7d5eafba --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_decoder.py @@ -0,0 +1,275 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest +import torch +from loguru import logger + +import ttnn +from tests.ttnn.utils_for_testing import check_with_pcc +from ttnn.model_preprocessing import ( + preprocess_model_parameters, + infer_ttnn_module_args, +) + +from models.experimental.panoptic_deeplab.tt.decoder import ( + TTDecoder, + decoder_layer_optimisations, +) +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import ( + create_custom_mesh_preprocessor, +) +from models.experimental.panoptic_deeplab.reference.decoder import DecoderModel +from models.experimental.panoptic_deeplab.common import ( + _populate_decoder, + load_torch_model_state, +) + + +# ------------------------- +# Deterministic seeding once +# ------------------------- +class _SeedOnce: + _done = False + + @classmethod + def ensure(cls) -> None: + if cls._done: + return + torch.manual_seed(42) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + cls._done = True + + +# ------------------------- +# Test Infra +# ------------------------- +class DecoderTestInfra: + """Builds torch + TTNN decoder graphs, runs forward, and validates PCC. + + The flow mirrors the reference decoder with ASPP → res3 → res2 → heads. + """ + + def __init__( + self, + device: ttnn.Device, + batch_size: int, + model_config: dict, + in_channels: int, + res3_intermediate_channels: int, + res2_intermediate_channels: int, + out_channels: tuple[int, ...] | tuple[int] | int, + upsample_channels: int, + height: int, + width: int, + name: str, + ) -> None: + _SeedOnce.ensure() + + self.device = device + self.batch_size = batch_size + self.in_channels = in_channels + self.res3_intermediate_channels = res3_intermediate_channels + self.res2_intermediate_channels = res2_intermediate_channels + self.out_channels = out_channels + self.upsample_channels = upsample_channels + self.height = height + self.width = width + self.name = name + self.model_config = model_config + + # PCC state + self.pcc_passed: bool = False + self.pcc_message: str = "call validate()?" 
+ + # Mesh-mapping policy + ( + self.inputs_mesh_mapper, + self.weights_mesh_mapper, + self.output_mesh_composer, + ) = self._select_mesh_mappers(device) + + # ------------------------ + # Build reference (Torch) + # ------------------------ + self.torch_input_tensor = torch.randn((batch_size, in_channels, height, width), dtype=torch.float) + self.torch_res3_tensor = torch.randn((batch_size, 512, height * 2, width * 2), dtype=torch.float) + self.torch_res2_tensor = torch.randn((batch_size, upsample_channels, height * 4, width * 4), dtype=torch.float) + + torch_model = DecoderModel(name) + torch_model = load_torch_model_state(torch_model, name) + + # Preprocess weights w/ mesh-aware custom preprocessor + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Populate conv_args + _populate_decoder(torch_model, parameters) + + # Golden outputs + ( + self.torch_output_tensor, + self.torch_output_tensor_2, + ) = torch_model(self.torch_input_tensor, self.torch_res3_tensor, self.torch_res2_tensor) + + # ------------------------ + # Build device (TTNN) + # ------------------------ + tt_in = self._torch_to_ttnn_host(self.torch_input_tensor) + tt_res3 = self._torch_to_ttnn_host(self.torch_res3_tensor) + tt_res2 = self._torch_to_ttnn_host(self.torch_res2_tensor) + + self.input_tensor = ttnn.to_device(tt_in, device) + self.res3_tensor = ttnn.to_device(tt_res3, device) + self.res2_tensor = ttnn.to_device(tt_res2, device) + + logger.info("Initializing TTNN model…") + self.ttnn_model = TTDecoder( + parameters, + model_config, + layer_optimisations=decoder_layer_optimisations[name], + name=name, + ) + + self.run() + self.validate() + + # ------------------------- + # Helpers + # ------------------------- + def _select_mesh_mappers(self, device: ttnn.Device): + if device.get_num_devices() != 1: + inputs_mesh_mapper = ttnn.ShardTensorToMesh(device, dim=0) + weights_mesh_mapper = None + output_mesh_composer = ttnn.ConcatMeshToTensor(device, dim=0) + else: + inputs_mesh_mapper = None + weights_mesh_mapper = None + output_mesh_composer = None + return inputs_mesh_mapper, weights_mesh_mapper, output_mesh_composer + + def _torch_to_ttnn_host(self, tensor: torch.Tensor) -> ttnn.Tensor: + # NHWC + dtype conversion for TTNN host tensor + return ttnn.from_torch( + tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat8_b, + device=self.device, + mesh_mapper=self.inputs_mesh_mapper, + ) + + def _ttnn_to_torch(self, tensor: ttnn.Tensor, expected: torch.Size) -> torch.Tensor: + # Compose, reshape to N H W C, then back to N C H W + x = ttnn.to_torch(tensor, device=self.device, mesh_composer=self.output_mesh_composer) + x = torch.reshape(x, (expected[0], expected[2], expected[3], expected[1])) + return torch.permute(x, (0, 3, 1, 2)) + + def _maybe_infer_and_attach(self, module, attr_name: str, parameters, run): + if hasattr(parameters, attr_name): + args = infer_ttnn_module_args(model=module, run_model=run, device=None) + getattr(parameters, attr_name).conv_args = args + + # ------------------------- + # Execution + Validation + # ------------------------- + def run(self): + self.output_tensor, self.output_tensor_2 = self.ttnn_model( + self.input_tensor, + self.res3_tensor, + self.res2_tensor, + self.upsample_channels, + self.device, + ) + return self.output_tensor, self.output_tensor_2 + + def validate(self, output_tensor: ttnn.Tensor | None = None): + # --- Head 1 --- + out = self.output_tensor 
if output_tensor is None else output_tensor + out_torch = self._ttnn_to_torch(out, expected=self.torch_output_tensor.shape) + + valid_pcc = 0.99 + self.pcc_passed, self.pcc_message = check_with_pcc(self.torch_output_tensor, out_torch, pcc=valid_pcc) + assert self.pcc_passed, logger.error(f"PCC check failed: {self.pcc_message}") + + if "instance" in self.name: + head_name = "Instance Offset Head" + else: + head_name = "Semantic Head" + + logger.info( + f"{head_name}, batch_size={out_torch.shape[0]}, " + f"act_dtype={self.model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={self.model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={self.model_config['MATH_FIDELITY']}, PCC={self.pcc_message}, Shape={self.output_tensor.shape}" + ) + + # --- Head 2 (instance head only) --- + if "instance" in self.name: + out2_torch = self._ttnn_to_torch(self.output_tensor_2, expected=self.torch_output_tensor_2.shape) + + self.pcc_passed, self.pcc_message = check_with_pcc(self.torch_output_tensor_2, out2_torch, pcc=valid_pcc) + assert self.pcc_passed, logger.error(f"PCC check failed: {self.pcc_message}") + + logger.info( + f"Instance Center Head, batch_size={out2_torch.shape[0]}, " + f"act_dtype={self.model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={self.model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={self.model_config['MATH_FIDELITY']}, PCC={self.pcc_message}, Shape={self.output_tensor_2.shape}" + ) + + return self.pcc_passed, self.pcc_message + + +# ------------------------- +# Test parameters & entry +# ------------------------- +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, in_channels, res3_intermediate_channels, res2_intermediate_channels, out_channels, " + "upsample_channels, height, width, name", + [ + # semantic head + (1, 2048, 320, 288, (19,), 256, 32, 64, "semantic_decoder"), + # instance offset head + (1, 2048, 320, 160, (2, 1), 256, 32, 64, "instance_decoder"), + ], +) +def test_decoder( + device, + batch_size, + in_channels, + res3_intermediate_channels, + res2_intermediate_channels, + out_channels, + upsample_channels, + height, + width, + name, +): + DecoderTestInfra( + device=device, + batch_size=batch_size, + model_config=model_config, + in_channels=in_channels, + res3_intermediate_channels=res3_intermediate_channels, + res2_intermediate_channels=res2_intermediate_channels, + out_channels=out_channels, + upsample_channels=upsample_channels, + height=height, + width=width, + name=name, + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_head.py b/models/experimental/panoptic_deeplab/tests/pcc/test_head.py new file mode 100644 index 000000000000..6ff241190a69 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_head.py @@ -0,0 +1,176 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +from loguru import logger +import ttnn + +from ttnn.model_preprocessing import preprocess_model_parameters, infer_ttnn_module_args +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.head import HeadModel +from models.experimental.panoptic_deeplab.tt.head import TTHead, head_layer_optimisations +from models.experimental.panoptic_deeplab.common import load_torch_model_state +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor + + +class HeadTestInfra: + def __init__( + self, + device, + batch_size, + in_channels, + intermediate_channels, + out_channels, + height, + width, + model_config, + name, + ): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + self._model_initialized = True + + # Core config + self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size + self.in_channels = in_channels + self.intermediate_channels = intermediate_channels + self.out_channels = out_channels + self.height = height + self.width = width + self.model_config = model_config + self.name = name + + # Mesh mappers + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + + logger.info(f"Initializing Head test for module: {name}") + + # Torch model + torch_model = HeadModel(self.in_channels, self.intermediate_channels, self.out_channels) + torch_model = load_torch_model_state(torch_model, name) + + # Synthetic input + self.torch_input_tensor = self._create_input_tensor() + + # Torch output + self.torch_output_tensor = torch_model(self.torch_input_tensor) + + # Preprocess model parameters + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + parameters.conv_args = infer_ttnn_module_args( + model=torch_model, + run_model=lambda m: m(self.torch_input_tensor), + device=None, + ) + + # Initialize TTNN model + self.ttnn_model = TTHead(parameters, model_config, layer_optimisations=head_layer_optimisations[self.name]) + + # Run model in phases and validate + logger.info(f"Running TTNN Head model") + + # Rebuild TTNN input (since buffers may be freed across passes) + tt_host_tensor = ttnn.from_torch( + self.torch_input_tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat8_b, + device=self.device, + mesh_mapper=self.inputs_mesh_mapper, + ) + self.input_tensor = ttnn.to_device(tt_host_tensor, self.device, memory_config=ttnn.L1_MEMORY_CONFIG) + + # Optional: reinstantiate TTNN model + self.ttnn_model = TTHead(parameters, model_config, layer_optimisations=head_layer_optimisations[self.name]) + + self.run() + self.validate() + + def _create_input_tensor(self): + shape = (self.batch_size * self.num_devices, self.in_channels, self.height, self.width) + logger.info(f"Generating synthetic input tensor of shape {shape}") + return torch.randn(shape, dtype=torch.float32) + + @classmethod + def get_mesh_mappers(cls, device): + if device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), + None, + ttnn.ConcatMeshToTensor(device, dim=0), + ) + return None, None, None + + def run(self): + logger.info("Running TTNN Head model...") + self.output_tensor = self.ttnn_model(self.input_tensor, self.device) 
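+ # The TTNN output stays on device in NHWC layout; validate() converts it back to NCHW for comparison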
+ return self.output_tensor + + def _tt_to_torch_nchw(self, tt_tensor, expected_shape): + torch_tensor = ttnn.to_torch(tt_tensor, device=self.device, mesh_composer=self.output_mesh_composer) + torch_tensor = torch.reshape( + torch_tensor, + (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]), + ) + return torch.permute(torch_tensor, (0, 3, 1, 2)) + + def validate(self): + logger.info("Validating TTNN output against PyTorch...") + tt_output_tensor_torch = self._tt_to_torch_nchw(self.output_tensor, self.torch_output_tensor.shape) + + # Deallocate to save memory + ttnn.deallocate(self.output_tensor) + + pcc_threshold = 0.99 + passed, msg = check_with_pcc(self.torch_output_tensor, tt_output_tensor_torch, pcc=pcc_threshold) + assert passed, logger.error(f"Head PCC check failed: {msg}") + + logger.info( + f"Head `{self.name}` passed: " + f"batch_size={self.batch_size}, " + f"act_dtype={self.model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={self.model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={self.model_config['MATH_FIDELITY']}, " + f"PCC={msg}" + ) + + return True, msg + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, in_channels, intermediate_channels, out_channels, height, width, name", + [ + (1, 256, 256, 19, 128, 256, "semantic_decoder.head_1"), + (1, 128, 32, 2, 128, 256, "instance_decoder.head_1"), + (1, 128, 32, 1, 128, 256, "instance_decoder.head_2"), + ], +) +def test_head(device, batch_size, in_channels, intermediate_channels, out_channels, height, width, name): + HeadTestInfra( + device=device, + batch_size=batch_size, + in_channels=in_channels, + intermediate_channels=intermediate_channels, + out_channels=out_channels, + height=height, + width=width, + model_config=model_config, + name=name, + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_panoptic_deeplab.py b/models/experimental/panoptic_deeplab/tests/pcc/test_panoptic_deeplab.py new file mode 100644 index 000000000000..0f21aa79039c --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_panoptic_deeplab.py @@ -0,0 +1,224 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import os +import pytest +import torch +import ttnn +import numpy as np + +from pathlib import Path +from loguru import logger +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.panoptic_deeplab import TorchPanopticDeepLab +from models.experimental.panoptic_deeplab.tt.panoptic_deeplab import TTPanopticDeepLab +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.experimental.panoptic_deeplab.common import ( + load_torch_model_state, + preprocess_image, + save_preprocessed_inputs, + _populate_all_decoders, +) +import tracy + + +class PanopticDeepLabTestInfra: + def __init__( + self, + device, + batch_size, + in_channels, + height, + width, + model_config, + ): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) + self._model_initialized = True + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + + self.pcc_passed = False + self.pcc_message = "call validate()?" 
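+ # Core device/topology configuration and the sample input path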
+ self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size + self.in_channels = in_channels + self.height = height + self.width = width + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + self.real_input_path = "./models/experimental/panoptic_deeplab/resources/input.png" + + # Initialize torch model + torch_model = TorchPanopticDeepLab() + torch_model = load_torch_model_state(torch_model, "panoptic_deeplab") + + # Create input tensor + if self.real_input_path and os.path.exists(self.real_input_path): + self.torch_input_tensor, self.ttnn_input_tensor, self.original_image, self.original_size = preprocess_image( + self.real_input_path, self.width, self.height, self.device, self.inputs_mesh_mapper + ) + base_name = Path(self.real_input_path).stem + torch_input_path = save_preprocessed_inputs( + self.torch_input_tensor, "models/experimental/panoptic_deeplab/resources/test_inputs", base_name + ) + logger.info(f"Preprocessed inputs saved for testing: {torch_input_path}") + logger.info(f"Loading real input from: {self.real_input_path}") + self.torch_input_tensor = self.load_real_input(torch_input_path) + + # Verify shape matches expected dimensions + expected_shape = (batch_size * self.num_devices, in_channels, height, width) + if self.torch_input_tensor.shape != expected_shape: + logger.warning( + f"Input shape mismatch. Expected: {expected_shape}, Got: {self.torch_input_tensor.shape}" + ) + else: + logger.info("Using random input tensor (no real input provided)") + input_shape = (batch_size * self.num_devices, in_channels, height, width) + self.torch_input_tensor = torch.rand(input_shape, dtype=torch.float32) + + # Preprocess model parameters + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Populate conv_args for decoders via one small warm-up pass + _populate_all_decoders(torch_model, parameters) + + # Run torch model with bfloat16 + logger.info("Running PyTorch model...") + self.torch_output_tensor, self.torch_output_tensor_2, self.torch_output_tensor_3 = torch_model( + self.torch_input_tensor + ) + + # Convert input to TTNN format (NHWC) + logger.info("Converting input to TTNN format...") + tt_host_tensor = ttnn.from_torch( + self.torch_input_tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat16, + mesh_mapper=self.inputs_mesh_mapper, + ) + + # Initialize TTNN model + logger.info("Initializing TTNN model...") + self.ttnn_model = TTPanopticDeepLab( + parameters=parameters, + model_config=model_config, + ) + + # First run configures JIT, second run is optimized + for phase in ("JIT configuration", "optimized"): + tracy.signpost("start") + logger.info(f"Running TTNN model pass ({phase})...") + self.input_tensor = ttnn.to_device(tt_host_tensor, device) + self.run() + self.validate() + tracy.signpost("stop") + + def load_real_input(self, input_path: str) -> torch.Tensor: + """Load real input from saved file""" + + if input_path.endswith(".pt"): + # Load PyTorch tensor + data = torch.load(input_path, map_location="cpu") + if isinstance(data, dict): + tensor = data["tensor"] + logger.info(f"Loaded input metadata: {data.keys()}") + if "stats" in data: + logger.info(f"Original input stats: {data['stats']}") + else: + tensor = data + elif input_path.endswith(".npy"): + # Load numpy array + np_array = np.load(input_path) + tensor = torch.from_numpy(np_array) + else: + 
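# Only .pt and .npy input files are supported +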
raise ValueError(f"Unsupported input file format: {input_path}") + + # Ensure tensor is float32 + if tensor.dtype != torch.float32: + tensor = tensor.to(torch.float32) + + return tensor + + @classmethod + def get_mesh_mappers(self, device): + if device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), # inputs + None, # weights + ttnn.ConcatMeshToTensor(device, dim=0), # outputs + ) + return None, None, None + + @staticmethod + def _tt_to_torch_nchw(tt_tensor, device, mesh_composer, expected_shape): + """Convert TTNN NHWC tensor back to Torch NCHW and reshape to expected batch/shape.""" + t = ttnn.to_torch(tt_tensor, device=device, mesh_composer=mesh_composer) + t = torch.reshape(t, (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1])) + return torch.permute(t, (0, 3, 1, 2)) + + def run(self): + self.output_tensor, self.output_tensor_2, self.output_tensor_3 = self.ttnn_model(self.input_tensor, self.device) + return self.output_tensor, self.output_tensor_2, self.output_tensor_3 + + def validate(self): + """Validate three heads (semantic, offsets, centers) in a uniform loop.""" + checks = [ + ("Semantic Segmentation Head", self.output_tensor, self.torch_output_tensor), + ("Instance Segmentation Offset Head", self.output_tensor_2, self.torch_output_tensor_2), + ("Instance Segmentation Center Head", self.output_tensor_3, self.torch_output_tensor_3), + ] + + self._PCC_THRESH = 0.97 + + for name, tt_out, torch_ref in checks: + out = self._tt_to_torch_nchw(tt_out, self.device, self.output_mesh_composer, torch_ref.shape) + passed, msg = check_with_pcc(torch_ref, out, pcc=self._PCC_THRESH) + assert passed, logger.error(f"{name} PCC check failed: {msg}") + + logger.info( + f"Panoptic DeepLab - {name}: batch_size={self.batch_size}, " + f"act_dtype={model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={model_config['MATH_FIDELITY']}, " + f"PCC={msg}, shape={tt_out.shape}" + ) + + return True, f"All heads passed PCC ≥ {self._PCC_THRESH}" + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, in_channels, height, width", + [ + (1, 3, 512, 1024), + ], +) +def test_panoptic_deeplab( + device, + batch_size, + in_channels, + height, + width, +): + PanopticDeepLabTestInfra( + device, + batch_size, + in_channels, + height, + width, + model_config, + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_residual_block.py b/models/experimental/panoptic_deeplab/tests/pcc/test_residual_block.py new file mode 100644 index 000000000000..a142989bf5c8 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_residual_block.py @@ -0,0 +1,209 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import torch +import pytest + +from loguru import logger +from ttnn.model_preprocessing import preprocess_model_parameters, infer_ttnn_module_args +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.res_block import ResModel +from models.experimental.panoptic_deeplab.tt.res_block import TTRes, res_layer_optimisations +from models.experimental.panoptic_deeplab.common import load_torch_model_state +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor + + +class ResTestInfra: + def __init__( + self, + device, + batch_size, + in_channels, + upsample_channels, + intermediate_channels, + out_channels, + height_res, + width_res, + height, + width, + model_config, + name, + ): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + self._model_initialized = True + + # Core config + self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size + self.in_channels = in_channels + self.upsample_channels = upsample_channels + self.intermediate_channels = intermediate_channels + self.out_channels = out_channels + self.height_res = height_res + self.width_res = width_res + self.height = height + self.width = width + self.model_config = model_config + self.name = name + + # Mesh mappers + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + + logger.info(f"Initializing Res block test for module: {name}") + + # Torch model + torch_model = ResModel(self.in_channels, self.intermediate_channels, self.out_channels) + torch_model = load_torch_model_state(torch_model, name) + + # Synthetic inputs + self.torch_input_tensor, self.torch_res_input_tensor = self._create_input_tensors() + + # Torch output + self.torch_output_tensor = torch_model(self.torch_input_tensor, self.torch_res_input_tensor) + + # Preprocess model + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + parameters.conv_args = infer_ttnn_module_args( + model=torch_model, + run_model=lambda m: m(self.torch_input_tensor, self.torch_res_input_tensor), + device=None, + ) + + # Initialize TTNN model + self.ttnn_model = TTRes(parameters, model_config, layer_optimisations=res_layer_optimisations[self.name]) + + # Run phases + logger.info(f"Running TTNN Res block Model") + + # Rebuild TTNN inputs (buffers may be released) + self.input_tensor = self._to_ttnn_device(self.torch_input_tensor) + self.res_input_tensor = self._to_ttnn_device(self.torch_res_input_tensor) + + # Optional: reinstantiate model + self.ttnn_model = TTRes(parameters, model_config, layer_optimisations=res_layer_optimisations[self.name]) + + self.run() + self.validate() + + def _create_input_tensors(self): + shape_main = (self.batch_size * self.num_devices, self.upsample_channels, self.height, self.width) + shape_res = (self.batch_size * self.num_devices, self.in_channels, self.height_res, self.width_res) + logger.info(f"Generating main input tensor of shape {shape_main}") + logger.info(f"Generating residual input tensor of shape {shape_res}") + return ( + torch.randn(shape_main, dtype=torch.float32), + torch.randn(shape_res, dtype=torch.float32), + ) + + def _to_ttnn_device(self, tensor): + 
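+        # TT-NN convs consume NHWC, so permute from torch's NCHW layout before upload;
+        # bfloat8_b here matches model_config["ACTIVATIONS_DTYPE"] used by the block under test.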
host_tensor = ttnn.from_torch( + tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat8_b, + device=self.device, + mesh_mapper=self.inputs_mesh_mapper, + ) + return ttnn.to_device(host_tensor, self.device, memory_config=ttnn.L1_MEMORY_CONFIG) + + @classmethod + def get_mesh_mappers(cls, device): + if device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), + None, + ttnn.ConcatMeshToTensor(device, dim=0), + ) + return None, None, None + + def run(self): + logger.info("Running TTNN Res block model...") + self.output_tensor = self.ttnn_model( + self.input_tensor, self.res_input_tensor, self.upsample_channels, self.device + ) + return self.output_tensor + + def _tt_to_torch_nchw(self, tt_tensor, expected_shape): + torch_tensor = ttnn.to_torch(tt_tensor, device=self.device, mesh_composer=self.output_mesh_composer) + torch_tensor = torch.reshape( + torch_tensor, + (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]), + ) + return torch.permute(torch_tensor, (0, 3, 1, 2)) + + def validate(self): + logger.info("Validating TTNN output against PyTorch...") + tt_output_tensor_torch = self._tt_to_torch_nchw(self.output_tensor, self.torch_output_tensor.shape) + + # Deallocate to save memory + ttnn.deallocate(self.output_tensor) + + pcc_threshold = 0.99 + passed, msg = check_with_pcc(self.torch_output_tensor, tt_output_tensor_torch, pcc=pcc_threshold) + assert passed, logger.error(f"Res PCC check failed: {msg}") + + logger.info( + f"Res block `{self.name}` passed: " + f"batch_size={self.batch_size}, " + f"act_dtype={self.model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={self.model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={self.model_config['MATH_FIDELITY']}, " + f"PCC={msg}" + ) + + return True, msg + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, in_channels, upsample_channels, intermediate_channels, out_channels, height_res, width_res, height, width, name", + [ + (1, 512, 256, 320, 256, 64, 128, 32, 64, "semantic_decoder.res3"), + (1, 256, 256, 288, 256, 128, 256, 64, 128, "semantic_decoder.res2"), + (1, 512, 256, 320, 128, 64, 128, 32, 64, "instance_decoder.res3"), + (1, 256, 128, 160, 128, 128, 256, 64, 128, "instance_decoder.res2"), + ], +) +def test_res( + device, + batch_size, + in_channels, + upsample_channels, + intermediate_channels, + out_channels, + height_res, + width_res, + height, + width, + name, +): + ResTestInfra( + device=device, + batch_size=batch_size, + in_channels=in_channels, + upsample_channels=upsample_channels, + intermediate_channels=intermediate_channels, + out_channels=out_channels, + height_res=height_res, + width_res=width_res, + height=height, + width=width, + model_config=model_config, + name=name, + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_backbone.py b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_backbone.py new file mode 100644 index 000000000000..2bf96d1fce1f --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_backbone.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import torch +import pytest + +from loguru import logger +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.resnet52_backbone import ResNet52BackBone as TorchBackbone +from models.experimental.panoptic_deeplab.tt.backbone import TTBackbone +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.experimental.panoptic_deeplab.common import load_torch_model_state + + +class BackboneTestInfra: + def __init__(self, device, batch_size, in_channels, height, width, model_config, name): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) # Only seed once + self._model_initialized = True + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + + self.pcc_passed = False + self.pcc_message = "call validate()?" + self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size * self.num_devices + self.name = name + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + + # Torch model + weights + torch_model = TorchBackbone() + torch_model = load_torch_model_state(torch_model, name) + + # Torch input + golden output + self.torch_input_tensor = torch.randn((self.batch_size, in_channels, height, width), dtype=torch.float) + self.torch_output = torch_model(self.torch_input_tensor) + + # Preprocess parameters + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Convert input to TTNN host tensor + def to_ttnn_host(tensor): + return ttnn.from_torch( + tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat16, + mesh_mapper=self.inputs_mesh_mapper, + ) + + tt_host_tensor = to_ttnn_host(self.torch_input_tensor) + + # TTNN model + self.ttnn_model = TTBackbone(parameters=parameters, model_config=model_config) + + # Move input to device + self.input_tensor = ttnn.to_device(tt_host_tensor, device) + + # Run + validate + self.run() + self.validate() + + def get_mesh_mappers(self, device): + if device.get_num_devices() != 1: + inputs_mesh_mapper = ttnn.ShardTensorToMesh(device, dim=0) + weights_mesh_mapper = None + output_mesh_composer = ttnn.ConcatMeshToTensor(device, dim=0) + else: + inputs_mesh_mapper = None + weights_mesh_mapper = None + output_mesh_composer = None + return inputs_mesh_mapper, weights_mesh_mapper, output_mesh_composer + + def run(self): + self.output_tensor = self.ttnn_model(self.input_tensor, self.device) + return self.output_tensor + + def validate(self, output_tensor=None): + tt_output = self.output_tensor if output_tensor is None else output_tensor + + valid_pcc = {"res_2": 0.99, "res_3": 0.99, "res_5": 0.99} + self.pcc_passed_all = [] + self.pcc_message_all = [] + + for key in tt_output: + tt_output_tensor_torch = ttnn.to_torch( + tt_output[key], + dtype=self.torch_output[key].dtype, + device=self.device, + mesh_composer=self.output_mesh_composer, + ) + + # Free device memory + ttnn.deallocate(tt_output[key]) + + expected_shape = self.torch_output[key].shape + tt_output_tensor_torch = torch.reshape( + tt_output_tensor_torch, (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]) + ) + tt_output_tensor_torch = torch.permute(tt_output_tensor_torch, (0, 3, 
1, 2)) + + pcc_passed, pcc_message = check_with_pcc(self.torch_output[key], tt_output_tensor_torch, pcc=valid_pcc[key]) + self.pcc_passed_all.append(pcc_passed) + self.pcc_message_all.append(pcc_message) + + assert all(self.pcc_passed_all), logger.error(f"PCC check failed: {self.pcc_message_all}") + logger.info( + f"ResNet52 Backbone - batch_size={self.batch_size}, " + f"act_dtype={model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={model_config['MATH_FIDELITY']}, " + f"PCC={self.pcc_message_all}" + ) + + return self.pcc_passed_all, self.pcc_message_all + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, in_channels, height, width, name", + [ + (1, 3, 512, 1024, "backbone"), + ], +) +def test_backbone(device, batch_size, in_channels, height, width, name): + BackboneTestInfra(device, batch_size, in_channels, height, width, model_config, name) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_bottleneck.py b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_bottleneck.py new file mode 100644 index 000000000000..c6b316345c43 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_bottleneck.py @@ -0,0 +1,171 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import torch +import pytest + +from loguru import logger +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.tt.bottleneck import TTBottleneck, get_bottleneck_optimisation +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.experimental.panoptic_deeplab.common import load_torch_model_state +from models.experimental.panoptic_deeplab.reference.resnet52_bottleneck import Bottleneck +from models.experimental.panoptic_deeplab.reference.utils import Conv2d + + +class BottleneckTestInfra: + def __init__( + self, device, batch_size, inplanes, planes, height, width, stride, dilation, downsample, name, model_config + ): + super().__init__() + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) # Seed once for determinism + self._model_initialized = True + torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + + self.pcc_passed = False + self.pcc_message = "call validate()?" 
+ self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size + self.name = name + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + + # Optional downsample layer + downsample_conv = None + if downsample: + downsample_conv = Conv2d( + inplanes, + planes * Bottleneck.expansion, + kernel_size=1, + stride=stride, + bias=False, + norm=torch.nn.BatchNorm2d(planes * Bottleneck.expansion), + ) + + # Torch model + torch_model = Bottleneck( + inplanes=inplanes, planes=planes, stride=stride, dilation=dilation, downsample=downsample_conv + ) + torch_model = load_torch_model_state(torch_model, name) + + # Torch input + golden output + input_shape = (batch_size * self.num_devices, inplanes, height, width) + self.torch_input_tensor = torch.randn(input_shape, dtype=torch.float) + self.torch_output_tensor = torch_model(self.torch_input_tensor) + + # Preprocess model params + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Convert input to TTNN host tensor + def to_ttnn_host(tensor): + return ttnn.from_torch( + tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat16, + mesh_mapper=self.inputs_mesh_mapper, + ) + + tt_host_tensor = to_ttnn_host(self.torch_input_tensor) + + # TTNN model + self.ttnn_model = TTBottleneck( + parameters=parameters, + downsample=downsample, + stride=stride, + dilation=dilation, + name=name, + model_config=model_config, + layer_optimisations=get_bottleneck_optimisation(name), + ) + + # Move input to device + self.input_tensor = ttnn.to_device(tt_host_tensor, device) + + # Run + validate + self.run() + self.validate() + + def get_mesh_mappers(self, device): + if device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), # inputs + None, # weights + ttnn.ConcatMeshToTensor(device, dim=0), # outputs + ) + return None, None, None + + def run(self): + self.output_tensor, _ = self.ttnn_model(self.input_tensor, self.device, self.input_tensor.shape) + return self.output_tensor + + def validate(self, output_tensor=None): + tt_output_tensor = self.output_tensor if output_tensor is None else output_tensor + tt_output_tensor_torch = ttnn.to_torch( + tt_output_tensor, device=self.device, mesh_composer=self.output_mesh_composer + ) + + # Free device memory + ttnn.deallocate(tt_output_tensor) + + expected_shape = self.torch_output_tensor.shape + tt_output_tensor_torch = torch.reshape( + tt_output_tensor_torch, (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]) + ) + tt_output_tensor_torch = torch.permute(tt_output_tensor_torch, (0, 3, 1, 2)) + + valid_pcc = 0.99 + self.pcc_passed, self.pcc_message = check_with_pcc( + self.torch_output_tensor, tt_output_tensor_torch, pcc=valid_pcc + ) + + assert self.pcc_passed, logger.error(f"PCC check failed: {self.pcc_message}") + logger.info( + f"Bottleneck `{self.name}` passed: " + f"batch_size={self.batch_size}, " + f"act_dtype={model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={model_config['MATH_FIDELITY']}, " + f"PCC={self.pcc_message}" + ) + + return self.pcc_passed, self.pcc_message + + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) 
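+# One case per backbone stage below: H/W halve at the strided res3.0/res4.0 entry blocks,
+# while res5 keeps stride 1 and grows the dilation instead (2, 4, 8).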
+@pytest.mark.parametrize( + "batch_size, inplanes, planes, height, width, stride, dilation, downsample, name", + [ + # res2 + (1, 128, 64, 128, 256, 1, 1, True, "backbone.res2.0"), + (1, 256, 64, 128, 256, 1, 1, False, "backbone.res2.1"), + # res3 + (1, 256, 128, 128, 256, 2, 1, True, "backbone.res3.0"), + (1, 512, 128, 64, 128, 1, 1, False, "backbone.res3.1"), + # res4 + (1, 512, 256, 64, 128, 2, 1, True, "backbone.res4.0"), + (1, 1024, 256, 32, 64, 1, 1, False, "backbone.res4.1"), + # res5 + (1, 1024, 512, 32, 64, 1, 2, True, "backbone.res5.0"), + (1, 2048, 512, 32, 64, 1, 4, False, "backbone.res5.1"), + (1, 2048, 512, 32, 64, 1, 8, False, "backbone.res5.2"), + ], +) +def test_bottleneck(device, batch_size, inplanes, planes, height, width, stride, dilation, downsample, name): + BottleneckTestInfra( + device, batch_size, inplanes, planes, height, width, stride, dilation, downsample, name, model_config + ) diff --git a/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_stem.py b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_stem.py new file mode 100644 index 000000000000..949f74ce0c4b --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/pcc/test_resnet52_stem.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import torch +import pytest + +from loguru import logger +from ttnn.model_preprocessing import preprocess_model_parameters +from tests.ttnn.utils_for_testing import check_with_pcc + +from models.experimental.panoptic_deeplab.reference.resnet52_stem import DeepLabStem +from models.experimental.panoptic_deeplab.tt.stem import resnet52Stem, neck_optimisations +from models.experimental.panoptic_deeplab.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.experimental.panoptic_deeplab.common import load_torch_model_state + + +class Resnet52StemTestInfra: + def __init__(self, device, batch_size, inplanes, planes, height, width, stride, model_config, name): + super().__init__() + self._init_seeds() + self.device = device + self.num_devices = device.get_num_devices() + self.batch_size = batch_size * self.num_devices + self.inputs_mesh_mapper, self.weights_mesh_mapper, self.output_mesh_composer = self.get_mesh_mappers(device) + self.name = name + + # Build reference torch model + torch_model = DeepLabStem(in_channels=inplanes, out_channels=planes, stride=stride) + torch_model = load_torch_model_state(torch_model, name) + + # Preprocess parameters for TTNN + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_model, + custom_preprocessor=create_custom_mesh_preprocessor(self.weights_mesh_mapper), + device=None, + ) + + # Prepare golden inputs/outputs + input_shape = (self.batch_size, inplanes, height, width) + self.torch_input_tensor = torch.randn(input_shape, dtype=torch.float) + self.torch_output_tensor = torch_model(self.torch_input_tensor) + + # Convert input to TTNN format + tt_host_tensor = ttnn.from_torch( + self.torch_input_tensor.permute(0, 2, 3, 1), + dtype=ttnn.bfloat16, + mesh_mapper=self.inputs_mesh_mapper, + ) + self.input_tensor = ttnn.to_device(tt_host_tensor, device) + + # Build TTNN model + self.ttnn_model = resnet52Stem( + parameters=parameters, + stride=stride, + model_config=model_config, + layer_optimisations=neck_optimisations, + ) + + # Run + validate + self.run() + self.validate(model_config) + + def _init_seeds(self): + if not hasattr(self, "_model_initialized"): + torch.manual_seed(42) + 
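+            # Also seed the CUDA generators so the torch reference stays reproducible on GPU builds.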
torch.cuda.manual_seed_all(42) + torch.backends.cudnn.deterministic = True + self._model_initialized = True + + def get_mesh_mappers(self, device): + if device.get_num_devices() != 1: + return ( + ttnn.ShardTensorToMesh(device, dim=0), + None, + ttnn.ConcatMeshToTensor(device, dim=0), + ) + return None, None, None + + def run(self): + self.output_tensor = self.ttnn_model(self.input_tensor, self.device) + return self.output_tensor + + def validate(self, model_config, output_tensor=None): + tt_output_tensor = self.output_tensor if output_tensor is None else output_tensor + tt_output_tensor_torch = ttnn.to_torch( + tt_output_tensor, + device=self.device, + mesh_composer=self.output_mesh_composer, + ) + + # Deallocate output tensor + ttnn.deallocate(tt_output_tensor) + + # Reshape + permute back to NCHW + expected_shape = self.torch_output_tensor.shape + tt_output_tensor_torch = torch.reshape( + tt_output_tensor_torch, + (expected_shape[0], expected_shape[2], expected_shape[3], expected_shape[1]), + ) + tt_output_tensor_torch = torch.permute(tt_output_tensor_torch, (0, 3, 1, 2)) + + # PCC validation + pcc_passed, pcc_message = check_with_pcc(self.torch_output_tensor, tt_output_tensor_torch, pcc=0.99) + assert pcc_passed, logger.error(f"PCC check failed: {pcc_message}") + + logger.info( + f"ResNet52 Stem Block [{self.name}] - " + f"batch_size={self.batch_size}, " + f"act_dtype={model_config['ACTIVATIONS_DTYPE']}, " + f"weight_dtype={model_config['WEIGHTS_DTYPE']}, " + f"math_fidelity={model_config['MATH_FIDELITY']}, " + f"PCC={pcc_message}" + ) + return pcc_passed, pcc_message + + +# Default model config +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) +@pytest.mark.parametrize( + "batch_size, inplanes, planes, height, width, stride, name", + [ + (1, 3, 128, 512, 1024, 1, "backbone.stem"), + ], +) +def test_stem(device, batch_size, inplanes, planes, height, width, stride, name): + Resnet52StemTestInfra( + device, + batch_size, + inplanes, + planes, + height, + width, + stride, + model_config, + name, + ) diff --git a/models/experimental/panoptic_deeplab/tests/perf/test_perf.py b/models/experimental/panoptic_deeplab/tests/perf/test_perf.py new file mode 100644 index 000000000000..f04aadf768fa --- /dev/null +++ b/models/experimental/panoptic_deeplab/tests/perf/test_perf.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from models.perf.device_perf_utils import check_device_perf, prep_device_perf_report, run_device_perf + + +@pytest.mark.parametrize( + "batch_size, model_name, expected_perf", + [ + (1, "panoptic_deeplab", 12.8), + ], +) +@pytest.mark.models_device_performance_bare_metal +def test_perf_device_bare_metal_panoptic_deeplab(batch_size, model_name, expected_perf): + subdir = model_name + num_iterations = 1 + margin = 0.04 + + command = f"pytest models/experimental/panoptic_deeplab/tests/pcc/test_panoptic_deeplab.py" + + cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] + + inference_time_key = "AVG DEVICE KERNEL SAMPLES/S" + expected_perf_cols = {inference_time_key: expected_perf} + + post_processed_results = run_device_perf(command, subdir, num_iterations, cols, batch_size, has_signposts=True) + expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols) + prep_device_perf_report( + model_name=f"ttnn_functional_{model_name}_{batch_size}", + batch_size=batch_size, + post_processed_results=post_processed_results, + expected_results=expected_results, + comments="", + ) diff --git a/models/experimental/panoptic_deeplab/tt/aspp.py b/models/experimental/panoptic_deeplab/tt/aspp.py new file mode 100644 index 000000000000..94ac045563d2 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/aspp.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from models.experimental.panoptic_deeplab.tt.utils import ( + TTConv2D, + TTUpsample, + TTDepthwiseSeparableConv2D, + DepthwiseSeparableOptimizer, +) + + +aspp_optimisations = [ + DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 64, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "reshard_if_not_optimal": True, + }, + pointwise={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reshard_if_not_optimal": True, + }, + ), + DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 512, + "shard_layout": ttnn.TensorMemoryLayout.WIDTH_SHARDED, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + "reshard_if_not_optimal": True, + }, + pointwise={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reshard_if_not_optimal": True, + }, + ), + DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 512, + "shard_layout": ttnn.TensorMemoryLayout.WIDTH_SHARDED, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + "reshard_if_not_optimal": True, + }, + pointwise={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reshard_if_not_optimal": True, + }, + ), +] + + +class TTASPP: + def __init__(self, parameters, model_config) -> None: + self.model_config = model_config + + dilations = [6, 12, 18] + in_channels = 2048 + activation = ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU) + self.convs = [] + + # conv 1x1 + self.convs.append( + TTConv2D( + kernel_size=1, + stride=1, + padding=0, + groups=1, + parameters=parameters["convs"][0], + kernel_fidelity=model_config, + activation=activation, + act_block_h=32, + 
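+                # 1x1 projection branch; the three atrous 3x3 branches appended below
+                # pull their per-dilation configs from aspp_optimisations instead.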
memory_config=ttnn.DRAM_MEMORY_CONFIG, + shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + reshard_if_not_optimal=True, + ) + ) + # atrous convs + for index, dilation in enumerate(dilations): + self.convs.append( + TTDepthwiseSeparableConv2D( + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + groups=in_channels, + parameters=parameters["convs"][index + 1], + model_config=model_config, + activation=activation, + optimisations=aspp_optimisations[index], + ) + ) + + # image pooling + self.pooling_conv = TTConv2D( + kernel_size=1, + stride=1, + padding=0, + groups=1, + parameters=parameters["convs"][4][1], + kernel_fidelity=model_config, + activation=activation, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + shard_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED, + deallocate_activation=True, + reallocate_halo_output=True, + reshard_if_not_optimal=True, + ) + + # Upsample + self.upsample = TTUpsample( + scale_factor=(32, 64), + mode="bilinear", + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + ) + + # Projection conv + self.project = TTConv2D( + kernel_size=1, + stride=1, + padding=0, + groups=1, + parameters=parameters.project, + kernel_fidelity=model_config, + activation=activation, + shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + deallocate_activation=True, + reshard_if_not_optimal=True, + ) + + def __call__( + self, + x, + device, + ): + res = [] + for conv in self.convs: + out, _ = conv(device, x, (1, 32, 64, 2048)) + res.append(out) + + x = ttnn.reshape(x, [1, 1, x.shape[0] * x.shape[1] * x.shape[2], x.shape[-1]]) + out = ttnn.avg_pool2d( + input_tensor=x, + batch_size=1, + input_h=32, + input_w=64, + channels=2048, + kernel_size=(32, 64), + stride=(1, 1), + padding=(0, 0), + applied_shard_scheme=ttnn.TensorMemoryLayout.WIDTH_SHARDED, + in_place_halo=True, + deallocate_input=True, + reallocate_halo_output=True, + ) + ttnn.deallocate(x, force=True) + + out, shape = self.pooling_conv(device, out, (1, 1, 1, 2048)) + out = self.upsample(device, out, [1, 1, 1, 256], reshape_output=True, dtype=ttnn.bfloat8_b) + res.append(out) + + aspp_concat = ttnn.concat(res, dim=3) + for res_out in res: + ttnn.deallocate(res_out, force=True) + + shape = (1, 32, 64, 1280) + out, shape = self.project(device, aspp_concat, shape) + + return out diff --git a/models/experimental/panoptic_deeplab/tt/backbone.py b/models/experimental/panoptic_deeplab/tt/backbone.py new file mode 100644 index 000000000000..8d0318b36e18 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/backbone.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from typing import List, Optional +from models.experimental.panoptic_deeplab.tt.bottleneck import TTBottleneck, get_bottleneck_optimisation +from models.experimental.panoptic_deeplab.tt.stem import resnet52Stem, neck_optimisations + + +class TTBackbone: + def __init__(self, parameters, model_config, name="backbone"): + layers = [3, 4, 6, 3] + self.inplanes = 128 + # stem + self.stem = resnet52Stem( + parameters.stem, + stride=1, + model_config=model_config, + layer_optimisations=neck_optimisations, + ) + # Four bottleneck stages (res2, res3, res4, res5) + self.res2 = self._make_layer( + name=f"{name}.res2", + parameters=parameters.res2, + planes=64, + blocks=layers[0], + stride=1, + dilate_config=None, + model_config=model_config, + layer_optimisations=get_bottleneck_optimisation("res2"), + ) + self.res3 = self._make_layer( + name=f"{name}.res3", + parameters=parameters.res3, + planes=128, + blocks=layers[1], + stride=2, + dilate_config=None, + model_config=model_config, + layer_optimisations=get_bottleneck_optimisation("res3"), + ) + self.res4 = self._make_layer( + name=f"{name}.res4", + parameters=parameters.res4, + planes=256, + blocks=layers[2], + stride=2, + dilate_config=None, + model_config=model_config, + layer_optimisations=get_bottleneck_optimisation("res4"), + ) + self.res5 = self._make_layer( + name=f"{name}.res5", + parameters=parameters.res5, + planes=512, + blocks=layers[3], + stride=1, + dilate_config=[2, 4, 8], + model_config=model_config, + layer_optimisations=get_bottleneck_optimisation("res5"), + ) + + def _make_layer( + self, + name: str, + parameters, + planes: int, + blocks: int, + stride: int, + dilate_config: Optional[List[int]] = None, + model_config=None, + layer_optimisations=get_bottleneck_optimisation("default"), + ) -> List[TTBottleneck]: + if dilate_config is None: + dilate_config = [1] * blocks + layers = [] + layers.append( + TTBottleneck( + parameters=parameters[0], + downsample=stride != 1 or self.inplanes != planes * TTBottleneck.expansion, + stride=stride, + model_config=model_config, + dilation=dilate_config[0], + name=f"{name}.0", + layer_optimisations=layer_optimisations, + ) + ) + self.inplanes = planes * TTBottleneck.expansion + for block_num in range(1, blocks): + layers.append( + TTBottleneck( + parameters=parameters[block_num], + downsample=False, + stride=1, + model_config=model_config, + dilation=dilate_config[block_num], + name=f"{name}.{block_num}", + layer_optimisations=layer_optimisations, + ) + ) + return layers + + def __call__(self, x, device): + x = self.stem(x, device) + shape = x.shape + + for block in self.res2: + x, shape = block(x, device, shape) + res_2 = x + res_2 = ttnn.to_memory_config(res_2, ttnn.DRAM_MEMORY_CONFIG) + + for block in self.res3: + x, shape = block(x, device, shape) + res_3 = x + res_3 = ttnn.to_memory_config(res_3, ttnn.DRAM_MEMORY_CONFIG) + + for block in self.res4: + x, shape = block(x, device, shape) + + for block in self.res5: + x, shape = block(x, device, shape) + x = ttnn.to_memory_config(x, ttnn.DRAM_MEMORY_CONFIG) + res_5 = x + + return {"res_2": res_2, "res_3": res_3, "res_5": res_5} diff --git a/models/experimental/panoptic_deeplab/tt/bottleneck.py b/models/experimental/panoptic_deeplab/tt/bottleneck.py new file mode 100644 index 000000000000..9b39a62e81f3 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/bottleneck.py @@ -0,0 +1,264 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from models.experimental.panoptic_deeplab.tt.utils import TTConv2D +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class BottleneckOptimizer: + conv1: dict + conv2: dict + conv3: dict + downsample: dict + + +bottleneck_layer_optimisations = { + "default": BottleneckOptimizer( + conv1={"act_block_h": 32, "memory_config": ttnn.DRAM_MEMORY_CONFIG}, + conv2={ + "act_block_h": 32, + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + conv3={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + }, + downsample={ + "memory_config": None, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + ), + "res2": BottleneckOptimizer( + conv1={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "reshard_if_not_optimal": True, + }, + conv2={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + conv3={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + }, + downsample={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "slice_config": ttnn.Conv2dSliceConfig(slice_type=ttnn.Conv2dSliceHeight, num_slices=2), + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + ), + "res3": BottleneckOptimizer( + conv1={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "reshard_if_not_optimal": True, + }, + conv2={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + conv3={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + }, + downsample={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "slice_config": ttnn.Conv2dSliceConfig(slice_type=ttnn.Conv2dSliceHeight, num_slices=2), + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + ), + "res4": BottleneckOptimizer( + conv1={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "reshard_if_not_optimal": True, + }, + conv2={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + conv3={ + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + }, + downsample={ + "act_block_h": 32, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "reshard_if_not_optimal": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + ), + "res5": BottleneckOptimizer( + conv1={ + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "reshard_if_not_optimal": True, + "dtype": ttnn.bfloat16, + }, + conv2={ + "act_block_h": 512, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + 
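+            # Note: res5 conv1/conv2 override dtype to bfloat16; the rest of the
+            # backbone runs bfloat8_b activations per model_config.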
"deallocate_activation": True, + "reshard_if_not_optimal": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + "dtype": ttnn.bfloat16, + }, + conv3={ + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reshard_if_not_optimal": True, + }, + downsample={ + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "reshard_if_not_optimal": True, + "deallocate_activation": True, + }, + ), +} + + +def get_bottleneck_optimisation(layer_name): + for key in ["res2", "res3", "res4", "res5"]: + if key in layer_name: + return bottleneck_layer_optimisations[key] + return bottleneck_layer_optimisations["default"] + + +class TTBottleneck: + expansion: int = 4 + + def __init__( + self, + parameters, + downsample, + stride, + model_config, + dilation: int = 1, + name: Optional[str] = "bottleneck", + layer_optimisations=bottleneck_layer_optimisations["default"], + ) -> None: + self.name = name + self.layer_optimisations = layer_optimisations + self.conv1 = TTConv2D( + kernel_size=1, + stride=1, + padding=0, + dilation=1, + parameters=parameters.conv1, + kernel_fidelity=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + **layer_optimisations.conv1, + ) + self.conv2 = TTConv2D( + kernel_size=3, + stride=stride if downsample else 1, + padding=dilation, + dilation=dilation, + parameters=parameters.conv2, + kernel_fidelity=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + **layer_optimisations.conv2, + ) + self.conv3 = TTConv2D( + kernel_size=1, + stride=1, + padding=0, + dilation=1, + parameters=parameters.conv3, + kernel_fidelity=model_config, + activation=None, + **layer_optimisations.conv3, + ) + + self.downsample = downsample + if downsample: + self.downsample_conv = TTConv2D( + kernel_size=1, + stride=stride, + padding=0, + dilation=1, + parameters=parameters.shortcut, + kernel_fidelity=model_config, + activation=None, + **layer_optimisations.downsample, + ) + + self.model_config = model_config + return + + def __call__( + self, + x, + device, + in_shape, + ): + # Convert to DRAM interleaved for DRAM sliced conv's + if self.layer_optimisations.downsample.get("slice_config", False) or self.layer_optimisations.conv1.get( + "slice_config", False + ): + x = ttnn.to_memory_config(x, ttnn.DRAM_MEMORY_CONFIG) + + # conv1 is 1x1 conv + out, shape = self.conv1(device, x, in_shape) + + # FIXME: PCC drop when persistent L1 buffer is used + if self.downsample: + out = ttnn.to_memory_config(out, ttnn.DRAM_MEMORY_CONFIG) + + # conv2 is 3x3 conv + out, shape = self.conv2(device, out, shape) + + # conv3 is 1x1 conv + out, shape = self.conv3(device, out, shape) + + # run downsample conv 1x1 if required + if self.downsample: + ds_out, _ = self.downsample_conv(device, x, in_shape) + else: + ds_out = x + + if ds_out.shape != out.shape: + ds_out = ttnn.reshape(ds_out, (1, 1, ds_out.shape[0] * ds_out.shape[1] * ds_out.shape[2], ds_out.shape[3])) + if ds_out.layout != out.layout: + ds_out = ttnn.to_layout(ds_out, out.layout) + if ds_out.memory_config() != out.memory_config(): + ds_out = ttnn.to_memory_config(ds_out, out.memory_config()) + + out = ttnn.add_( + out, + ds_out, + activations=[ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU)], + ) + + ttnn.deallocate(ds_out) + return out, shape diff --git a/models/experimental/panoptic_deeplab/tt/custom_preprocessing.py b/models/experimental/panoptic_deeplab/tt/custom_preprocessing.py new file mode 100644 index 000000000000..e968fdba7607 
--- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/custom_preprocessing.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import torch + +from ttnn.model_preprocessing import convert_torch_model_to_ttnn_model, fold_batch_norm2d_into_conv2d + +from models.experimental.panoptic_deeplab.reference.decoder import DecoderModel +from models.experimental.panoptic_deeplab.reference.aspp import ASPPModel +from models.experimental.panoptic_deeplab.reference.head import HeadModel +from models.experimental.panoptic_deeplab.reference.res_block import ResModel +from models.experimental.panoptic_deeplab.reference.resnet52_stem import DeepLabStem +from models.experimental.panoptic_deeplab.reference.resnet52_bottleneck import Bottleneck +from models.experimental.panoptic_deeplab.reference.resnet52_backbone import ResNet52BackBone as TorchBackbone +from models.experimental.panoptic_deeplab.reference.panoptic_deeplab import TorchPanopticDeepLab +from models.experimental.panoptic_deeplab.reference.utils import Conv2d, DepthwiseSeparableConv2d + + +def preprocess_conv_parameter(parameter, *, dtype): + parameter = ttnn.from_torch(parameter, dtype=dtype, layout=ttnn.TILE_LAYOUT) + return parameter + + +def custom_preprocessor( + model, name, ttnn_module_args, convert_to_ttnn, custom_preprocessor_func=None, mesh_mapper=None +): + parameters = {} + if isinstance(model, Conv2d): + if model.norm is not None: + weight, bias = fold_batch_norm2d_into_conv2d(model, model.norm) + else: + weight = model.weight.clone().detach().contiguous() + bias = ( + model.bias.clone().detach().contiguous() if model.bias is not None else torch.zeros(model.out_channels) + ) + parameters["weight"] = ttnn.from_torch(weight, mesh_mapper=mesh_mapper) + parameters["bias"] = ttnn.from_torch(torch.reshape(bias, (1, 1, 1, -1)), mesh_mapper=mesh_mapper) + elif isinstance( + model, + ( + DepthwiseSeparableConv2d, + Bottleneck, + DeepLabStem, + TorchBackbone, + ASPPModel, + ResModel, + HeadModel, + DecoderModel, + TorchPanopticDeepLab, + ), + ): + # Let the sub-modules handle their own preprocessing + for child_name, child in model.named_children(): + parameters[child_name] = convert_torch_model_to_ttnn_model( + child, + name=f"{name}.{child_name}", + custom_preprocessor=custom_preprocessor_func, + convert_to_ttnn=convert_to_ttnn, + ttnn_module_args=ttnn_module_args, + ) + + return parameters + + +def create_custom_mesh_preprocessor(mesh_mapper=None): + def custom_mesh_preprocessor(model, name, ttnn_module_args, convert_to_ttnn): + return custom_preprocessor( + model, name, ttnn_module_args, convert_to_ttnn, custom_mesh_preprocessor, mesh_mapper + ) + + return custom_mesh_preprocessor diff --git a/models/experimental/panoptic_deeplab/tt/decoder.py b/models/experimental/panoptic_deeplab/tt/decoder.py new file mode 100644 index 000000000000..a4eeaf554009 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/decoder.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from dataclasses import dataclass + +from models.experimental.panoptic_deeplab.tt.aspp import TTASPP +from models.experimental.panoptic_deeplab.tt.head import TTHead +from models.experimental.panoptic_deeplab.tt.res_block import TTRes +from models.experimental.panoptic_deeplab.tt.res_block import res_layer_optimisations +from models.experimental.panoptic_deeplab.tt.head import head_layer_optimisations + + +@dataclass +class DecoderOptimizer: + res_layer_optimisations: dict + head_layer_optimisations: dict + shape: tuple + + +decoder_layer_optimisations = { + "default": DecoderOptimizer( + res_layer_optimisations=res_layer_optimisations["default"], + head_layer_optimisations=head_layer_optimisations["default"], + shape=(0, 0, 0, 0), + ), + "semantic_decoder": DecoderOptimizer( + res_layer_optimisations={ + "res3": res_layer_optimisations["semantic_decoder.res3"], + "res2": res_layer_optimisations["semantic_decoder.res2"], + }, + head_layer_optimisations={ + "head_1": head_layer_optimisations["semantic_decoder.head_1"], + }, + shape=(1, 128, 256, 256), + ), + "instance_decoder": DecoderOptimizer( + res_layer_optimisations={ + "res3": res_layer_optimisations["instance_decoder.res3"], + "res2": res_layer_optimisations["instance_decoder.res2"], + }, + head_layer_optimisations={ + "head_1": head_layer_optimisations["instance_decoder.head_1"], + "head_2": head_layer_optimisations["instance_decoder.head_2"], + }, + shape=(1, 128, 256, 128), + ), +} + + +class TTDecoder: + def __init__( + self, + parameters, + model_config, + layer_optimisations=decoder_layer_optimisations["default"], + name="semantic_decoder", + ) -> None: + super().__init__() + self.shape = layer_optimisations.shape + self.name = name + + self.aspp = TTASPP(parameters.aspp, model_config) + self.res3 = TTRes( + parameters.res3, + model_config, + layer_optimisations=layer_optimisations.res_layer_optimisations["res3"], + ) + self.res2 = TTRes( + parameters.res2, + model_config, + layer_optimisations=layer_optimisations.res_layer_optimisations["res2"], + ) + self.head = TTHead( + parameters.head_1, + model_config, + layer_optimisations=layer_optimisations.head_layer_optimisations["head_1"], + ) + if self.shape[-1] == 128: + self.head_2 = TTHead( + parameters.head_2, + model_config, + layer_optimisations=layer_optimisations.head_layer_optimisations["head_2"], + ) + if self.name == "semantic_decoder": + self.res3_upsample_channels = 256 + self.res2_upsample_channels = 256 + else: + self.res3_upsample_channels = 256 + self.res2_upsample_channels = 128 + + def __call__(self, x, res3, res2, upsample_channels, device): + out = self.aspp(x, device) + out = self.res3(out, res3, self.res3_upsample_channels, device) + out = self.res2(out, res2, self.res2_upsample_channels, device) + + if self.name == "instance_decoder": + activation_copy = ttnn.clone(out) + out = self.head(out, device) + + if self.name == "instance_decoder": + out_ = self.head_2(activation_copy, device) + else: + out_ = None + + return out, out_ diff --git a/models/experimental/panoptic_deeplab/tt/head.py b/models/experimental/panoptic_deeplab/tt/head.py new file mode 100644 index 000000000000..537ced69f410 --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/head.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from dataclasses import dataclass +from models.experimental.panoptic_deeplab.tt.utils import TTConv2D, TTUpsample + + +@dataclass +class HeadOptimizer: + conv1: dict + conv2: dict + predictor: dict + shape: tuple + + +head_layer_optimisations = { + "default": HeadOptimizer( + conv1={"act_block_h": 32, "memory_config": ttnn.DRAM_MEMORY_CONFIG}, + conv2={ + "act_block_h": 32, + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + predictor={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + }, + shape=(0, 0, 0, 0), + ), + "semantic_decoder.head_1": HeadOptimizer( + conv1={ + "act_block_h": 256, + "deallocate_activation": True, + "reallocate_halo_output": True, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + "reshard_if_not_optimal": True, + }, + conv2={ + "act_block_h": 32, + "deallocate_activation": True, + "reallocate_halo_output": True, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + }, + predictor={ + "act_block_h": 32, + "deallocate_activation": True, + "reallocate_halo_output": True, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + }, + shape=(1, 128, 256, 256), + ), + "instance_decoder.head_1": HeadOptimizer( + conv1={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + conv2={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + predictor={ + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + }, + shape=(1, 128, 256, 128), + ), + "instance_decoder.head_2": HeadOptimizer( + conv1={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + conv2={ + "act_block_h": 128, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + predictor={ + "act_block_h": 64, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "input_channels_alignment": 32, + "memory_config": ttnn.L1_MEMORY_CONFIG, + }, + shape=(1, 128, 256, 128), + ), +} + + +class TTHead: + def __init__( + self, + parameters, + model_config, + layer_optimisations=head_layer_optimisations["default"], + ) -> None: + self.layer_optimisations = layer_optimisations + # Conv1 + self.conv1 = TTConv2D( + kernel_size=parameters.conv_args["conv1"].kernel_size, + stride=parameters.conv_args["conv1"].stride, + padding=parameters.conv_args["conv1"].padding, + dilation=parameters.conv_args["conv1"].dilation, + groups=parameters.conv_args["conv1"].groups, + parameters=parameters.conv1, + kernel_fidelity=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + **layer_optimisations.conv1, + ) + # Conv2 + self.conv2 = TTConv2D( + kernel_size=parameters.conv_args["conv2"].kernel_size, + stride=parameters.conv_args["conv2"].stride, + padding=parameters.conv_args["conv2"].padding, + groups=parameters.conv_args["conv2"].groups, + parameters=parameters.conv2, + 
kernel_fidelity=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + **layer_optimisations.conv2, + ) + # Predictor + self.predictor = TTConv2D( + kernel_size=parameters.conv_args["predictor"].kernel_size, + stride=parameters.conv_args["predictor"].stride, + padding=parameters.conv_args["predictor"].padding, + groups=parameters.conv_args["predictor"].groups, + parameters=parameters.predictor, + kernel_fidelity=model_config, + **layer_optimisations.predictor, + ) + + # Upsample + self.upsample = TTUpsample( + scale_factor=(4), + mode="bilinear", + math_fidelity=ttnn.MathFidelity.HiFi2, + math_approx_mode=True, + fp32_dest_acc_en=False, + ) + + def __call__( + self, + x, + device, + ): + shape = self.layer_optimisations.shape + + out, shape = self.conv1(device, x, shape) + + out, shape = self.conv2(device, out, shape) + + out, shape = self.predictor(device, out, shape) + + out = self.upsample(device, out, shape, reshape_output=False, pad_ch_to_32=True, sent_to_dram=False) + out = ttnn.to_memory_config(out, ttnn.DRAM_MEMORY_CONFIG) + return out diff --git a/models/experimental/panoptic_deeplab/tt/panoptic_deeplab.py b/models/experimental/panoptic_deeplab/tt/panoptic_deeplab.py new file mode 100644 index 000000000000..570bd363483f --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/panoptic_deeplab.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn + +from models.experimental.panoptic_deeplab.tt.backbone import TTBackbone +from models.experimental.panoptic_deeplab.tt.decoder import TTDecoder, decoder_layer_optimisations + + +class TTPanopticDeepLab: + """ + TTNN implementation of Panoptic DeepLab using backbone and decoder architecture. + Combines backbone, semantic segmentation, and instance segmentation. + """ + + def __init__( + self, + parameters, + model_config, + ): + self.model_config = model_config + + # Initialize backbone + self.backbone = TTBackbone(parameters.backbone, model_config) + + # Initialize semantic segmentation decoder + self.semantic_decoder = TTDecoder( + parameters.semantic_decoder, + model_config, + layer_optimisations=decoder_layer_optimisations["semantic_decoder"], + name="semantic_decoder", + ) + + # Initialize instance segmentation decoder + self.instance_decoder = TTDecoder( + parameters.instance_decoder, + model_config, + layer_optimisations=decoder_layer_optimisations["instance_decoder"], + name="instance_decoder", + ) + + def __call__( + self, + x, + device, + ): + """ + Forward pass of TTNN Panoptic DeepLab. 
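+        Backbone features are computed once; res_5/res_3/res_2 are cloned up front
+        so the semantic and instance decoders can consume them independently.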
+ + Args: + x: Input tensor of shape [B, H, W, C] in TTNN format + device: TTNN device + + Returns: + semantic_logits: Semantic segmentation logits + instance_offset_head_logit: Instance segmentation logits - offset head + instance_center_head_logit: Instance segmentation logits - center head + """ + + # Extract features from backbone + features = self.backbone(x, device) + + # clone the specific feature maps the instance decoders expect + backbone_features = ttnn.clone(features["res_5"]) + res3_features = ttnn.clone(features["res_3"]) + res2_features = ttnn.clone(features["res_2"]) + + # Semantic segmentation branch + semantic_logit, _ = self.semantic_decoder( + features["res_5"], + features["res_3"], + features["res_2"], + upsample_channels=256, + device=device, + ) + + # Instance segmentation branch + instance_offset_head_logit, instance_center_head_logit = self.instance_decoder( + backbone_features, + res3_features, + res2_features, + upsample_channels=256, + device=device, + ) + + return semantic_logit, instance_offset_head_logit, instance_center_head_logit diff --git a/models/experimental/panoptic_deeplab/tt/res_block.py b/models/experimental/panoptic_deeplab/tt/res_block.py new file mode 100644 index 000000000000..0272b478feac --- /dev/null +++ b/models/experimental/panoptic_deeplab/tt/res_block.py @@ -0,0 +1,209 @@ +# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from typing import Dict, Any +from dataclasses import dataclass +from models.experimental.panoptic_deeplab.tt.utils import ( + TTConv2D, + TTUpsample, + TTDepthwiseSeparableConv2D, + DepthwiseSeparableOptimizer, +) + + +@dataclass +class ResOptimizer: + project_conv: Dict[Any, Any] + fuse_conv: DepthwiseSeparableOptimizer + shape: tuple + + +res_layer_optimisations = { + "default": ResOptimizer( + project_conv={"act_block_h": 32, "memory_config": ttnn.DRAM_MEMORY_CONFIG}, + fuse_conv=DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 32, + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + pointwise={ + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + }, + ), + shape=(0, 0, 0, 0), + ), + "instance_decoder.res3": ResOptimizer( + project_conv={ + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + }, + fuse_conv=DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 256, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + "reshard_if_not_optimal": True, + }, + pointwise={ + "act_block_h": 32, + "shard_layout": ttnn.TensorMemoryLayout.BLOCK_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + ), + shape=(1, 64, 128, 512), + ), + "instance_decoder.res2": ResOptimizer( + project_conv={ + "act_block_h": 128, + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + fuse_conv=DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 32, + "memory_config": ttnn.DRAM_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + pointwise={ + 
"memory_config": ttnn.DRAM_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + ), + shape=(1, 128, 256, 256), + ), + "semantic_decoder.res3": ResOptimizer( + project_conv={ + "act_block_h": 32, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + fuse_conv=DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 256, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + pointwise={ + "act_block_h": 32, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + ), + shape=(1, 64, 128, 512), + ), + "semantic_decoder.res2": ResOptimizer( + project_conv={ + "act_block_h": 32, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + fuse_conv=DepthwiseSeparableOptimizer( + depthwise={ + "act_block_h": 160, + "memory_config": ttnn.L1_MEMORY_CONFIG, + "deallocate_activation": True, + "reallocate_halo_output": True, + "enable_act_double_buffer": True, + "enable_weights_double_buffer": True, + }, + pointwise={ + "act_block_h": 32, + "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + "deallocate_activation": True, + "reallocate_halo_output": True, + }, + ), + shape=(1, 128, 256, 256), + ), +} + + +class TTRes: + def __init__( + self, + parameters, + model_config, + layer_optimisations=res_layer_optimisations["default"], + ) -> None: + # upsample + self.upsample = TTUpsample( + scale_factor=(2), + mode="bilinear", + memory_config=ttnn.DRAM_MEMORY_CONFIG, + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + ) + + # Project conv + self.project_conv = TTConv2D( + kernel_size=parameters.conv_args["project_conv"].kernel_size, + stride=parameters.conv_args["project_conv"].stride, + padding=parameters.conv_args["project_conv"].padding, + dilation=parameters.conv_args["project_conv"].dilation, + groups=parameters.conv_args["project_conv"].groups, + parameters=parameters.project_conv, + kernel_fidelity=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + **layer_optimisations.project_conv, + ) + + # Fuse conv + self.fuse_conv = TTDepthwiseSeparableConv2D( + kernel_size=parameters.conv_args["fuse_conv"]["depthwise"].kernel_size, + stride=parameters.conv_args["fuse_conv"]["depthwise"].stride, + padding=parameters.conv_args["fuse_conv"]["depthwise"].padding, + groups=parameters.conv_args["fuse_conv"]["depthwise"].groups, + dilation=1, + parameters=parameters.fuse_conv, + model_config=model_config, + activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU), + optimisations=layer_optimisations.fuse_conv, + ) + self.shape = layer_optimisations.shape + + def __call__( + self, + x, + res, + upsample_channels, + device, + ): + shape = [self.shape[-4], self.shape[-3] // 2, self.shape[-2] // 2, upsample_channels] + + out = self.upsample(device, x, shape, sent_to_dram=True, reshape_output=True) + + out_res, shape = self.project_conv(device, res, self.shape) + + out = ttnn.concat([out_res, out], dim=3) + + shape = (self.shape[-4], self.shape[-3], self.shape[-2], upsample_channels + 
diff --git a/models/experimental/panoptic_deeplab/tt/stem.py b/models/experimental/panoptic_deeplab/tt/stem.py
new file mode 100644
index 000000000000..4937d6bd5089
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/tt/stem.py
@@ -0,0 +1,115 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+import ttnn
+from dataclasses import dataclass
+from models.experimental.panoptic_deeplab.tt.utils import TTConv2D
+
+
+@dataclass
+class NeckOptimizer:
+    conv1: dict
+    conv2: dict
+    conv3: dict
+
+
+neck_optimisations = NeckOptimizer(
+    conv1={
+        "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        "deallocate_activation": True,
+        "reallocate_halo_output": True,
+        "reshard_if_not_optimal": True,
+        "enable_act_double_buffer": True,
+        "enable_weights_double_buffer": True,
+        "slice_config": ttnn.Conv2dSliceConfig(slice_type=ttnn.Conv2dSliceHeight, num_slices=2),
+        "dtype": ttnn.bfloat16,
+    },
+    conv2={
+        "act_block_h": 512,
+        "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        "deallocate_activation": True,
+        "reallocate_halo_output": True,
+        "reshard_if_not_optimal": True,
+        "enable_act_double_buffer": True,
+        "enable_weights_double_buffer": True,
+        "dtype": ttnn.bfloat16,
+    },
+    conv3={
+        "act_block_h": 128,
+        "shard_layout": ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        "memory_config": ttnn.DRAM_MEMORY_CONFIG,
+        "deallocate_activation": True,
+        "reallocate_halo_output": True,
+        "reshard_if_not_optimal": True,
+        "enable_act_double_buffer": True,
+        "enable_weights_double_buffer": True,
+        "dtype": ttnn.bfloat16,
+    },
+)
+
+
+class resnet52Stem:
+    def __init__(
+        self,
+        parameters,
+        stride,
+        model_config,
+        layer_optimisations=neck_optimisations,
+    ) -> None:
+        self.conv1 = TTConv2D(
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            parameters=parameters.conv1,
+            kernel_fidelity=model_config,
+            activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU),
+            **layer_optimisations.conv1,
+        )
+        self.conv2 = TTConv2D(
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU),
+            parameters=parameters.conv2,
+            kernel_fidelity=model_config,
+            **layer_optimisations.conv2,
+        )
+        self.conv3 = TTConv2D(
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU),
+            parameters=parameters.conv3,
+            kernel_fidelity=model_config,
+            **layer_optimisations.conv3,
+        )
+
+    def __call__(
+        self,
+        x,
+        device,
+    ):
+        # conv1: 3x3 conv with stride 2 (halves H and W).
+        out, shape = self.conv1(device, x, x.shape)
+
+        # conv2 and conv3: 3x3 convs with the configured `stride` (1 in the
+        # standard ResNet stem).
+        out, shape = self.conv2(device, out, shape)
+        out, shape = self.conv3(device, out, shape)
+
+        # 3x3 max pool with stride 2 halves H and W again.
+        out = ttnn.max_pool2d(
+            input_tensor=out,
+            batch_size=shape[-4],
+            input_h=shape[-3],
+            input_w=shape[-2],
+            channels=shape[-1],
+            kernel_size=[3, 3],
+            stride=[2, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            in_place_halo=True,
+            applied_shard_scheme=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+            ceil_mode=False,
+        )
+        out = ttnn.reshape(out, (shape[-4], shape[-3] // 2, shape[-2] // 2, shape[-1]))
+
+        return out
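+
+
+# Usage sketch (illustrative only; `stem_params` and `model_config` are
+# assumed to come from the model's weight-preprocessing code):
+#
+#   stem = resnet52Stem(parameters=stem_params, stride=1, model_config=model_config)
+#   features = stem(input_nhwc, device)  # with stride=1, H and W shrink 4x overall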
diff --git a/models/experimental/panoptic_deeplab/tt/utils.py b/models/experimental/panoptic_deeplab/tt/utils.py
new file mode 100644
index 000000000000..a8fff3194df6
--- /dev/null
+++ b/models/experimental/panoptic_deeplab/tt/utils.py
@@ -0,0 +1,290 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+import ttnn
+from dataclasses import dataclass
+from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores
+
+
+# ---------------------------
+# TTNN utility modules
+# ---------------------------
+
+
+@dataclass
+class DepthwiseSeparableOptimizer:
+    depthwise: dict
+    pointwise: dict
+
+
+class TTConv2D:
+    def __init__(
+        self,
+        kernel_size: int = 1,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        parameters: dict | None = None,
+        kernel_fidelity: dict | None = None,
+        *,
+        memory_config=None,
+        act_block_h=None,
+        act_block_w=None,
+        deallocate_activation=False,
+        reallocate_halo_output=False,
+        shard_layout=None,
+        activation=None,
+        groups=1,
+        num_cores_nhw=None,
+        is_reshape=False,
+        enable_act_double_buffer=False,
+        enable_weights_double_buffer=False,
+        fp32_dest_acc_en=False,
+        packer_l1_acc=False,
+        math_approx_mode=False,
+        input_channels_alignment=32,
+        reshard_if_not_optimal=False,
+        slice_config=None,
+        dtype=None,
+        weights_dtype=None,
+        math_fidelity=None,
+    ) -> None:
+        # Normalise int shorthands into the tuple forms ttnn.conv2d expects.
+        if isinstance(kernel_size, int):
+            self.kernel_size = (kernel_size, kernel_size)
+        elif isinstance(kernel_size, tuple):
+            self.kernel_size = kernel_size
+        else:
+            raise ValueError("Invalid kernel_size: expected int or tuple")
+        if isinstance(stride, int):
+            self.stride = (stride, stride)
+        elif isinstance(stride, tuple):
+            self.stride = stride
+        else:
+            raise ValueError("Invalid stride: expected int or tuple")
+        if isinstance(padding, int):
+            self.padding = (padding, padding, padding, padding)
+        elif isinstance(padding, tuple):
+            self.padding = padding
+        else:
+            raise ValueError("Invalid padding: expected int or tuple")
+        if isinstance(dilation, int):
+            self.dilation = (dilation, dilation)
+        elif isinstance(dilation, tuple):
+            self.dilation = dilation
+        else:
+            raise ValueError("Invalid dilation: expected int or tuple")
+
+        self.kernel_fidelity = kernel_fidelity
+        self.weights = parameters["weight"]
+        self.bias = parameters["bias"]
+        self.deallocate_activation = deallocate_activation
+        self.reallocate_halo_output = reallocate_halo_output
+        self.fp32_dest_acc_en = fp32_dest_acc_en
+        self.packer_l1_acc = packer_l1_acc
+        self.math_approx_mode = math_approx_mode
+        self.input_channels_alignment = input_channels_alignment
+        self.reshard_if_not_optimal = reshard_if_not_optimal
+        self.out_channels = self.weights.shape[0]
+        self.act_block_h = act_block_h
+        self.act_block_w = act_block_w
+        self.groups = groups
+        self.activation = activation
+        self.memory_config = memory_config
+        self.shard_layout = shard_layout
+        self.slice_config = slice_config
+        self.num_cores_nhw = num_cores_nhw
+        self.is_reshape = is_reshape
+        self.enable_act_double_buffer = enable_act_double_buffer
+        self.enable_weights_double_buffer = enable_weights_double_buffer
+        # Per-layer overrides win; otherwise fall back to the model-wide config.
+        if dtype is not None:
+            self.dtype = dtype
+        else:
+            self.dtype = self.kernel_fidelity["ACTIVATIONS_DTYPE"]
+        if weights_dtype is not None:
+            self.weights_dtype = weights_dtype
+        else:
+            self.weights_dtype = self.kernel_fidelity["WEIGHTS_DTYPE"]
+        if math_fidelity is not None:
+            self.math_fidelity = math_fidelity
+        else:
+            self.math_fidelity = self.kernel_fidelity["MATH_FIDELITY"]
+
+    def __call__(self, device, input_tensor, input_shape):
+        conv_config = ttnn.Conv2dConfig(
+            weights_dtype=self.weights_dtype,
+            activation=self.activation,
+            deallocate_activation=self.deallocate_activation,
+            reallocate_halo_output=self.reallocate_halo_output,
+            reshard_if_not_optimal=self.reshard_if_not_optimal,
+            shard_layout=self.shard_layout,
+            enable_act_double_buffer=self.enable_act_double_buffer,
+            enable_weights_double_buffer=self.enable_weights_double_buffer,
+            in_place=True,
+        )
+        # Use self.math_fidelity so a per-layer override takes effect
+        # (it already falls back to kernel_fidelity["MATH_FIDELITY"]).
+        compute_config = ttnn.init_device_compute_kernel_config(
+            device.arch(),
+            math_fidelity=self.math_fidelity,
+            fp32_dest_acc_en=self.fp32_dest_acc_en,
+            packer_l1_acc=self.packer_l1_acc,
+            math_approx_mode=self.math_approx_mode,
+        )
+        if self.num_cores_nhw is not None:
+            shard_grid = get_shard_grid_from_num_cores(self.num_cores_nhw, device)
+            conv_config.core_grid = shard_grid
+            conv_config.override_sharding_config = True
+
+        if self.act_block_h is not None:
+            conv_config.act_block_h_override = self.act_block_h
+        if self.act_block_w is not None:
+            conv_config.act_block_w_div = self.act_block_w
+
+        [output_tensor, [_out_height, _out_width], [self.weights, self.bias]] = ttnn.conv2d(
+            input_tensor=input_tensor,
+            weight_tensor=self.weights,
+            bias_tensor=self.bias,
+            in_channels=input_shape[-1],
+            out_channels=self.out_channels,
+            device=device,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            batch_size=input_shape[-4],
+            input_height=input_shape[-3],
+            input_width=input_shape[-2],
+            conv_config=conv_config,
+            compute_config=compute_config,
+            slice_config=self.slice_config,
+            groups=self.groups,
+            return_weights_and_bias=True,
+            return_output_dim=True,
+            dtype=self.dtype,
+            memory_config=self.memory_config,
+        )
+
+        if self.is_reshape:
+            output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG)
+            output_tensor = ttnn.to_layout(output_tensor, ttnn.TILE_LAYOUT)
+            output_tensor = ttnn.reshape(
+                output_tensor, (input_tensor.shape[0], _out_height, _out_width, output_tensor.shape[-1])
+            )
+            output_tensor = ttnn.permute(output_tensor, (0, 3, 1, 2))
+        # Report channels via self.out_channels: after the optional permute above,
+        # output_tensor.shape[-1] would be the width, not the channel count.
+        return output_tensor, (input_tensor.shape[0], _out_height, _out_width, self.out_channels)
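+
+
+# Usage sketch (illustrative only; `params` must hold preprocessed ttnn
+# "weight"/"bias" tensors and `model_config` the dtype/fidelity defaults):
+#
+#   conv = TTConv2D(
+#       kernel_size=3,
+#       stride=1,
+#       padding=1,
+#       parameters=params,
+#       kernel_fidelity=model_config,
+#       shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+#   )
+#   out, out_shape = conv(device, x, x.shape)  # out_shape is (N, H, W, C)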
+
+
+class TTDepthwiseSeparableConv2D:
+    def __init__(
+        self,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        groups,
+        parameters,
+        model_config,
+        activation,
+        optimisations,
+    ):
+        # Depthwise (grouped) KxK conv: spatial mixing only.
+        self.depthwise = TTConv2D(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            parameters=parameters.depthwise,
+            kernel_fidelity=model_config,
+            activation=activation,
+            **optimisations.depthwise,
+        )
+
+        # Pointwise 1x1 conv: channel mixing only.
+        self.pointwise = TTConv2D(
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            parameters=parameters.pointwise,
+            kernel_fidelity=model_config,
+            activation=activation,
+            **optimisations.pointwise,
+        )
+
+    def __call__(self, device, x, in_shape):
+        out, shape = self.depthwise(device, x, in_shape)
+        out, shape = self.pointwise(device, out, shape)
+        return out, shape
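+
+
+# Usage sketch (illustrative only; mirrors how the decoder builds its fuse
+# convs; `params` and the optimiser dicts here are assumptions):
+#
+#   dsconv = TTDepthwiseSeparableConv2D(
+#       kernel_size=5, stride=1, padding=2, dilation=1, groups=in_channels,
+#       parameters=params.fuse_conv,
+#       model_config=model_config,
+#       activation=ttnn.UnaryWithParam(ttnn.UnaryOpType.RELU),
+#       optimisations=DepthwiseSeparableOptimizer(depthwise={}, pointwise={}),
+#   )
+#   out, shape = dsconv(device, x, in_shape)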
+
+
+class TTUpsample:
+    def __init__(
+        self,
+        scale_factor: int = 1,
+        mode: str = "bilinear",
+        memory_config=ttnn.L1_MEMORY_CONFIG,
+        math_fidelity=ttnn.MathFidelity.LoFi,
+        math_approx_mode=True,
+        fp32_dest_acc_en=False,
+    ) -> None:
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.memory_config = memory_config
+
+        self.compute_kernel_config = ttnn.WormholeComputeKernelConfig(
+            math_fidelity=math_fidelity,
+            math_approx_mode=math_approx_mode,
+            fp32_dest_acc_en=fp32_dest_acc_en,
+        )
+
+    def __call__(
+        self,
+        device,
+        input_tensor,
+        input_shape=None,
+        reshape_output=False,
+        pad_ch_to_32=False,
+        sent_to_dram=False,
+        dtype=ttnn.bfloat8_b,
+    ):
+        # Convert a sharded tensor (distributed across cores) into a single
+        # interleaved tensor, choosing the backing memory:
+        #   - DRAM: use when tensors are large or when later ops expect DRAM residency.
+        #   - L1:   fastest on-chip memory; use when the tensor fits and you'll run
+        #           compute-heavy kernels immediately after.
+        if sent_to_dram:
+            input_tensor = ttnn.sharded_to_interleaved(input_tensor, ttnn.DRAM_MEMORY_CONFIG)
+        else:
+            input_tensor = ttnn.sharded_to_interleaved(input_tensor, ttnn.L1_MEMORY_CONFIG)
+
+        input_tensor = ttnn.to_layout(input_tensor, ttnn.ROW_MAJOR_LAYOUT)
+        input_tensor = ttnn.reshape(input_tensor, input_shape)
+
+        # Optionally pad channels up to a multiple of 32 to satisfy TT tile/channel
+        # alignment. (-C) % 32 is 0 when C is already aligned, so no spurious
+        # padding is added in that case.
+        pad_c = (-input_tensor.shape[-1]) % 32 if pad_ch_to_32 else 0
+        if pad_c > 0:
+            input_tensor = ttnn.pad(input_tensor, [(0, 0), (0, 0), (0, 0), (0, pad_c)], 0)
+
+        output_tensor = ttnn.upsample(
+            input_tensor,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            memory_config=self.memory_config,
+            compute_kernel_config=self.compute_kernel_config,
+        )
+
+        # Remove the channel padding if it was added.
+        if pad_c > 0:
+            output_tensor = ttnn.slice(
+                output_tensor,
+                [0, 0, 0, 0],
+                [output_tensor.shape[0], output_tensor.shape[1], output_tensor.shape[2], input_shape[-1]],
+            )
+
+        # Optionally flatten NHWC -> (1, 1, N*H*W, C) on host and cast, matching
+        # the activation layout the conv wrappers expect.
+        if reshape_output:
+            host = ttnn.from_device(output_tensor)
+            host = ttnn.to_dtype(host, dtype)
+            B, H, W, C = host.shape
+            host = ttnn.reshape(host, [1, 1, B * H * W, C])
+            output_tensor = ttnn.to_device(host, device)
+
+        return output_tensor
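+
+
+# Usage sketch (illustrative only): 2x bilinear upsample of an NHWC feature
+# map, as used by the TTRes decoder block. The input shape below is an
+# assumption for illustration.
+#
+#   upsample = TTUpsample(scale_factor=2, mode="bilinear", memory_config=ttnn.DRAM_MEMORY_CONFIG)
+#   out = upsample(device, x, input_shape=(1, 64, 128, 512), sent_to_dram=True)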