
Commit 68eb32f

Jyc323leoll2 and Leonardo Lai authored
add Tree-Path KL Divergence loss for hier classification + unit test (#4706)
Co-authored-by: Leonardo Lai <[email protected]>
1 parent 7c9705a commit 68eb32f

File tree

11 files changed: +746 −3 lines changed

library/docs/source/guide/tutorials/advanced/index.rst

Lines changed: 1 addition & 0 deletions
@@ -10,5 +10,6 @@ Advanced Tutorials
    peft
    torch_compile
    hier_metric_collection
+   tree_path_kl_loss_hcls

 .. Once we have enough material, we might need to categorize these into `data`, `model learning` sections.
library/docs/source/guide/tutorials/advanced/tree_path_kl_loss_hcls.rst

Lines changed: 170 additions & 0 deletions

@@ -0,0 +1,170 @@
Using Tree-Path KL Divergence for Hierarchical Classification
=============================================================

This tutorial explains how to train hierarchical classification models in
OpenVINO™ Training Extensions with **Tree-Path KL Divergence Loss**, a training-time
regularizer that encourages consistent predictions along the taxonomy path
from root to leaf. The method is implemented in:

- :class:`otx.backend.native.models.classification.losses.tree_path_kl_divergence_loss.TreePathKLDivergenceLoss`
- :class:`otx.backend.native.models.classification.classifier.h_label_classifier.KLHLabelClassifier`

The feature is currently exposed by default in
:class:`otx.backend.native.models.classification.hlabel_models.timm_model.TimmModelHLabelCls`.
Users may adapt other architectures with minimal modifications by adding the
same wrapper (``KLHLabelClassifier``) in their model's ``_finalize_model()``.

Overview
--------

Hierarchical classification models predict multiple levels of labels
(e.g., manufacturer → family → variant). Standard cross-entropy treats each
level independently, which means models may output **inconsistent**
combinations such as:

- predicting a correct fine-grained leaf but an incompatible ancestor, or
- predicting parents and children belonging to different branches.

Tree-Path KL Divergence introduces a path-consistency objective by comparing:

- the model's *combined* probability distribution across all levels, and
- a **tree-consistent target distribution** that places probability mass on
  each ground-truth category along the path.

For a three-level path (e.g., Boeing → 747 → 747-400), the target distribution
assigns probability 1/3 to each ground-truth class along the path.

This encourages smooth transitions between hierarchy levels and reduces
structurally invalid predictions.

How It Works
------------

Tree-Path KL Divergence operates on:

- a **list of logits** from each hierarchy level (root → ... → leaf), and
- a **target index** for each corresponding level.

The algorithm implemented in
:class:`TreePathKLDivergenceLoss` performs the following (sketched in code
after this list):

1. Concatenates all level logits and applies log-softmax.
2. Constructs a sparse target distribution that allocates equal probability to
   the correct class at each level.
3. Computes KL divergence between the model's distribution and the path-aware
   target distribution.
4. Scales the result by ``loss_weight`` (typically ``1.0``).

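A condensed, self-contained sketch of these four steps (illustrative only: the
function below is not the OTX implementation, and its name and shapes are
chosen for exposition):

.. code-block:: python

   import torch
   import torch.nn.functional as F

   def tree_path_kl(logits_list, targets, loss_weight=1.0):
       """logits_list: one (N, C_l) tensor per level; targets: (N, L) class indices."""
       num_levels = len(logits_list)
       # 1. Concatenate all level logits and apply log-softmax jointly.
       log_probs = F.log_softmax(torch.cat(logits_list, dim=1), dim=1)
       # 2. Sparse target: equal mass 1/L on the correct class of each level.
       target = torch.zeros_like(log_probs)
       offset = 0
       for lvl, level_logits in enumerate(logits_list):
           rows = torch.arange(target.size(0))
           target[rows, offset + targets[:, lvl]] = 1.0 / num_levels
           offset += level_logits.size(1)
       # 3. KL divergence between the path-aware target and the model distribution.
       kl = F.kl_div(log_probs, target, reduction="batchmean")
       # 4. Scale by loss_weight.
       return loss_weight * kl
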
In :class:`KLHLabelClassifier`, this KL term is added to the hierarchical
cross-entropy loss (written out after this list):

- cross-entropy is averaged across all hierarchy levels,
- KL divergence is multiplied by ``kl_weight``,
- ``kl_weight = 0`` disables the KL term completely.

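In symbols (notation ours, not taken from the code): with :math:`H` hierarchy
levels, per-level logits :math:`z_h`, per-level targets :math:`y_h`, and the
tree-consistent target distribution :math:`T` described above,

.. math::

   \mathcal{L} = \frac{1}{H}\sum_{h=1}^{H} \mathrm{CE}(z_h, y_h)
   + w_{\mathrm{KL}} \cdot \mathrm{KL}\!\left(T \,\|\, \mathrm{softmax}([z_1; \dots; z_H])\right),

where :math:`w_{\mathrm{KL}}` is ``kl_weight``.
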
Enabling Tree-Path KL Divergence
--------------------------------

The recommended entry point is the provided recipe:

.. code-block:: text

   recipe/classification/h_label_cls/efficientnet_v2_kl.yaml

This recipe uses :class:`TimmModelHLabelCls` and exposes the argument
``kl_weight`` directly in ``init_args``:

.. code-block:: yaml

   task: H_LABEL_CLS
   model:
     class_path: otx.backend.native.models.classification.hlabel_models.timm_model.TimmModelHLabelCls
     init_args:
       label_info: <LABEL-TREE-INFO>
       model_name: tf_efficientnetv2_s.in21k
       kl_weight: 1.0

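The same argument can also be passed through the Python API. A minimal sketch,
assuming ``label_info`` and ``data_input_params`` are prepared as for any other
OTX h-label model (the constructor arguments mirror the recipe above):

.. code-block:: python

   from otx.backend.native.models.classification.hlabel_models.timm_model import TimmModelHLabelCls

   model = TimmModelHLabelCls(
       label_info=label_info,                # HLabelInfo describing the taxonomy
       data_input_params=data_input_params,  # input/preprocessing configuration
       model_name="tf_efficientnetv2_s.in21k",
       kl_weight=1.0,                        # > 0 enables the KL term
   )
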
Using the CLI
-------------

To train a hierarchical model with Tree-Path KL Divergence, the CLI requires:

- ``--data_root``: a path to a directory containing an ``annotations/`` folder
  whose JSON annotation files follow **Datumaro format**.
  See the format specification here:

  https://open-edge-platform.github.io/datumaro/stable/docs/data-formats/datumaro_format.html

- ``--config``: the **path to a recipe YAML file**, such as
  ``recipe/classification/h_label_cls/efficientnet_v2_kl.yaml``.

A full training command example:

.. code-block:: bash

   (otx) $ otx train \
       --config recipe/classification/h_label_cls/efficientnet_v2_kl.yaml \
       --data_root /path/to/dataset_with_annotations \
       --model.kl_weight 1.0

To disable Tree-Path KL Divergence and train a standard hierarchical model:

.. code-block:: bash

   (otx) $ otx train \
       --config recipe/classification/h_label_cls/efficientnet_v2_kl.yaml \
       --data_root /path/to/dataset_with_annotations \
       --model.kl_weight 0.0

Extending Other Architectures
-----------------------------

Currently, Tree-Path KL Divergence is automatically supported only by
``TimmModelHLabelCls``. To integrate the feature into other architectures, add
the following logic to the model's ``_finalize_model`` method:

1. Accept a new ``kl_weight`` argument in the model init.
2. After constructing the underlying model, wrap it as:

   .. code-block:: python

      if self.kl_weight > 0:
          model = KLHLabelClassifier(model, kl_weight=self.kl_weight)

3. Ensure that the model returns a list of logits aligned with the hierarchy.

Only a few lines are required, and this enables the same training procedure
for any backbone (ResNet, ViT, ConvNeXt, etc.); the base class's own
``_finalize_model``, shown below, follows the same pattern.

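For reference, this commit's base ``OTXHlabelClsModel._finalize_model`` (see the
``base.py`` diff further down) rebuilds the classifier from its parts rather
than wrapping the whole model:

.. code-block:: python

   def _finalize_model(self, model: nn.Module) -> nn.Module:
       """Run after the child's _create_model(); upgrade to KL if enabled."""
       if self.kl_weight > 0:
           return KLHLabelClassifier(
               backbone=model.backbone,
               neck=model.neck,
               head=model.head,
               multiclass_loss=model.multiclass_loss,
               multilabel_loss=model.multilabel_loss,
               init_cfg=getattr(model, "init_cfg", None),
               kl_weight=self.kl_weight,
           )
       return model
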
When to Use Tree-Path KL Divergence
-----------------------------------

Tree-Path KL Divergence is most helpful when:

- the label space forms a strict taxonomy,
- incorrect parent/child combinations are undesirable,
- fine-grained classes are scarce and benefit from structural priors,
- you want improved consistency across hierarchy levels.

Practically:

- start with ``kl_weight = 1.0`` or ``2.0`` for most datasets,
- monitor both fine-grained and coarse-level accuracy,
- adjust ``kl_weight`` based on the trade-off between accuracy and
  hierarchical consistency.

Practical Tips
--------------

- Ensure that ``label_info`` correctly describes the hierarchy.
- Excessively large ``kl_weight`` values may over-regularize the model.
- For benchmarking, compare:

  - ``kl_weight = 0`` (baseline),
  - ``kl_weight = 1–4`` (KL-enabled variants).

- Tree-Path KL acts as a *training-time* consistency constraint; it does not
  modify architecture or inference cost.

Limitations
-----------

- Supported out-of-the-box only for :class:`TimmModelHLabelCls`.
- Requires the model to output logits for **each level** of the hierarchy.
- Not applicable to flat classification tasks.

library/src/otx/backend/native/models/classification/classifier/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -4,6 +4,6 @@
 """Head modules for OTX custom model."""

 from .base_classifier import ImageClassifier
-from .h_label_classifier import HLabelClassifier
+from .h_label_classifier import HLabelClassifier, KLHLabelClassifier

-__all__ = ["HLabelClassifier", "ImageClassifier"]
+__all__ = ["HLabelClassifier", "ImageClassifier", "KLHLabelClassifier"]

library/src/otx/backend/native/models/classification/classifier/h_label_classifier.py

Lines changed: 85 additions & 0 deletions
@@ -11,6 +11,7 @@
 import torch

 from otx.backend.native.models.classification.heads.hlabel_cls_head import HierarchicalClsHead
+from otx.backend.native.models.classification.losses.tree_path_kl_divergence_loss import TreePathKLDivergenceLoss
 from otx.backend.native.models.classification.utils.ignored_labels import get_valid_label_mask

 from .base_classifier import ImageClassifier
@@ -143,3 +144,87 @@ def _forward_explain(self, images: torch.Tensor) -> dict[str, torch.Tensor | lis
         outputs["preds"] = preds

         return outputs
+
+
+class KLHLabelClassifier(HLabelClassifier):
+    """Hierarchical label classifier with tree path KL divergence loss.
+
+    Args:
+        backbone (nn.Module): Backbone network.
+        neck (nn.Module | None): Neck network.
+        head (nn.Module): Head network.
+        multiclass_loss (nn.Module): Multiclass loss function.
+        multilabel_loss (nn.Module | None, optional): Multilabel loss function.
+        init_cfg (dict | list[dict] | None, optional): Initialization configuration.
+        kl_weight (float): Loss weight for the tree path KL divergence loss.
+
+    Attributes:
+        multiclass_loss (nn.Module): Multiclass loss function.
+        multilabel_loss (nn.Module | None): Multilabel loss function.
+        is_ignored_label_loss (bool): Flag indicating if ignored label loss is used.
+
+    Methods:
+        loss(inputs, labels, **kwargs): Calculate losses from a batch of inputs and data samples.
+    """
+
+    def __init__(self, *args, kl_weight: float = 1.0, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.kl_weight = kl_weight
+        self.kl_loss = TreePathKLDivergenceLoss(reduction="batchmean", loss_weight=1.0)
+
+    def loss(self, inputs: torch.Tensor, labels: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (torch.Tensor): The input tensor with shape
+                (N, C, ...) in general.
+            labels (torch.Tensor): The annotation data of
+                every sample.
+
+        Returns:
+            torch.Tensor: loss components
+        """
+        cls_scores = self.extract_feat(inputs, stage="head")
+        loss_score = torch.tensor(0.0, device=cls_scores.device)
+        logits_list = []
+        target_list = []
+        num_effective_heads_in_batch = 0
+        for i in range(self.head.num_multiclass_heads):
+            if i not in self.head.empty_multiclass_head_indices:
+                head_gt = labels[:, i]
+                logit_range = self.head._get_head_idx_to_logits_range(i)  # noqa: SLF001
+                head_logits = cls_scores[:, logit_range[0] : logit_range[1]]
+                valid_mask = head_gt >= 0
+                head_gt = head_gt[valid_mask]
+                if len(head_gt) > 0:
+                    head_logits = head_logits[valid_mask]
+                    logits_list.append(head_logits)
+                    target_list.append(head_gt)
+                    ce = self.multiclass_loss(head_logits, head_gt)
+                    loss_score += ce
+                    num_effective_heads_in_batch += 1
+
+        if num_effective_heads_in_batch > 0:
+            loss_score /= num_effective_heads_in_batch
+
+        if len(logits_list) > 1:
+            kl_loss = self.kl_loss(logits_list, torch.stack(target_list, dim=1))
+            loss_score += self.kl_weight * kl_loss
+
+        # Multilabel logic (preserved as-is)
+        if self.head.num_multilabel_classes > 0:
+            head_gt = labels[:, self.head.num_multiclass_heads :]
+            head_logits = cls_scores[:, self.head.num_single_label_classes :]
+            valid_mask = head_gt > 0
+            head_gt = head_gt[valid_mask]
+            if len(head_gt) > 0 and self.multilabel_loss is not None:
+                head_logits = head_logits[valid_mask]
+                imgs_info = kwargs.pop("imgs_info", None)
+                if imgs_info is not None and self.is_ignored_label_loss:
+                    valid_label_mask = get_valid_label_mask(imgs_info, self.head.num_classes).to(head_logits.device)
+                    valid_label_mask = valid_label_mask[:, self.head.num_single_label_classes :]
+                    valid_label_mask = valid_label_mask[valid_mask]
+                    kwargs["valid_label_mask"] = valid_label_mask
+                loss_score += self.multilabel_loss(head_logits, head_gt, **kwargs)
+
+        return loss_score
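
A quick way to exercise the new loss in isolation (a sketch; it assumes only the
import above and the call signature used in KLHLabelClassifier.loss(), i.e. a
list of per-level logits plus a stacked (N, L) target tensor; shapes are
illustrative):

    import torch
    from otx.backend.native.models.classification.losses.tree_path_kl_divergence_loss import TreePathKLDivergenceLoss

    loss_fn = TreePathKLDivergenceLoss(reduction="batchmean", loss_weight=1.0)
    # Two hierarchy levels with 4 and 10 classes, batch of 8.
    logits_list = [torch.randn(8, 4), torch.randn(8, 10)]
    targets = torch.stack([torch.randint(0, 4, (8,)), torch.randint(0, 10, (8,))], dim=1)
    kl = loss_fn(logits_list, targets)  # expected: non-negative scalar tensor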

library/src/otx/backend/native/models/classification/hlabel_models/base.py

Lines changed: 36 additions & 1 deletion
@@ -7,6 +7,7 @@
 from abc import abstractmethod
 from copy import deepcopy
+from functools import wraps
 from typing import TYPE_CHECKING, Any

 import torch
@@ -15,6 +16,7 @@
 from otx.backend.native.exporter.base import OTXModelExporter
 from otx.backend.native.exporter.native import OTXNativeModelExporter
 from otx.backend.native.models.base import DataInputParams, DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel
+from otx.backend.native.models.classification.classifier import KLHLabelClassifier
 from otx.backend.native.schedulers import LRSchedulerListCallable
 from otx.data.entity.base import OTXBatchLossEntity
 from otx.data.entity.torch import OTXDataBatch, OTXPredBatch
@@ -46,6 +48,7 @@ class OTXHlabelClsModel(OTXModel):
             Defaults to DefaultSchedulerCallable.
         metric (MetricCallable, optional): Callable for the metric. Defaults to HLabelClsMetricCallable.
         torch_compile (bool, optional): Flag to indicate whether to use torch.compile. Defaults to False.
+        kl_weight (float, optional): The weight of the tree-path KL divergence loss. Defaults to zero (cross-entropy only).
     """

     label_info: HLabelInfo
@@ -60,7 +63,9 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallable,
         torch_compile: bool = False,
+        kl_weight: float = 0.0,
     ) -> None:
+        self.kl_weight = kl_weight
         super().__init__(
             label_info=label_info,
             data_input_params=data_input_params,
@@ -71,16 +76,46 @@
             metric=metric,
             torch_compile=torch_compile,
         )
-
         if freeze_backbone:
             classification_layers = self._identify_classification_layers()
             for name, param in self.named_parameters():
                 param.requires_grad = name in classification_layers

+    def __getattribute__(self, name: str):
+        attr = super().__getattribute__(name)
+        if name == "_create_model" and callable(attr):
+            cache_name = "__cm_cached__"
+            cache = super().__getattribute__("__dict__").get(cache_name)
+            if cache:
+                return cache
+
+            @wraps(attr)
+            def wrapped(*a, **kw) -> nn.Module:
+                model = attr(*a, **kw)
+                return self._finalize_model(model)
+
+            self.__dict__[cache_name] = wrapped
+            return wrapped
+        return attr
+
     @abstractmethod
     def _create_model(self, head_config: dict | None = None) -> nn.Module:  # type: ignore[override]
         """Create a PyTorch model for this class."""

+    def _finalize_model(self, model: nn.Module) -> nn.Module:
+        """Run after child _create_model(); upgrade to KL if enabled."""
+        if self.kl_weight > 0:
+            return KLHLabelClassifier(
+                backbone=model.backbone,
+                neck=model.neck,
+                head=model.head,
+                multiclass_loss=model.multiclass_loss,
+                multilabel_loss=model.multilabel_loss,
+                init_cfg=getattr(model, "init_cfg", None),
+                kl_weight=self.kl_weight,
+            )
+        return model
+
     def _identify_classification_layers(self, prefix: str = "model.") -> list[str]:
         """Simple identification of the classification layers. Used for incremental learning."""
         # identify classification layers

library/src/otx/backend/native/models/classification/hlabel_models/timm_model.py

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,7 @@ class TimmModelHLabelCls(OTXHlabelClsModel):
         metric (MetricCallable, optional): The metric callable for evaluating the model.
             Defaults to HLabelClsMetricCallable.
         torch_compile (bool, optional): Whether to compile the model using TorchScript. Defaults to False.
+        kl_weight (float, optional): The weight of the tree-path KL divergence loss. Defaults to zero (cross-entropy only).
     """

     def __init__(
@@ -57,6 +58,7 @@ def __init__(
         scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable,
         metric: MetricCallable = HLabelClsMetricCallable,
         torch_compile: bool = False,
+        kl_weight: float = 0.0,
     ) -> None:
         super().__init__(
             label_info=label_info,
@@ -67,6 +69,7 @@ def __init__(
             scheduler=scheduler,
             metric=metric,
             torch_compile=torch_compile,
+            kl_weight=kl_weight,
         )

     def _create_model(self, head_config: dict | None = None) -> nn.Module:  # type: ignore[override]
