From 02bc8525ba31e1365cf8feca00f6867567b730f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Mon, 19 May 2025 18:15:22 +0200 Subject: [PATCH 01/26] =?UTF-8?q?[Pytorch]=20NVIDIA-DL-Framework-Inspect?= =?UTF-8?q?=20support=20=E2=80=93=20part=203=20=E2=80=93=20tests=20(#1612)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tests drop Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move dir Signed-off-by: Pawel Gadzinski * tests fox Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Pawel Gadzinski Signed-off-by: Przemek Tredak Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Przemek Tredak Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_pytorch_debug_unittest/test.sh | 26 + qa/L1_pytorch_distributed_unittest/test.sh | 14 + tests/pytorch/debug/conftest.py | 27 + tests/pytorch/debug/run_distributed.py | 647 ++++++++++++++++ tests/pytorch/debug/test_api_features.py | 398 ++++++++++ tests/pytorch/debug/test_config.py | 151 ++++ .../debug/test_configs/disable_fp8_gemms.yaml | 8 + .../debug/test_configs/disable_fp8_layer.yaml | 7 + .../debug/test_configs/dummy_feature.yaml | 9 + .../fake_quantization_config.yaml | 14 + .../test_configs/per_tensor_scaling.yaml | 19 + .../stats_collection_test_config.yaml | 59 ++ ...ensor_manipulation_transformer_engine.yaml | 45 ++ tests/pytorch/debug/test_distributed.py | 39 + tests/pytorch/debug/test_numerics.py | 718 ++++++++++++++++++ tests/pytorch/debug/test_sanity.py | 107 +++ tests/pytorch/debug/utils.py | 22 + tests/pytorch/distributed/run_numerics.py | 12 + tests/pytorch/test_numerics.py | 26 + transformer_engine/debug/features/api.py | 6 +- .../debug/features/fake_quant.py | 2 +- .../debug/features/log_fp8_tensor_stats.py | 1 - .../debug/features/per_tensor_scaling.py | 5 +- .../debug/features/utils/stats_computation.py | 7 +- .../debug/pytorch/debug_quantization.py | 18 +- transformer_engine/pytorch/distributed.py | 6 + transformer_engine/pytorch/module/base.py | 7 +- .../pytorch/module/layernorm_linear.py | 1 + 28 files changed, 2385 insertions(+), 16 deletions(-) create mode 100644 qa/L0_pytorch_debug_unittest/test.sh create mode 100644 tests/pytorch/debug/conftest.py create mode 100644 tests/pytorch/debug/run_distributed.py create mode 100644 tests/pytorch/debug/test_api_features.py create mode 100644 tests/pytorch/debug/test_config.py create mode 100644 tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml create mode 100644 tests/pytorch/debug/test_configs/disable_fp8_layer.yaml create mode 100644 tests/pytorch/debug/test_configs/dummy_feature.yaml create mode 100644 tests/pytorch/debug/test_configs/fake_quantization_config.yaml 
create mode 100644 tests/pytorch/debug/test_configs/per_tensor_scaling.yaml create mode 100644 tests/pytorch/debug/test_configs/stats_collection_test_config.yaml create mode 100644 tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml create mode 100644 tests/pytorch/debug/test_distributed.py create mode 100644 tests/pytorch/debug/test_numerics.py create mode 100644 tests/pytorch/debug/test_sanity.py create mode 100644 tests/pytorch/debug/utils.py diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh new file mode 100644 index 000000000..9339777f4 --- /dev/null +++ b/qa/L0_pytorch_debug_unittest/test.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + + + +: ${TE_PATH:=/opt/transformerengine} +: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features} +: ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/} + +# Config with the dummy feature which prevents nvinspect from being disabled. +# Nvinspect will be disabled if no feature is active. +: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml} + +FAIL=0 + +pip install pytest==8.2.1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1 + +# standard numerics tests with initialized debug +NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1 + +exit $FAIL diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh index 4319e96c7..09ef661c4 100644 --- a/qa/L1_pytorch_distributed_unittest/test.sh +++ b/qa/L1_pytorch_distributed_unittest/test.sh @@ -20,6 +20,7 @@ FAILED_CASES="" : ${XML_LOG_DIR:=/logs} mkdir -p "$XML_LOG_DIR" + pip3 install pytest==8.2.1 || error_exit "Failed to install pytest" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py" @@ -30,6 +31,19 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_use python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn_with_cp.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py || test_fail "test_fused_attn_with_cp.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py" + +# debug tests + + +# Config with the dummy feature which prevents nvinspect from being disabled. +# Nvinspect will be disabled if no feature is active. 
+: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml} +: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features} + +pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py" +# standard numerics tests with initialized debug +NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py" + if [ "$RET" -ne 0 ]; then echo "Error in the following test cases:$FAILED_CASES" exit 1 diff --git a/tests/pytorch/debug/conftest.py b/tests/pytorch/debug/conftest.py new file mode 100644 index 000000000..20edc6aab --- /dev/null +++ b/tests/pytorch/debug/conftest.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--feature_dirs", nargs="+", action="store", default="", help="List of feature directories" + ) + parser.addoption( + "--configs_dir", + action="store", + default="", + type=str, + help="Path to the directory with configs.", + ) + + +@pytest.fixture +def feature_dirs(request): + return request.config.getoption("--feature_dirs") + + +@pytest.fixture +def configs_dir(request): + return request.config.getoption("--configs_dir") diff --git a/tests/pytorch/debug/run_distributed.py b/tests/pytorch/debug/run_distributed.py new file mode 100644 index 000000000..640fdf9c5 --- /dev/null +++ b/tests/pytorch/debug/run_distributed.py @@ -0,0 +1,647 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
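The distributed tests added in run_distributed.py (whose body follows) all share one debug-session lifecycle: initialize nvdlfw_inspect with a YAML config plus the TE feature directories, run named TransformerEngine modules under fp8_autocast, advance the iteration counter with debug_api.step(), and tear down with debug_api.end_debug(). The dummy-feature config referenced above exists only to keep that session alive, since nvinspect disables itself when no feature is active. A minimal single-process sketch of the lifecycle, with placeholder paths and sizes:

import torch
import nvdlfw_inspect.api as debug_api
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

# Placeholder locations -- the real tests pass these via --feature_dirs and temporary config files.
debug_api.initialize(
    config_file="tests/pytorch/debug/test_configs/dummy_feature.yaml",
    feature_dirs=["transformer_engine/debug/features"],
    log_dir="./log",
    default_logging_enabled=True,
)

recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max")
model = te.Linear(128, 64, name="linear")  # config sections match layers by this name / layer type
x = torch.randn(16 * 128, 128).cuda()

with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
    y = model(x)
y.sum().backward()

debug_api.step()       # advances the counter used by start_step / end_step / freq
debug_api.end_debug()  # the tests call this in their teardown path
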
+ +import tempfile +import functools +import os +import itertools +import random +import argparse +import re + +import torch +import torch.distributed as dist +import transformer_engine +import transformer_engine_torch as tex +import nvdlfw_inspect.api as debug_api +from transformer_engine.debug import set_weight_tensor_tp_group_reduce + + +from test_numerics import ( + _emulate_linear, + _init_debug, + disable_fp8_gemms_create_config, + DISABLE_FP8_LAYER_CONFIG, + _cmp, + IN_SIZE, + OUT_SIZE, + _init_model, + SEED, + SEQ_LEN, + BATCH_SIZE, + FP8_RECIPE, + fake_quant_fp8_create_config, + _get_current_scale, + _prepare_per_tensor_scaling_config, + AMAX_HISTORY_LEN, + set_scaling_factors, + set_current_scaling_factors, +) + +WORLD_RANK, WORLD_SIZE = None, None +NCCL_WORLD = None +FEATURE_DIRS = None +all_boolean = [True, False] +TEST_NR = 0 + + +def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None, tp_rank=None): + if tp_size is None: + tp_size = WORLD_SIZE + tp_rank = WORLD_RANK + torch.manual_seed(weight_seed) + weight = torch.randn((OUT_SIZE, IN_SIZE)).cuda() + torch.manual_seed(data_seed) + in_split_size = IN_SIZE // tp_size + out_split_size = OUT_SIZE // tp_size + x = torch.randn((SEQ_LEN * BATCH_SIZE, IN_SIZE), requires_grad=True).cuda() + if parallel_mode == "row": + x = x[:, tp_rank * in_split_size : (tp_rank + 1) * in_split_size] + x.retain_grad() + + with torch.no_grad(): + if parallel_mode == "column": + weight = weight[tp_rank * out_split_size : (tp_rank + 1) * out_split_size, :] + else: + weight = weight[:, tp_rank * in_split_size : (tp_rank + 1) * in_split_size] + + return x, weight.contiguous() + + +def _init_model(weight, parallel_mode=None, tp_group=None, name="linear"): + model = transformer_engine.pytorch.Linear( + IN_SIZE, + OUT_SIZE, + name=name, + parallel_mode=parallel_mode, + tp_group=(tp_group or NCCL_WORLD if parallel_mode else None), + ) + with torch.no_grad(): + model.weight.copy_(weight) + return model + + +class AllGather(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor, dim, group=None): + if group is None: + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + else: + world_size = torch.distributed.get_world_size(group=group) + rank = torch.distributed.get_rank(group=group) + dist.barrier() + + # Create a list to gather tensors from all processes + y_list = [torch.zeros_like(tensor) for _ in range(world_size)] + torch.distributed.all_gather(y_list, tensor, group=group) + + # Save the world size and rank for backward computation + ctx.world_size = world_size + ctx.rank = rank + ctx.dim = dim + + # Concatenate the gathered tensors along the feature dimension + y_full = torch.cat(y_list, dim=dim) + + return y_full + + @staticmethod + def backward(ctx, grad_output): + # Split the gradient output and return the portion corresponding to this rank + grad_input = torch.chunk(grad_output, ctx.world_size, dim=ctx.dim)[ctx.rank] + return grad_input, None, None + + +def _run_forward_backward(x, model, parallel_mode=None, group=None): + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + + y.requires_grad_(True) + y.retain_grad() + if parallel_mode == "column": + y = AllGather.apply(y, -1, group) + y.requires_grad_(True) + y.retain_grad() + l = y.sum() + l.backward() + elif parallel_mode == "row": + l = y.sum() + l.backward() + debug_api.step() + return y + + +def _emulate_linear_distributed(*args, parallel_mode=None, **kwargs): + assert 
parallel_mode in ["column", "row"] + + def split(gradient): + split_size = OUT_SIZE // WORLD_SIZE + gradient = gradient[:, WORLD_RANK * split_size : (WORLD_RANK + 1) * split_size] + return gradient + + activation_sync = None + gradient_sync = None + if parallel_mode == "column": + activation_sync = lambda x: AllGather.apply(x, -1) + gradient_sync = split + else: + activation_sync = ( + lambda activation: dist.all_reduce(activation, op=dist.ReduceOp.SUM) or activation + ) + + output = _emulate_linear( + *args, activation_sync=activation_sync, gradient_sync=gradient_sync, **kwargs + ) + + if parallel_mode == "column": + dist.all_reduce(output["dgrad"], op=dist.ReduceOp.SUM) + + return output + + +def check_debug_log(msg): + with open(f"log/debug_logs/debug_log_globalrank-{WORLD_RANK}.log", "r") as f: + for line in f.readlines(): + if msg in line: + return True + return False + + +def run_debug_test(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank = dist.get_rank() + temp_file_name = None + temp_logdir_name = None + + if rank == 0: + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file: + temp_file_name = temp_file.name + temp_dir_obj = tempfile.TemporaryDirectory() + temp_logdir_name = temp_dir_obj.name + + # Store the TemporaryDirectory object to prevent it from being deleted + wrapper.temp_dir_obj = temp_dir_obj + + temp_file_name_list = [temp_file_name] + temp_logdir_name_list = [temp_logdir_name] + + # Broadcast the temporary file and directory names to all processes + dist.broadcast_object_list(temp_file_name_list, src=0) + dist.broadcast_object_list(temp_logdir_name_list, src=0) + + temp_file_name = temp_file_name_list[0] + temp_logdir_name = temp_logdir_name_list[0] + + dist.barrier() + + config_file = open(temp_file_name, mode="r+", buffering=1) + + try: + kwargs["config_file"] = config_file + kwargs["log_dir"] = temp_logdir_name + + if rank == 0: + global TEST_NR + print(f"Running test {TEST_NR} {func.__name__} with args = {args}.") + TEST_NR += 1 + + func(*args, **kwargs) + finally: + if rank == 0 and temp_file_name is not None: + os.unlink(temp_file_name) + + debug_api.end_debug() + + if rank == 0 and hasattr(wrapper, "temp_dir_obj"): + wrapper.temp_dir_obj.cleanup() + + return wrapper + + +CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed: + layers: + layer_types: [linear] + enabled: + True + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation, gradient, weight, output, wgrad, dgrad] + stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range] + start_step : 0 + end_step: 1 + LogFp8TensorStats: + enabled: True + tensors: [activation, gradient, weight] + stats: [underflows%] + start_step : 0 + end_step: 1 +""" + + +def _prepare_config_test_log_distributed(config_file): + if WORLD_RANK != 0: + return + config_file.write(CONFIG_LOG_TEST_DISTRIBUTED) + config_file.flush() + + +def _compute_dynamic_range(tensor): + tensor_abs = tensor.abs() + tensor_abs = tensor_abs[tensor_abs != 0] + if tensor_abs.any(): + amin = tensor_abs.min().float() + else: + amin = torch.tensor(1, device=tensor.device).to(torch.float) + amax = tensor_abs.max().float() + if not amax.all(): + amax = torch.tensor(1, device=tensor.device).to(torch.float) + dynamic_range = torch.log2(amax) - torch.log2(amin) + return dynamic_range + + +@run_debug_test +def test_log_distributed(parallel_mode, gather_weight, **kwargs): + _prepare_config_test_log_distributed(kwargs["config_file"]) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], 
FEATURE_DIRS) + set_weight_tensor_tp_group_reduce(gather_weight) + if WORLD_SIZE % 2 != 0: + return # skip + TP_SIZE = WORLD_SIZE // 2 + DP_SIZE = 2 + TP_RANK = WORLD_RANK % TP_SIZE + DP_RANK = (WORLD_RANK - TP_RANK) // TP_SIZE + + debug_api.set_tensor_reduction_group(NCCL_WORLD) + + x, weight = _get_tensors( + parallel_mode, + weight_seed=TP_RANK * 1234, + data_seed=DP_RANK * 1234, + tp_size=TP_SIZE, + tp_rank=TP_RANK, + ) + + tp_group_ranks = [i for i in range(DP_RANK * TP_SIZE, (DP_RANK + 1) * TP_SIZE)] + tp_group = dist.new_group(ranks=tp_group_ranks) + + dp_group_ranks = [i for i in range(TP_RANK, WORLD_SIZE, TP_SIZE)] + dp_group = dist.new_group(ranks=dp_group_ranks) + + model = _init_model(weight, parallel_mode=parallel_mode, tp_group=tp_group) + output = _run_forward_backward(x, model, parallel_mode=parallel_mode, group=tp_group) + + gathered_activation = AllGather.apply(x.contiguous(), 0) + gathered_weight = AllGather.apply(weight.contiguous(), 0, tp_group) + gathered_gradient = AllGather.apply(output.grad.contiguous(), 0, dp_group) + if parallel_mode == "row": + gathered_gradient = AllGather.apply(gathered_gradient, 0, tp_group) + + log_file = kwargs["log_dir"] + "/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log" + + dist.barrier() + if WORLD_RANK != 0: + return # stats are gathered on node 0 + with open(log_file) as f: + content = f.read() + + def get_stat(tensor, stat): + regex = r".*_{tensor}_{stat}\s+.*iteration=(\d+)\s+.*value=([-+]?\d*\.?\d+)".format( + tensor=tensor, stat=stat + ) + for line in content.splitlines(): + match = re.search(regex, line) + if match: + value = float(match.group(2)) + return value + + rf = lambda x: round(float(x), 4) + stats = [] + tensors = { + "activation": gathered_activation, + "weight": gathered_weight if gather_weight else weight, + "gradient": gathered_gradient, + } + stats = { + "min": torch.min, + "max": torch.max, + "mean": torch.mean, + "std": torch.std, + "l1_norm": lambda x: torch.norm(x, p=1), + "l2_norm": lambda x: torch.norm(x, p=2), + "cur_amax": lambda x: x.abs().max(), + "dynamic_range": _compute_dynamic_range, + } + for stat_key in stats.keys(): + for tensor_key in tensors.keys(): + torch.testing.assert_close( + get_stat(tensor_key, stat_key), + rf(stats[stat_key](tensors[tensor_key])), + atol=0.0001, + rtol=0.0001, + ) + set_weight_tensor_tp_group_reduce(True) # reset + + +@run_debug_test +def test_log_expert_parallel(**kwargs): + """ + This test tests the scenario, when one of the node of data parallel does not invoke the debug layer. + It naturally occurs in the expert parallelism, when one expert doesn't get input on one node, + but gets it on other nodes. If there were all_gather inside forward(), this would result in deadlock. 
+ """ + _prepare_config_test_log_distributed(kwargs["config_file"]) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + debug_api.set_tensor_reduction_group(NCCL_WORLD) + x, weight = _get_tensors( + "row", weight_seed=WORLD_RANK * 1234, data_seed=WORLD_RANK * 1234, tp_size=1, tp_rank=0 + ) # data parallel + model = _init_model(weight, parallel_mode=None, name="linear1") + model1 = _init_model(weight, parallel_mode=None, name="linear2") + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y1 = model(x) + y2 = model1(x) + y = y1 + y2 + y.sum().backward() + debug_api.step() + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + if WORLD_RANK != 0: + y = y + model1(x) + + y.sum().backward() + + +@run_debug_test +def test_disable_fp8_gemms(fprop_fp8, dgrad_fp8, wgrad_fp8, parallel_mode, **kwargs): + disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, kwargs["config_file"]) + fp8_kwargs = { + "fprop_fp8": fprop_fp8, + "dgrad_fp8": dgrad_fp8, + "wgrad_fp8": wgrad_fp8, + } + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + x, weight = _get_tensors(parallel_mode) + model = _init_model(weight, parallel_mode=parallel_mode) + y = _run_forward_backward(x, model, parallel_mode=parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + + x.grad.zero_() + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs) + _cmp(ground_truth, output) + + +@run_debug_test +def test_disable_fp8_layer(parallel_mode, **kwargs): + if WORLD_RANK == 0: + kwargs["config_file"].write(DISABLE_FP8_LAYER_CONFIG) + kwargs["config_file"].flush() + dist.barrier() + + x, weight = _get_tensors(parallel_mode) + + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode) + x.grad.zero_() + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + model = _init_model(weight, parallel_mode) + y = _run_forward_backward(x, model, parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + _cmp(ground_truth, output) + + +@run_debug_test +def test_per_tensor_scaling( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + parallel_mode, + **kwargs, +): + input_kwargs = { + "fprop_inp": fprop_inp, + "fprop_weight": fprop_weight, + "dgrad_weight": dgrad_weight, + "dgrad_grad": dgrad_grad, + "wgrad_input": wgrad_input, + "wgrad_grad": wgrad_grad, + } + fp8_kwargs = { + "fprop_fp8": True, + "dgrad_fp8": True, + "wgrad_fp8": True, + } + """ + Runs a test to validate per-tensor (current) scaling in FP8 computations. + The function performs warm-up iterations to populate the amax buffer of the model and compute scaling factors based on delayed scaling. + Subsequently, weights and inputs are switched to ensure their current scaling factors differ from those based on delayed scaling; + similarly, the loss is multiplied by a large factor to alter the gradient's magnitude, + creating a discrepancy between the original (delayed) and per-tensor (current) scaling factors. 
+ Finally, a linear pass is emulated, and the results are compared.” + """ + _prepare_per_tensor_scaling_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + warmup_input, warmup_weight = _get_tensors(parallel_mode=parallel_mode) + model = _init_model(warmup_weight, parallel_mode=parallel_mode) + + # Warmup run to setup amax and scaling factors. + for _ in range(AMAX_HISTORY_LEN): + _run_forward_backward(warmup_input, model, parallel_mode=parallel_mode) + + x, weight = _get_tensors( + parallel_mode=parallel_mode, weight_seed=WORLD_RANK * 2137, data_seed=WORLD_RANK * 2137 + ) + model.weight.data = weight.data + x.retain_grad() + + # delayed scaling factor + # need to be collected before forward pass with test data, + # because this forward pass changes scaling factors + set_scaling_factors(model, input_kwargs, fp8_kwargs) + + LOSS_MULTIPLIER = 100 + + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + model.zero_grad() + if parallel_mode == "column": + y = AllGather.apply(y, -1) + y.retain_grad() + + ( + LOSS_MULTIPLIER * y.sum() + ).backward() # Loss multiplication to change gradient's order of magintude + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + # per tensor - current - scaling factors + # need to be collected after forward pass with test data, + # because gradient(y.grad) cannot be accessed before forward, + # but it needs to be collected. + + set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs) + ground_truth = _emulate_linear_distributed( + x, weight, parallel_mode=parallel_mode, loss_multiplier=LOSS_MULTIPLIER, **fp8_kwargs + ) + + _cmp(ground_truth, output) + + +@run_debug_test +def test_fake_quant_fp8( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + parallel_mode, + **kwargs, +): + + fp8_kwargs = { + "fprop_input_fake_quant": fprop_inp, + "fprop_weight_fake_quant": fprop_weight, + "dgrad_gradient_fake_quant": dgrad_grad, + "dgrad_weight_fake_quant": dgrad_weight, + "wgrad_gradient_fake_quant": wgrad_grad, + "wgrad_input_fake_quant": wgrad_input, + "fprop_fp8": not (fprop_inp or fprop_weight), + "dgrad_fp8": not (dgrad_weight or dgrad_grad), + "wgrad_fp8": not (wgrad_grad or wgrad_input), + } + if WORLD_RANK == 0: + fake_quant_fp8_create_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + dist.barrier() + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + x, weight = _get_tensors(parallel_mode) + model = _init_model(weight, parallel_mode) + y = _run_forward_backward(x, model, parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + fp8_kwargs["fprop_input_scale"] = ( + _get_current_scale(x, fprop_inp) if not fp8_kwargs["fprop_fp8"] else None + ) + fp8_kwargs["fprop_weight_scale"] = ( + _get_current_scale(weight, fprop_weight) if not fp8_kwargs["fprop_fp8"] else None + ) + fp8_kwargs["dgrad_gradient_scale"] = ( + _get_current_scale(y.grad, dgrad_grad) if not fp8_kwargs["dgrad_fp8"] else None + ) + fp8_kwargs["dgrad_weight_scale"] = ( + _get_current_scale(weight, dgrad_weight) if not fp8_kwargs["dgrad_fp8"] else None + ) + fp8_kwargs["wgrad_gradient_scale"] = ( + _get_current_scale(y.grad, wgrad_grad) if not 
fp8_kwargs["wgrad_fp8"] else None + ) + fp8_kwargs["wgrad_input_scale"] = ( + _get_current_scale(x, wgrad_input) if not fp8_kwargs["wgrad_fp8"] else None + ) + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs) + _cmp(ground_truth, output) + + +def _init_distributed(): + global WORLD_RANK, WORLD_SIZE, NCCL_WORLD, FP8 + + WORLD_RANK = int(os.getenv("RANK", "0")) + WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) + LOCAL_RANK = int(os.getenv("LOCAL_RANK", "0")) + LOCAL_SIZE = int(os.getenv("LOCAL_WORLD_SIZE", "1")) + + assert WORLD_SIZE == LOCAL_SIZE # this test supports only 1 node + assert LOCAL_SIZE <= torch.cuda.device_count() + dist_init_kwargs = { + "backend": "nccl", + "rank": WORLD_RANK, + "world_size": WORLD_SIZE, + } + dist_init_kwargs["init_method"] = "env://" + dist_init_kwargs["device_id"] = torch.device(f"cuda:{LOCAL_RANK}") + assert dist.is_nccl_available() + torch.cuda.set_device(LOCAL_RANK) + dist.init_process_group(**dist_init_kwargs) + + NCCL_WORLD = dist.new_group(backend="nccl") + + WORLD_SIZE = dist.get_world_size() + + +def _run_test_with_combinations( + test_function, values_list, num_repeat, extra_args, sample_size=None +): + combinations = itertools.product(values_list, repeat=num_repeat) + total_combinations = itertools.product(combinations, extra_args) + + if sample_size is not None: + total_combinations = random.sample(list(total_combinations), sample_size) + + for comb, arg in total_combinations: + test_function(*comb, arg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--feature_dirs", type=str) + args = parser.parse_args() + FEATURE_DIRS = args.feature_dirs + random.seed(SEED) + _init_distributed() + + test_log_expert_parallel() + for parallel_mode in ["column", "row"]: + for gather_weight in [True, False]: + test_log_distributed(parallel_mode, gather_weight) + + for parallel_mode in ["row", "column"]: + test_disable_fp8_layer(parallel_mode) + + # test_disable_fp8_gemms + _run_test_with_combinations( + test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"] + ) + + # test_fake_quant_fp8 + dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None] + _run_test_with_combinations( + test_fake_quant_fp8, + dtype_options, + num_repeat=6, + extra_args=["column", "row"], + sample_size=20, + ) + + _run_test_with_combinations( + test_per_tensor_scaling, + all_boolean, + num_repeat=6, + extra_args=["column"], + sample_size=20, + ) diff --git a/tests/pytorch/debug/test_api_features.py b/tests/pytorch/debug/test_api_features.py new file mode 100644 index 000000000..f9cd234ba --- /dev/null +++ b/tests/pytorch/debug/test_api_features.py @@ -0,0 +1,398 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
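test_api_features.py, which follows, exercises the query side of the API: a YAML section selects layers, a feature block under the transformer_engine namespace configures behaviour, and debug_api.transformer_engine.* answers per-layer, per-GEMM questions at a given iteration. A self-contained sketch of that round trip, using an inline config equivalent to disable_fp8_gemms.yaml (the feature_dirs path is an assumption; the tests receive it through the --feature_dirs pytest option):

import tempfile
import nvdlfw_inspect.api as debug_api

CONFIG = """disable_dgrad_wgrad:
  enabled: True
  layers:
    layer_types: [qkv, fc2]
  transformer_engine:
    DisableFP8GEMM:
      enabled: True
      gemms: [dgrad, wgrad]
"""

feature_dirs = ["transformer_engine/debug/features"]  # assumed location, as in the QA scripts

with tempfile.NamedTemporaryFile("w", suffix=".yaml") as cfg:
    cfg.write(CONFIG)
    cfg.flush()
    debug_api.initialize(cfg.name, feature_dirs=feature_dirs)
    try:
        # fprop is untouched; dgrad/wgrad fall back to high precision for matching layers.
        assert debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.attn.qkv", gemm="fprop", iteration=0)
        assert not debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.attn.qkv", gemm="dgrad", iteration=0)
        # Layers that match no config section keep the default behaviour (FP8 enabled).
        assert debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.mlp.fc1", gemm="dgrad", iteration=0)
    finally:
        debug_api.end_debug()
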
+ +import torch +from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer + +import nvdlfw_inspect.api as debug_api + +try: + import transformer_engine + import transformer_engine_torch as tex +except (ImportError, ModuleNotFoundError): + print("Could not find TransformerEngine package.") + exit(1) + + +def test_transformer_engine_no_config(feature_dirs): + debug_api.initialize("", feature_dirs=feature_dirs) + try: + + tensor = torch.rand(24, 2046).cuda() + + # FP8 enabled - true by the default + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + + # modify_tensor_enabled - False by default + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0 + ) + + # inspect_tensor_enabled - False by default + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.attn.qkv", tensor_name="activation", iteration=0 + ) + + # inspect_tensor_postquantize - False by default + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_disable_fp8_gemm(configs_dir, feature_dirs): + try: + debug_api.initialize(configs_dir + "disable_fp8_gemms.yaml", feature_dirs=feature_dirs) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + + # caching + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_disable_fp8_layer(configs_dir, feature_dirs): + try: + debug_api.initialize(configs_dir + "disable_fp8_layer.yaml", feature_dirs=feature_dirs) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="fprop", iteration=0 + ) + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", iteration=0 + ) + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_per_tensor_scaling(configs_dir, feature_dirs): + try: + + debug_api.initialize(configs_dir + "per_tensor_scaling.yaml", feature_dirs=feature_dirs) + + tensor = torch.rand(24, 2046).cuda() + + # check modify_tensor_enabled + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0 + ) + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0 + ) + assert 
debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0 + ) + + # check modify_tensor + + default_quantizer1 = Float8Quantizer( + scale=torch.tensor([1]).cuda(), + amax=torch.tensor([0]).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + ) + default_quantizer2 = Float8Quantizer( + scale=torch.tensor([1]).cuda(), + amax=torch.tensor([0]).cuda(), + fp8_dtype=tex.DType.kFloat8E5M2, + ) + + output1 = debug_api.transformer_engine.modify_tensor( + layer_name="decoder.1.mlp.fc1", + gemm="fprop", + tensor_name="activation", + default_quantizer=default_quantizer1, + iteration=0, + tensor=tensor, + ) + assert type(output1) == Float8Tensor + assert output1._fp8_dtype == tex.DType.kFloat8E4M3 + + output2 = debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="dgrad", + tensor=tensor, + tensor_name="gradient", + default_quantizer=default_quantizer2, + iteration=0, + ) + assert type(output2) == Float8Tensor + assert output2._fp8_dtype == tex.DType.kFloat8E5M2 + + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", + gemm="wgrad", + tensor_name="gradient", + iteration=0, + ) + + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc4", + gemm="fprop", + tensor_name="activation", + iteration=0, + ) + finally: + debug_api.end_debug() + + +def test_fake_quant(configs_dir, feature_dirs): + try: + debug_api.initialize( + configs_dir + "fake_quantization_config.yaml", feature_dirs=feature_dirs + ) + + tensor = torch.rand(24, 2046).cuda() + + # modify_tensor_enabled + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0 + ) + + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0 + ) + + # modify_tensor + debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="fprop", + tensor=tensor, + tensor_name="activation", + iteration=0, + default_quantizer=None, + ) + + debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="dgrad", + tensor=tensor, + tensor_name="gradient", + iteration=0, + default_quantizer=None, + ) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.fc2", gemm="wgrad", iteration=0 + ) + # caching + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.fc2", gemm="wgrad", iteration=0 + ) + finally: + debug_api.end_debug() + + +def test_statistics_collection(configs_dir, feature_dirs): + try: + debug_api.initialize( + config_file=configs_dir + "stats_collection_test_config.yaml", + feature_dirs=feature_dirs, + default_logging_enabled=False, + ) + + tensor = torch.randn((100, 100, 5)).cuda() + tensor_fp8 = Float8Tensor( + data=tensor.to(torch.uint8).cuda(), + fp8_scale_inv=torch.full([1], 1.0).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + shape=tensor.shape, + dtype=torch.float32, + ) + + def log(): + from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS + + return STATS_BUFFERS.log_stats() + 
+ def assert_empty(): + stats = log() + assert len(stats) == 0 + + # TE tensor stats -- + debug_api.transformer_engine.inspect_tensor( + "decoder.1.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=200, + tp_group=None, + ) + stats = log() + assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max() + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.mlp.fc1", tensor_name="activation", iteration=201 + ) + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.2.mlp.fc1", tensor_name="activation", iteration=200 + ) + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.mlp.fc1", tensor_name="gradient", iteration=200 + ) + + expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5) + expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5) + + # TE FP8 tensor stats -- + assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200 + ) + debug_api.transformer_engine.inspect_tensor_postquantize( + "decoder.1.mlp.fc1", + tensor=tensor_fp8, + tensor_name="gradient", + iteration=200, + rowwise=True, + tp_group=None, + ) + stats = log() + torch.testing.assert_close( + stats[("decoder.1.mlp.fc1", "gradient", "underflows%", 200)], expected_underflows + ) + + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201 + ) + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200 + ) + + # Second config in same yaml + tensor = torch.rand((100, 100, 5)) + debug_api.transformer_engine.inspect_tensor( + "decoder.6.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=200, + tp_group=None, + ) + stats = log() + stats_names = [x[3] for x in stats.keys()] + all(s in stats_names for s in ["cur_amax", "dynamic_range", "mean", "std", "l1_norm"]) + assert stats[("decoder.6.mlp.fc1", "activation", "mean", 200)] == tensor.mean() + + debug_api.transformer_engine.inspect_tensor( + "decoder.7.mlp.fc1", + tensor=tensor, + tensor_name="weight", + iteration=200, + tp_group=None, + ) + stats = log() + stats_names = [x[3] for x in stats.keys()] + all(s in stats_names for s in ["mean", "std", "l1_norm", "min", "max"]) + assert stats[("decoder.7.mlp.fc1", "weight", "max", 200)] == tensor.max() + + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.7.mlp.fc1", tensor_name="weight", iteration=201 + ) + assert_empty() + + finally: + debug_api.end_debug() + + +def test_statistics_multi_run(configs_dir, feature_dirs): + try: + debug_api.initialize( + config_file=configs_dir + "stats_collection_test_config.yaml", + feature_dirs=feature_dirs, + default_logging_enabled=False, + ) + + def feed(tensor, tensor_fp8): + debug_api.transformer_engine.inspect_tensor( + "decoder.5.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=1, + tp_group=None, + ) + debug_api.transformer_engine.inspect_tensor_postquantize( + "decoder.5.mlp.fc1", + tensor=tensor_fp8, + tensor_name="activation", + iteration=1, + rowwise=True, + tp_group=None, + ) + + def log_stats(): + from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS + + return STATS_BUFFERS.log_stats() + + def fp8_tensor(t): + return Float8Tensor( + data=t.to(torch.uint8).cuda(), + 
fp8_scale_inv=torch.ones([1]).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + shape=t.shape, + dtype=torch.float32, + ) + + shape = [1024, 1024] + tensors = [torch.randn(shape) for _ in range(2)] + tensors_fp8 = [fp8_tensor(tensors[i]) for i in range(2)] + + feed(tensors[0], tensors_fp8[0]) + feed(tensors[1], tensors_fp8[1]) + stats1 = log_stats() + + tensor2 = torch.cat((tensors[0], tensors[1])).cuda() + fp8tensor2 = fp8_tensor(tensor2) + feed(tensor2, fp8tensor2) + stats2 = log_stats() + + assert len(stats1.keys()) > 0 + for k in stats1.keys(): + torch.testing.assert_close(stats1[k], stats2[k]) + finally: + debug_api.end_debug() + + +if __name__ == "__main__": + pass diff --git a/tests/pytorch/debug/test_config.py b/tests/pytorch/debug/test_config.py new file mode 100644 index 000000000..71715a686 --- /dev/null +++ b/tests/pytorch/debug/test_config.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +import pathlib, os + +from nvdlfw_inspect.config_manager import ConfigManager + +import nvdlfw_inspect.api as debug_api + +try: + import transformer_engine + from transformer_engine.debug.features.api import TEConfigAPIMapper +except (ImportError, ModuleNotFoundError): + print("Could not find TransformerEngine debug module.") + exit(1) + + +def test_transformer_engine_config_parsing(feature_dirs): + debug_api.initialize( + config_file=pathlib.Path(__file__).resolve().parent + / "test_configs/tensor_manipulation_transformer_engine.yaml", + feature_dirs=feature_dirs, + log_dir="./log", + ) + + cfg_fc1 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc1")["transformer_engine"] + cfg_fc2 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc2")["transformer_engine"] + assert cfg_fc1 and cfg_fc2 + + gemm_parsing = True + tensor_parsing = True + + # Per tensor scaling set for dgrad, filter based on gemm + ret, _ = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="activation", + ) + assert not ret + + # per tensor scaling set for gradient, filter based on tensor name + ret, _ = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="activation", + ) + assert not ret + + ret, parsed_cfg_fc1 = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc1 == {"gemm": "dgrad", "tensor": "gradient"} + + # Test tensor struct + ret, parsed_cfg_fc1_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="activation", + ) + ret, parsed_cfg_fc1_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc1_act == { + "gemm": "fprop", + "tensor": "activation", + "quant_format": "FP8E4M3", + } + assert parsed_cfg_fc1_wei == { + "gemm": "fprop", + "tensor": "weight", + "quant_format": "FP8E4M3", + } + + # Test gemms struct + ret, parsed_cfg_fc2_grad = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + 
gemm="dgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc2_grad == {"gemm": "dgrad", "tensor": "gradient", "quant_format": "FP8E5M2"} + ret, parsed_cfg_fc2_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc2_wei == {"gemm": "dgrad", "tensor": "weight", "quant_format": "FP8E5M2"} + + # Test gemm + tensor struct + ret, parsed_cfg_fc2_fprop_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="activation", + ) + assert ret + assert parsed_cfg_fc2_fprop_act == {"gemm": "fprop", "tensor": "activation"} + + ret, parsed_cfg_fc2_fprop_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc2_fprop_wei == {"gemm": "fprop", "tensor": "weight"} + + ret, parsed_cfg_fc2_wgrad_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="activation", + ) + assert ret + assert parsed_cfg_fc2_wgrad_act == {"gemm": "wgrad", "tensor": "activation"} + + ret, parsed_cfg_fc2_wgrad_grad = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc2_wgrad_grad == {"gemm": "wgrad", "tensor": "gradient"} + + ConfigManager.reset() diff --git a/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml b/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml new file mode 100644 index 000000000..b832f26d8 --- /dev/null +++ b/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml @@ -0,0 +1,8 @@ +test_disable_fp8_gemm_1: + enabled: True + layers: + layer_types: [qkv, fc2] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [dgrad, wgrad] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml b/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml new file mode 100644 index 000000000..39bfc7a25 --- /dev/null +++ b/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml @@ -0,0 +1,7 @@ +test_disable_fp8_layer: + enabled: True + layers: + layer_types: [qkv] + transformer_engine: + DisableFP8Layer: + enabled: True \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/dummy_feature.yaml b/tests/pytorch/debug/test_configs/dummy_feature.yaml new file mode 100644 index 000000000..540e3ac42 --- /dev/null +++ b/tests/pytorch/debug/test_configs/dummy_feature.yaml @@ -0,0 +1,9 @@ +deummy_feature_everywhere: + enabled: True + layers: + layer_name_regex_pattern: .* + transformer_engine: + TestDummyFeature: + enabled: True + tensors: [weight, activation, gradient, output, wgrad, dgrad] + gemms: [wgrad, dgrad, fprop] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/fake_quantization_config.yaml b/tests/pytorch/debug/test_configs/fake_quantization_config.yaml new file mode 100644 index 000000000..62feace6d --- /dev/null +++ b/tests/pytorch/debug/test_configs/fake_quantization_config.yaml @@ -0,0 +1,14 @@ +test_fake_quant_fp8: + enabled: True + layers: + layer_numbers: [1] + layer_types: [fc1, fc2] + 
transformer_engine: + FakeQuant: + enabled: True + gemms: [fprop, dgrad] + tensors_struct: + - tensor: activation + quant_format: FP8E4M3 + - tensor: gradient + quant_format: FP8E5M2 \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml b/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml new file mode 100644 index 000000000..c17f2f7d2 --- /dev/null +++ b/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml @@ -0,0 +1,19 @@ +test_per_tensor_scaling: + enabled: True + layers: + layer_numbers: [1] + layer_types: [fc1, fc2] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [wgrad] + PerTensorScaling: + enabled: True + gemms_struct: + - gemm: fprop + tensors_struct: + - tensor: activation + - tensor: weight + - gemm: dgrad + tensors_struct: + - tensor: gradient \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml b/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml new file mode 100644 index 000000000..8f01b2d62 --- /dev/null +++ b/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml @@ -0,0 +1,59 @@ +stat_collection_test_1: + enabled: True + layers: + layer_numbers: [1, 3] + LogTensorStats: + enabled: True + stats: [mean, std, l1_norm, l2_norm] + tensors: [activation] + freq: 1 + start_step: 100 + end_step: 500 + transformer_engine: + LogTensorStats: + enabled: True + stats: [cur_amax, dynamic_range] + tensors: [activation] + freq: 2 + start_step: 100 + end_step: 500 + LogFp8TensorStats: + enabled: True + stats: [underflows%] + tensors: [gradient] + freq: 5 + start_step: 100 + end_step: 500 + +stat_collection_test_2: + enabled: True + layers: + layer_numbers: [6, 7] + transformer_engine: + LogTensorStats: + enabled: True + tensors_struct: + - tensor: activation + stats: [cur_amax, dynamic_range, mean, std, l1_norm] + freq: 2 + start_step: 100 + end_step: 500 + - tensor: weight + stats: [mean, std, l1_norm, min, max] + freq: 5 + start_step: 100 + end_step: 500 + +stat_collection_test_4: + enabled: True + layers: + layer_numbers: [5] + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation] + stats: [cur_amax, dynamic_range, mean, std, l1_norm] + LogFp8TensorStats: + enabled: True + stats: [underflows%] + tensors: [activation] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml b/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml new file mode 100644 index 000000000..e86486366 --- /dev/null +++ b/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml @@ -0,0 +1,45 @@ +# This config is used when FP8 training is ON + +transformer_engine_fc1_manipulation: + enabled: True + layers: + layer_name_regex_pattern: .*(fc1) # Select layers if they end in fc1 + transformer_engine: # namespace + DisableFP8GEMM: # Disable FP8 GEMM. FProp run in high precision + enabled: True + gemms: [fprop] + PerTensorScaling: # Scale DGrad gradients using per tensor current scaling and run FP8 GEMM + enabled: True + gemms: [dgrad] + tensors: [gradient] + FakeQuant: # Disable FP8 GEMM for Wgrad. 
Fake quantize activations to Wgrad and run high precision GEMM + enabled: True + gemms: [fprop] + tensors_struct: + - tensor: activation + quant_format: FP8E4M3 + - tensor: weight + quant_format: FP8E4M3 + +transformer_engine_fc2_manipulation: + enabled: True + layers: + layer_name_regex_pattern: .*(fc2) # Select layers if they end in fc2 + transformer_engine: # namespace + PerTensorScaling: # Scale WGrad and Fprop inputs using per tensor current scaling and run FP8 GEMM + enabled: True + gemms_struct: + - gemm: fprop + tensors_struct: + - tensor: activation + - tensor: weight + - gemm: wgrad + tensors_struct: + - tensor: activation + - tensor: gradient + FakeQuant: # Disable FP8 GEMM for DGrad. Fake quantize weights and gradients to DGrad and run high precision GEMM + enabled: True + gemms_struct: + - gemm: dgrad + tensors: [weight, gradient] + quant_format: FP8E5M2 \ No newline at end of file diff --git a/tests/pytorch/debug/test_distributed.py b/tests/pytorch/debug/test_distributed.py new file mode 100644 index 000000000..7c072a054 --- /dev/null +++ b/tests/pytorch/debug/test_distributed.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import subprocess +from pathlib import Path + +import pytest +import torch + +""" + Distributed numerics tests + + These tests test the numerical corectness of the TransformerEngine layers. + Tests are parametrized by the layer and fp8 precision. + One test consists of running multiple configurations from file run_numerics.py + Such design is due to the fact the initialization of one test is long + - 2 processes need to start and load torch and TE. Multiple configurations + are run in one test - this reduces the initialization overhead. + +""" + + +if torch.cuda.device_count() < 2: + pytest.skip("Distributed training needs at least 2 GPUs.") + +TEST_ROOT = Path(__file__).parent.resolve() +NUM_PROCS: int = min(4, torch.cuda.device_count()) +LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"] + + +def test_debug_distributed(feature_dirs): + test_path = TEST_ROOT / "run_distributed.py" + test_cmd = LAUNCH_CMD + [str(test_path), f"--feature_dirs={feature_dirs[0]}"] + + result = subprocess.run(test_cmd, env=os.environ, capture_output=True, check=False) + if result.returncode != 0: + raise AssertionError(result.stderr.decode()) diff --git a/tests/pytorch/debug/test_numerics.py b/tests/pytorch/debug/test_numerics.py new file mode 100644 index 000000000..55c3ab9b7 --- /dev/null +++ b/tests/pytorch/debug/test_numerics.py @@ -0,0 +1,718 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
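test_numerics.py, which starts here, builds its ground truth from a few primitives used throughout the file: Float8Quantizer (cast a tensor to FP8 at a given scale), dequantize() (back to high precision, i.e. the fake-quantization path of _fake_cast), and _default_sf_compute (the scale factor derived from a tensor's amax, as in _get_current_scale). A short stand-alone sketch of that round trip, including the kind of underflow statistic that the LogFp8TensorStats configs report; the shape and dtype choices are arbitrary:

import torch
import transformer_engine_torch as tex
from transformer_engine.common.recipe import Format
from transformer_engine.pytorch.fp8 import _default_sf_compute
from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer

tensor = torch.randn(1024, 1024).cuda()

# Per-tensor "current" scale, computed from this tensor's amax and the E4M3 max representable value.
amax = tensor.abs().max().float()
one = torch.ones(1, device=tensor.device)
scale = _default_sf_compute(amax, one, Format.E4M3.value.max_fwd, 0)

quantizer = Float8Quantizer(
    scale=scale.clone(),
    amax=torch.zeros(1, device=tensor.device),
    fp8_dtype=tex.DType.kFloat8E4M3,
)
fp8_tensor = quantizer(tensor)       # cast to FP8 (returns a Float8Tensor)
dequant = fp8_tensor.dequantize()    # back to high precision -> the fake-quantized tensor

# Quantization error and the underflows% statistic (share of elements stored as 0 in FP8).
max_err = (tensor - dequant).abs().max().item()
underflow_pct = (fp8_tensor._data == 0).sum().item() * 100.0 / tensor.numel()
print(f"max abs error {max_err:.4e}, underflows {underflow_pct:.3f}%")

The numerics tests below compare a TransformerEngine Linear run under these debug features against an emulated linear (_emulate_linear) assembled from exactly these pieces.
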
+ +import functools +import itertools +import os +import random +import tempfile +from string import Template + +import pytest +import torch + +import nvdlfw_inspect.api as debug_api +import transformer_engine.debug +import transformer_engine.pytorch as tepytorch +import transformer_engine_torch as tex +from transformer_engine.common.recipe import DelayedScaling, Format +from transformer_engine.pytorch.fp8 import _default_sf_compute +from transformer_engine.pytorch.tensor.float8_tensor import ( + Float8Quantizer, + Float8CurrentScalingQuantizer, +) +from transformer_engine.pytorch.module.base import ( + _2X_ACC_DGRAD, + _2X_ACC_FPROP, + _2X_ACC_WGRAD, +) + +all_boolean = [True, False] +FP8_FORMAT = Format.HYBRID +AMAX_HISTORY_LEN = 16 +FP8_RECIPE = DelayedScaling( + fp8_format=FP8_FORMAT, amax_history_len=AMAX_HISTORY_LEN, amax_compute_algo="max" +) +SEED = 1234 +IN_SIZE = 128 +OUT_SIZE = 64 +BATCH_SIZE = 16 +SEQ_LEN = 128 +LOSS_FN = torch.nn.functional.cross_entropy + + +def _cast_to_fp8(tensor, scale, dtype): + tensor = tensor.contiguous() + if type(scale) == torch.Tensor: + amax = scale.abs().max().float() + quantizer = Float8Quantizer(scale, amax, dtype) + else: + quantizer = Float8CurrentScalingQuantizer(scale, device=tensor.device) + + return quantizer(tensor) + + +def _get_current_scale(tensor, fp8_dtype): + if fp8_dtype == tex.DType.kFloat8E4M3: + fp8_max = Format.E4M3.value.max_fwd + else: + fp8_max = Format.E5M2.value.max_fwd + + amax = tensor.abs().max().float() + one = torch.ones(1, device=tensor.device) + + return _default_sf_compute(amax, one, fp8_max, 0).detach() + + +def _fake_cast(tensor, fp8_dtype, scale): + scale = scale or _get_current_scale(tensor, fp8_dtype) + fp8_tensor = _cast_to_fp8(tensor, scale, fp8_dtype) + + return fp8_tensor.dequantize() + + +def _fp8_gemm_kernel(tensor1, scale1, dtype1, tensor2, scale2, dtype2, use_split_accumulator): + fp8_tensor1 = _cast_to_fp8(tensor1, scale1, dtype1) + fp8_tensor2 = _cast_to_fp8(tensor2, scale2, dtype2) + + out, *_ = tepytorch.cpp_extensions.general_gemm( + fp8_tensor1, + fp8_tensor2, + tepytorch.module.base.get_workspace(), + torch.float32, + use_split_accumulator=use_split_accumulator, + ) + out.requires_grad = True + return out.T + + +def _emulate_linear( + input: torch.Tensor, + weight: torch.Tensor, + fprop_fp8: bool = False, + fprop_input_fake_quant: tex.DType = None, + fprop_input_scale: torch.Tensor = None, + fprop_weight_fake_quant: tex.DType = None, + fprop_weight_scale: torch.Tensor = None, + dgrad_fp8: bool = False, + dgrad_gradient_fake_quant: tex.DType = None, + dgrad_gradient_scale: torch.Tensor = None, + dgrad_weight_fake_quant: tex.DType = None, + dgrad_weight_scale: torch.Tensor = None, + wgrad_fp8: bool = False, + wgrad_gradient_fake_quant: tex.DType = None, + wgrad_gradient_scale: torch.Tensor = None, + wgrad_input_fake_quant: tex.DType = None, + wgrad_input_scale: torch.Tensor = None, + loss_multiplier: float = 1.0, + activation_sync=None, + gradient_sync=None, +): + _scalar = lambda x: torch.Tensor([x]).cuda() if type(x) in [float, torch.Tensor] else x + if fprop_fp8: + activation = _fp8_gemm_kernel( + input, + _scalar(fprop_input_scale or 1.0), + tex.DType.kFloat8E4M3, + weight, + _scalar(fprop_weight_scale or 1.0), + tex.DType.kFloat8E4M3, + _2X_ACC_FPROP, + ) + activation = activation.clone().detach().contiguous().requires_grad_(True) + else: + fprop_input = ( + _fake_cast(input, fprop_input_fake_quant, _scalar(fprop_input_scale)) + if fprop_input_fake_quant is not None + else input + ) + 
fprop_weight = ( + _fake_cast(weight, fprop_weight_fake_quant, _scalar(fprop_weight_scale)) + if fprop_weight_fake_quant is not None + else weight + ) + + activation = (fprop_input @ fprop_weight.T).contiguous() + + if activation_sync: + activation = activation_sync(activation) + + activation.retain_grad() + + (loss_multiplier * activation.sum()).backward(retain_graph=True) + gradient = activation.grad.clone() + + if gradient_sync: + gradient = gradient_sync(gradient) + + if dgrad_fp8: + dgrad = _fp8_gemm_kernel( + weight.T, + _scalar(dgrad_weight_scale or 1.0), + tex.DType.kFloat8E4M3, + gradient, + _scalar(dgrad_gradient_scale or 1.0), + tex.DType.kFloat8E5M2, + _2X_ACC_DGRAD, + ).T + else: + dgrad_gradient = ( + _fake_cast(gradient, dgrad_gradient_fake_quant, _scalar(dgrad_gradient_scale)) + if dgrad_gradient_fake_quant is not None + else gradient + ) + + dgrad_weight = ( + _fake_cast(weight, dgrad_weight_fake_quant, _scalar(dgrad_weight_scale)) + if dgrad_weight_fake_quant is not None + else weight + ) + dgrad = dgrad_gradient @ dgrad_weight + + if wgrad_fp8: + wgrad = _fp8_gemm_kernel( + input.T, + _scalar(wgrad_input_scale or 1.0), + tex.DType.kFloat8E4M3, + gradient.T, + _scalar(wgrad_gradient_scale or 1.0), + tex.DType.kFloat8E5M2, + _2X_ACC_WGRAD, + ).T + else: + wgrad_gradient = ( + _fake_cast(gradient, wgrad_gradient_fake_quant, _scalar(wgrad_gradient_scale)) + if wgrad_gradient_fake_quant is not None + else gradient + ) + wgrad_input = ( + _fake_cast(input, wgrad_input_fake_quant, _scalar(wgrad_input_scale)) + if wgrad_input_fake_quant is not None + else input + ) + wgrad_input = wgrad_input.contiguous() + wgrad_gradient = wgrad_gradient.contiguous() + wgrad, *_ = tepytorch.cpp_extensions.general_gemm( + wgrad_input, + wgrad_gradient, + tepytorch.module.base.get_workspace(), + torch.float32, + layout="NT", + grad=True, + use_split_accumulator=_2X_ACC_WGRAD, + ) + + return {"activation": activation, "wgrad": wgrad, "dgrad": dgrad} + + +def _init_debug(config_name, log_dir, feature_dirs): + debug_api.initialize( + config_file=config_name, + feature_dirs=feature_dirs, + log_dir=log_dir, + default_logging_enabled=True, + ) + + +def create_config_file(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file: + with tempfile.TemporaryDirectory() as temp_dir: + try: + kwargs["config_file"] = temp_file + kwargs["log_dir"] = temp_dir + result = func(*args, **kwargs) + finally: + temp_file_name = temp_file.name + debug_api.end_debug() + os.unlink(temp_file_name) + return result + + return wrapper + + +def _cmp(ground_truth, output): + torch.testing.assert_close(ground_truth["activation"], output["activation"]) + torch.testing.assert_close(ground_truth["wgrad"], output["wgrad"]) + torch.testing.assert_close(ground_truth["dgrad"], output["dgrad"]) + + +def _init_model(weight): + model = transformer_engine.pytorch.Linear(IN_SIZE, OUT_SIZE, name="linear") + with torch.no_grad(): + model.weight.copy_(weight.contiguous()) + return model + + +def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None): + with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x, is_first_microbatch=is_first_microbatch) + (y.sum() * loss_scale).backward() + debug_api.step() + return y + + +def _get_tensors(): + torch.manual_seed(SEED) + x = torch.randn((SEQ_LEN * BATCH_SIZE, IN_SIZE), requires_grad=True).cuda() + x.retain_grad() + weight = torch.randn((OUT_SIZE, IN_SIZE)).cuda() + return x, 
weight + + +DISABLE_FP8_CONFIG = Template( + """disable_fp8_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [$gemms] +""" +) + + +@pytest.mark.parametrize("fprop_fp8", all_boolean) +@pytest.mark.parametrize("dgrad_fp8", all_boolean) +@pytest.mark.parametrize("wgrad_fp8", all_boolean) +def test_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8): + run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8) + + +def disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, config_file): + gemms = "" + if not fprop_fp8: + gemms += "fprop," + if not dgrad_fp8: + gemms += "dgrad," + if not wgrad_fp8: + gemms += "wgrad," + if len(gemms) > 0: + gemms = gemms[:-1] # remove last ',' + config_file.write(DISABLE_FP8_CONFIG.safe_substitute(gemms=gemms)) + config_file.flush() + + +@create_config_file +def run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8, **kwargs): + disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, kwargs["config_file"]) + fp8_kwargs = { + "fprop_fp8": fprop_fp8, + "dgrad_fp8": dgrad_fp8, + "wgrad_fp8": wgrad_fp8, + } + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + x, weight = _get_tensors() + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + + x.grad.zero_() + ground_truth = _emulate_linear(x, weight, **fp8_kwargs) + _cmp(ground_truth, output) + + +def test_disable_fp8_layer(feature_dirs): + run_disable_fp8_layer(feature_dirs) + + +DISABLE_FP8_LAYER_CONFIG = """disable_fp8_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + DisableFP8Layer: + enabled: True +""" + + +@create_config_file +def run_disable_fp8_layer(feature_dirs, **kwargs): + kwargs["config_file"].write(DISABLE_FP8_LAYER_CONFIG) + kwargs["config_file"].flush() + + x, weight = _get_tensors() + + ground_truth = _emulate_linear(x, weight) + x.grad.zero_() + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + _cmp(ground_truth, output) + + +random.seed(1234) + +all_combinations = list(itertools.product(all_boolean, repeat=6)) +subset_combinations = random.sample(all_combinations, 20) + + +@pytest.mark.parametrize( + "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad", + subset_combinations, +) +def test_per_tensor_scaling( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad +): + if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]): + pytest.skip("Skipping test because all parameters are False") + run_per_tensor_scaling( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +PER_TENSOR_SCALING_CONFIG = Template( + """per_tensor_scaling_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + PerTensorScaling: + enabled: True + gemms_struct: +$gemms +""" +) + + +def _prepare_per_tensor_scaling_config( + fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad, config_file +): + gemms = "" + title = lambda x: f" - gemm: {x}\n tensors: [" + + def add_tensor(if_add, gemm_name): + nonlocal gemms + if if_add: + gemms += gemm_name + "," + + if 
fprop_inp or fprop_weight: + gemms += title("fprop") + add_tensor(fprop_inp, "activation") + add_tensor(fprop_weight, "weight") + gemms = gemms[:-1] + "]\n" + if dgrad_weight or dgrad_grad: + gemms += title("dgrad") + add_tensor(dgrad_weight, "weight") + add_tensor(dgrad_grad, "gradient") + gemms = gemms[:-1] + "]\n" + if wgrad_input or wgrad_grad: + gemms += title("wgrad") + add_tensor(wgrad_input, "activation") + add_tensor(wgrad_grad, "gradient") + gemms = gemms[:-1] + "]\n" + config_file.write(PER_TENSOR_SCALING_CONFIG.safe_substitute(gemms=gemms)) + config_file.flush() + + +def set_scaling_factors(model, input_kwargs, fp8_kwargs): + # Copy fp8 scaling factors into fp8_kwargs dict if respective flag in input_kwargs is set. + if not input_kwargs["fprop_inp"]: + fp8_kwargs["fprop_input_scale"] = model.fp8_meta["scaling_fwd"].scale[0].clone() + if not input_kwargs["fprop_weight"]: + fp8_kwargs["fprop_weight_scale"] = model.fp8_meta["scaling_fwd"].scale[1].clone() + if not input_kwargs["dgrad_grad"]: + fp8_kwargs["dgrad_gradient_scale"] = model.fp8_meta["scaling_bwd"].scale[0].clone() + if not input_kwargs["dgrad_weight"]: + fp8_kwargs["dgrad_weight_scale"] = model.fp8_meta["scaling_fwd"].scale[1].clone() + if not input_kwargs["wgrad_grad"]: + fp8_kwargs["wgrad_gradient_scale"] = model.fp8_meta["scaling_bwd"].scale[0].clone() + if not input_kwargs["wgrad_input"]: + fp8_kwargs["wgrad_input_scale"] = model.fp8_meta["scaling_fwd"].scale[0].clone() + + +def set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs): + # Compute per tensor scaling factor if respective flag in input_kwargs is set. + if input_kwargs["fprop_inp"]: + fp8_kwargs["fprop_input_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["fprop_weight"]: + fp8_kwargs["fprop_weight_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["dgrad_grad"]: + fp8_kwargs["dgrad_gradient_scale"] = tex.DType.kFloat8E5M2 + if input_kwargs["dgrad_weight"]: + fp8_kwargs["dgrad_weight_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["wgrad_grad"]: + fp8_kwargs["wgrad_gradient_scale"] = tex.DType.kFloat8E5M2 + if input_kwargs["wgrad_input"]: + fp8_kwargs["wgrad_input_scale"] = tex.DType.kFloat8E4M3 + + +@create_config_file +def run_per_tensor_scaling( + feature_dirs, + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + **kwargs, +): + input_kwargs = { + "fprop_inp": fprop_inp, + "fprop_weight": fprop_weight, + "dgrad_weight": dgrad_weight, + "dgrad_grad": dgrad_grad, + "wgrad_input": wgrad_input, + "wgrad_grad": wgrad_grad, + } + fp8_kwargs = { + "fprop_fp8": True, + "dgrad_fp8": True, + "wgrad_fp8": True, + } + """ + Runs a test to validate per-tensor (current) scaling in FP8 computations. + The function performs warm-up iterations to populate the amax buffer of the model and compute scaling factors based on delayed scaling. + Subsequently, weights and inputs are switched to ensure their current scaling factors differ from those based on delayed scaling; + similarly, the loss is multiplied by a large factor to alter the gradient's magnitude, + creating a discrepancy between the original (delayed) and per-tensor (current) scaling factors. 
+    Finally, a linear pass is emulated, and the results are compared.
+    """
+    _prepare_per_tensor_scaling_config(
+        fprop_inp,
+        fprop_weight,
+        dgrad_weight,
+        dgrad_grad,
+        wgrad_input,
+        wgrad_grad,
+        kwargs["config_file"],
+    )
+    _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs)
+
+    warmup_input, warmup_weight = _get_tensors()
+    model = _init_model(warmup_weight)
+
+    # Warmup run to setup amax and scaling factors.
+    for _ in range(AMAX_HISTORY_LEN):
+        _run_forward_backward(warmup_input, model)
+
+    x = torch.randn_like(warmup_input, requires_grad=True).cuda()
+    weight = torch.randn_like(warmup_weight, requires_grad=True).cuda()
+    model.weight.data = weight.data
+    x.retain_grad()
+
+    # Delayed scaling factors need to be collected
+    # before the forward pass with the test data,
+    # because that forward pass changes the scaling factors.
+    set_scaling_factors(model, input_kwargs, fp8_kwargs)
+
+    LOSS_MULTIPLIER = 100
+
+    with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+        y = model(x, is_first_microbatch=True)
+    model.zero_grad()
+    y.retain_grad()
+    (
+        LOSS_MULTIPLIER * y.sum()
+    ).backward()  # Loss multiplication to change gradient's order of magnitude
+
+    output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()}
+
+    # Per-tensor (current) scaling factors need to be
+    # collected after the forward pass with the test data,
+    # because the gradient (y.grad) cannot be accessed
+    # before that forward pass.
+    set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs)
+
+    ground_truth = _emulate_linear(x, weight, loss_multiplier=LOSS_MULTIPLIER, **fp8_kwargs)
+    _cmp(ground_truth, output)
+
+
+@pytest.mark.parametrize(
+    "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad",
+    subset_combinations,
+)
+def test_microbatching_per_tensor_scaling(
+    feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
+):
+    if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
+        pytest.skip("Skipping test because all parameters are False")
+
+    @create_config_file
+    def run_microbatching_test(
+        feature_dirs,
+        fprop_inp,
+        fprop_weight,
+        dgrad_weight,
+        dgrad_grad,
+        wgrad_input,
+        wgrad_grad,
+        **kwargs,
+    ):
+        # Prepare the configuration file
+        _prepare_per_tensor_scaling_config(
+            fprop_inp,
+            fprop_weight,
+            dgrad_weight,
+            dgrad_grad,
+            wgrad_input,
+            wgrad_grad,
+            kwargs["config_file"],
+        )
+
+        # Initialize debug
+        _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs)
+
+        # Get data
+        x_full, weight = _get_tensors()
+        microbatch_size = x_full.size(0) // 2
+        x_mb1 = x_full[:microbatch_size, ...].clone().detach().requires_grad_(True)
+        x_mb2 = x_full[microbatch_size:, ...].clone().detach().requires_grad_(True)
+
+        def init_and_warmup():
+            model = _init_model(weight)
+            _run_forward_backward(x_mb1, model, loss_scale=0.5)
+            _run_forward_backward(x_mb2, model, loss_scale=0.5)
+            return model
+
+        # Run without is_first_microbatch
+
+        model = init_and_warmup()  # running next 2 iters does not change amaxes and scaling factors
+        y_mb1 = _run_forward_backward(x_mb1, model, loss_scale=0.5)
+        y_mb2 = _run_forward_backward(x_mb2, model, loss_scale=0.5)
+
+        # Collect outputs
+        output1 = {
+            "activation": torch.cat([y_mb1.clone(), y_mb2.clone()], dim=0),
+            "wgrad": model.weight.grad.clone(),
+            "dgrad": torch.cat([x_mb1.grad.clone(), x_mb2.grad.clone()], dim=0),
+        }
+
+        # Run with is_first_microbatch
+        model = init_and_warmup()  #
running next 2 iters does not change amaxes and scaling factors + y_mb1 = _run_forward_backward(x_mb1, model, loss_scale=0.5, is_first_microbatch=True) + y_mb2 = _run_forward_backward(x_mb2, model, loss_scale=0.5, is_first_microbatch=False) + + # Collect outputs + output2 = { + "activation": torch.cat([y_mb1.clone(), y_mb2.clone()], dim=0), + "wgrad": model.weight.grad.clone(), + "dgrad": torch.cat([x_mb1.grad.clone(), x_mb2.grad.clone()], dim=0), + } + + # Compare outputs + torch.testing.assert_close(output1["activation"], output2["activation"], atol=1.0, rtol=0.5) + torch.testing.assert_close(output1["dgrad"], output2["dgrad"], atol=1.0, rtol=0.5) + torch.testing.assert_close(output1["wgrad"], output2["wgrad"], atol=1.0, rtol=0.5) + + # Run the test + run_microbatching_test( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +all_combinations = list( + itertools.product([tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None], repeat=6) +) +subset_combinations = random.sample(all_combinations, 10) + + +@pytest.mark.parametrize( + "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad", + subset_combinations, +) +def test_fake_quant_fp8( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad +): + run_fake_quant_fp8( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +FAKE_QUANT_CONFIG = Template( + """fake_quant_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + FakeQuant: + enabled: True + gemms_struct: +$gemms +""" +) + + +def fake_quant_fp8_create_config( + fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad, config_file +): + format_to_str = {tex.DType.kFloat8E4M3: "FP8E4M3", tex.DType.kFloat8E5M2: "FP8E5M2"} + gemms = "" + + def _add_tensor(quant_format, tensor): + nonlocal gemms + if quant_format: + gemms += " " * 8 + "- tensor: " + tensor + "\n" + gemms += " " * 8 + " quant_format: " + format_to_str[quant_format] + "\n" + + title = lambda x: f" - gemm: {x}\n tensors_struct:\n" + if fprop_inp or fprop_weight: + gemms += title("fprop") + _add_tensor(fprop_inp, "activation") + _add_tensor(fprop_weight, "weight") + gemms = gemms[:-1] + "\n" + if dgrad_weight or dgrad_grad: + gemms += title("dgrad") + _add_tensor(dgrad_weight, "weight") + _add_tensor(dgrad_grad, "gradient") + gemms = gemms[:-1] + "\n" + if wgrad_input or wgrad_grad: + gemms += title("wgrad") + _add_tensor(wgrad_input, "activation") + _add_tensor(wgrad_grad, "gradient") + gemms = gemms[:-1] + "\n" + config = FAKE_QUANT_CONFIG.safe_substitute(gemms=gemms) + config_file.write(config) + config_file.flush() + + +@create_config_file +def run_fake_quant_fp8( + feature_dirs, + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + **kwargs, +): + fp8_kwargs = { + "fprop_input_fake_quant": fprop_inp, + "fprop_weight_fake_quant": fprop_weight, + "dgrad_gradient_fake_quant": dgrad_grad, + "dgrad_weight_fake_quant": dgrad_weight, + "wgrad_gradient_fake_quant": wgrad_grad, + "wgrad_input_fake_quant": wgrad_input, + "fprop_fp8": not (fprop_inp or fprop_weight), + "dgrad_fp8": not (dgrad_weight or dgrad_grad), + "wgrad_fp8": not (wgrad_grad or wgrad_input), + } + fake_quant_fp8_create_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + + x, weight = 
_get_tensors() + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + ground_truth = _emulate_linear(x, weight, **fp8_kwargs) + _cmp(ground_truth, output) diff --git a/tests/pytorch/debug/test_sanity.py b/tests/pytorch/debug/test_sanity.py new file mode 100644 index 000000000..6b0883b14 --- /dev/null +++ b/tests/pytorch/debug/test_sanity.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import functools +import itertools +import os +import random +import tempfile +from string import Template + +import pytest +import torch + +import nvdlfw_inspect.api as debug_api +import transformer_engine.debug +import transformer_engine.pytorch as te +import transformer_engine_torch as tex +from transformer_engine.common.recipe import DelayedScaling, Format +from transformer_engine.pytorch.constants import TE_DType +from transformer_engine.pytorch.fp8 import _default_sf_compute +from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer + +from test_numerics import create_config_file + +B, S, H, D = 64, 64, 64, 64 + +model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"] + +configs = { + "": "", + "log": """log: + layers: + layer_types: [linear] + enabled: + True + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation, gradient, weight, output, wgrad, dgrad] + stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range] + start_step : 0 + end_step: 1 + LogFp8TensorStats: + enabled: True + tensors: [activation, gradient, weight] + stats: [underflows, overflows] + start_step : 0 + end_step: 1 +""", + "fake_quant": """ +fake_quant_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + FakeQuant: + enabled: True + gemms: [fprop, dgrad, wgrad] + quant_format: FP8E5M2 +""", +} + + +def _get_model(model_key): + if model_key == "linear": + return te.Linear(D, D) + if model_key == "layernorm_linear": + return te.LayerNormLinear(D, D) + if model_key == "layernorm_mlp": + return te.LayerNormMLP(D, D, D) + if model_key == "mha_attention": + return te.MultiheadAttention(D, H) + if model_key == "transformer_layer": + return te.TransformerLayer(D, D, H) + + +def _run_forward_backward(model, fp8): + for _ in range(3): + inp = torch.randn((S, B, H)).cuda() + with te.fp8_autocast(enabled=fp8): + out = model(inp) + out.sum().backward() + debug_api.step() + + +@create_config_file +def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir): + try: + if config != "": + config_file.write(config) + config_file.flush() + config_file_name = config_file.name if config != "" else "" + debug_api.initialize(feature_dirs=feature_dirs, config_file=config_file_name) + model = _get_model(model_key) + _run_forward_backward(model, fp8) + except Exception as error: + raise error + finally: + debug_api.end_debug() + + +@pytest.mark.parametrize("model_key", model_keys) +@pytest.mark.parametrize("fp8", [False, True]) +@pytest.mark.parametrize("config_key", configs.keys()) +def test_sanity_debug(model_key, fp8, config_key, feature_dirs): + _run_test(model_key, fp8, configs[config_key], feature_dirs) diff --git a/tests/pytorch/debug/utils.py b/tests/pytorch/debug/utils.py new file mode 100644 index 000000000..f03ee56b5 --- /dev/null +++ b/tests/pytorch/debug/utils.py @@ -0,0 +1,22 @@ +# Copyright (c) 
2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+
+LOG_FILE = os.path.join("nvdlfw_inspect_logs", "nvdlfw_inspect_globalrank-0.log")
+
+
+def reset_debug_log():
+    if os.path.isfile(LOG_FILE):
+        # delete all content
+        with open(LOG_FILE, "w") as f:
+            pass
+
+
+def check_debug_log(msg):
+    with open(LOG_FILE, "r") as f:
+        for line in f.readlines():
+            if msg in line:
+                return True
+    return False
diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
index ac72960c4..a505d0179 100644
--- a/tests/pytorch/distributed/run_numerics.py
+++ b/tests/pytorch/distributed/run_numerics.py
@@ -40,6 +40,18 @@
 LOSS_FN = nn.MSELoss()
 QUANTIZATION = None
 
+if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False):
+    # The numerics of all the layers should work the same
+    # when debug=True. They are fed with a dummy feature
+    # to prevent debug from being switched off, which can
+    # happen if no feature is active.
+    import nvdlfw_inspect.api as debug_api
+
+    debug_api.initialize(
+        os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
+        feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
+    )
+
 
 # Disable TF32
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index 3b56796cc..6d9a4412e 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -120,6 +120,20 @@ def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq
 mask_types = ["causal", "no_mask"]
 
+NVTE_TEST_NVINSPECT_ENABLED = os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False)
+
+if NVTE_TEST_NVINSPECT_ENABLED:
+    # The numerics of all the layers should work the same
+    # when debug=True. They are fed with a dummy feature
+    # to prevent debug from being switched off, which can
+    # happen if no feature is active.
+ import nvdlfw_inspect.api as debug_api + + debug_api.initialize( + os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"], + feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"], + ) + fp8_recipes = [ recipe.MXFP8BlockScaling(), recipe.DelayedScaling(), @@ -621,6 +635,8 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_m pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -741,6 +757,8 @@ def test_gpt_full_activation_recompute( use_cast_transpose_triton = bool( int(os.environ.get('NVTE_USE_CAST_TRANSPOSE_TRITON', '0')) ) if fp8 and recipe.float8_current_scaling() and use_cast_transpose_triton: pytest.skip("Float8 Current Scaling unsupported for full recompute.") + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -1957,6 +1975,8 @@ def test_grouped_linear_accuracy( pytest.skip(reason_for_no_fp8) if fp8 and recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -2155,6 +2175,8 @@ def test_padding_grouped_linear_accuracy( pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -2276,6 +2298,8 @@ def test_gpt_cuda_graph(dtype, bs, model): if use_fa: pytest.skip(f"ROCm flash attention does not support cuda graph with {dtype}") + if NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("Cuda Graphs are not supported in debug mode.") config = model_configs[model] sigma = 0.023 @@ -2373,6 +2397,8 @@ def test_gpt_fp8_parameters(dtype, bs, model, recipe): pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) diff --git a/transformer_engine/debug/features/api.py b/transformer_engine/debug/features/api.py index 887043c42..13ab6040d 100644 --- a/transformer_engine/debug/features/api.py +++ b/transformer_engine/debug/features/api.py @@ -12,7 +12,7 @@ import torch from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS -from transformer_engine.pytorch.tensor import all_tensor_types +from transformer_engine.pytorch.tensor import get_all_tensor_types from transformer_engine.debug.pytorch.debug_state import TEDebugState from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor @@ -424,7 +424,7 @@ def output_assertions_hook(self, api_name, ret, **kwargs): if api_name in ["inspect_tensor", "inspect_tensor_postquantize"]: assert ret is None if api_name == "modify_tensor": - 
assert type(ret) in all_tensor_types + assert type(ret) in get_all_tensor_types() if ( type(ret) == torch.Tensor # pylint: disable=unidiomatic-typecheck and "dtype" in kwargs @@ -438,4 +438,4 @@ def step(self): def end_debug(self): """This function is called by the nvidia-dlframework-inspect after every debug_api.end_debug()""" - TEDebugState.reset() + TEDebugState._reset() diff --git a/transformer_engine/debug/features/fake_quant.py b/transformer_engine/debug/features/fake_quant.py index bab4b4dcf..4a5b6c34a 100644 --- a/transformer_engine/debug/features/fake_quant.py +++ b/transformer_engine/debug/features/fake_quant.py @@ -49,7 +49,7 @@ def fake_quantize(tensor: torch.Tensor, fp8_format: tex.DType, out=None): fp8_dtype = tex.DType.kFloat8E5M2 amax = tensor.abs().max().float() one = torch.ones(1, device=tensor.device) - scale = _default_sf_compute(amax, one, fp8_max) + scale = _default_sf_compute(amax, one, fp8_max, 0) quantizer = Float8Quantizer(scale, amax, fp8_dtype) else: diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index 4ca2a8ed3..e5c84a9bd 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -120,7 +120,6 @@ def inspect_tensor_postquantize( if not rowwise: return # tensor was already seen rowwise in the other gemm - tensor = tensor._data options = ( config.get("start_step", None), config.get("end_step", None), diff --git a/transformer_engine/debug/features/per_tensor_scaling.py b/transformer_engine/debug/features/per_tensor_scaling.py index eabb6304a..d648b517d 100644 --- a/transformer_engine/debug/features/per_tensor_scaling.py +++ b/transformer_engine/debug/features/per_tensor_scaling.py @@ -15,6 +15,7 @@ from transformer_engine.pytorch.tensor import Quantizer from transformer_engine.pytorch.tensor.float8_tensor import ( Float8Tensor, + Float8Quantizer, Float8CurrentScalingQuantizer, ) from transformer_engine.debug.features.api import TEConfigAPIMapper @@ -39,7 +40,7 @@ def per_tensor_cast( }, "[NVTORCH INSPECT ERROR] Only 2 FP8 types: E4M3 and E5M2 are supported in TE." tensor = tensor.contiguous() - quantizer = Float8CurrentScalingQuantizer(fp8_dtype) + quantizer = Float8CurrentScalingQuantizer(fp8_dtype, device=tensor.device) if out is not None: quantizer.update_quantized(tensor, out) @@ -118,7 +119,7 @@ def modify_tensor( if key not in ["gemm", "tensor"]: raise ValueError(f'[NVTORCH INSPECT ERROR] Unexpected key in config: "{key}".') - assert isinstance(default_quantizer, Float8CurrentScalingQuantizer), ( + assert isinstance(default_quantizer, Float8Quantizer), ( f"[NVTORCH INSPECT ERROR] Feature={self.__class__.__name__}, API=process_tensor: " "Per-tensor current scaling can be used only within `DelayedScaling` recipe autocast." 
f" {layer_name}" diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index 84a740161..d111e4890 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -96,7 +96,10 @@ def _get(buffers, stat_name): "max": (torch.max, lambda buffers: max(_get(buffers, "max"))), "sum": (torch.sum, lambda buffers: sum(_get(buffers, "sum"))), "mean": (torch.mean, lambda buffers: sum(_get(buffers, "sum")) / sum(_get(buffers, "numel"))), - "numel": (lambda x: x.numel(), lambda buffers: sum(_get(buffers, "numel"))), + "numel": ( + lambda x: x.numel() if hasattr(x, "numel") else x.get_data_tensors()[0].numel(), + lambda buffers: sum(_get(buffers, "numel")), + ), "l1_norm": (lambda x: torch.norm(x, p=1), lambda buffers: sum(_get(buffers, "l1_norm"))), "l2_norm_square": ( lambda x: torch.sum(x**2), @@ -137,7 +140,7 @@ def _get(buffers, stat_name): - min(_get(buffers, "dynamic_range_bottom")), ), "underflows%": ( - lambda x: (x == 0).sum() / x.numel() * 100, + lambda x: (x.get_data_tensors()[0] == 0).sum() / x.get_data_tensors()[0].numel() * 100, lambda buffers: 100 * sum(_get(buffers, "underflows_num")) / sum(_get(buffers, "numel")), ), } diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py index 4a7a156a0..b725d3ab3 100644 --- a/transformer_engine/debug/pytorch/debug_quantization.py +++ b/transformer_engine/debug/pytorch/debug_quantization.py @@ -18,6 +18,7 @@ from transformer_engine.pytorch.tensor.quantized_tensor import ( QuantizedTensor, Quantizer, + QuantizedTensorBase, prepare_for_saving, restore_from_saved, ) @@ -299,8 +300,9 @@ def quantize( iteration=self.iteration, dtype=dtype, ) - if columnwise_gemm_tensor.dtype != dtype: - raise ValueError("Dtype does not match the output of the modify_tensor call") + if dtype is not None: + if columnwise_gemm_tensor.dtype != dtype: + raise ValueError("Dtype does not match the output of the modify_tensor call") if self.rowwise_tensor_plan == API_CALL_MODIFY: rowwise_gemm_tensor = debug_api.transformer_engine.modify_tensor( layer_name=self.layer_name, @@ -311,8 +313,9 @@ def quantize( iteration=self.iteration, dtype=dtype, ) - if rowwise_gemm_tensor.dtype != dtype: - raise ValueError("Dtype does not match the output of the modify_tensor call") + if dtype is not None: + if rowwise_gemm_tensor.dtype != dtype: + raise ValueError("Dtype does not match the output of the modify_tensor call") # 3. If some tensors still are not defined we use high precision tensor. if self.rowwise_tensor_plan == HIGH_PRECISION: @@ -332,6 +335,7 @@ def quantize( quantizer=self, layer_name=self.layer_name, tensor_name=self.tensor_name, + original_tensor=tensor, ) def process_gemm_output(self, tensor: torch.Tensor): @@ -456,7 +460,7 @@ def any_feature_enabled(self) -> bool: return False -class DebugQuantizedTensor: +class DebugQuantizedTensor(QuantizedTensorBase): """ Class containing quantized tensors after debug. Depending on configuration it can contain one or two different objects. 
These objects can be accessed by the method @@ -470,6 +474,7 @@ def __init__( quantizer, layer_name=None, tensor_name=None, + original_tensor=None, ): self.rowwise_gemm_tensor = rowwise_gemm_tensor @@ -477,6 +482,7 @@ def __init__( self.quantizer = quantizer self._layer_name = layer_name self._tensor_name = tensor_name + self._original_tensor = original_tensor def prepare_for_saving(self): """ " Prepare for saving method override""" @@ -524,5 +530,5 @@ def size(self): """Size of the tensor.""" return self.rowwise_gemm_tensor.size() - def update_usage(self, rowwise_usage: bool, columnwise_usage: bool): + def update_usage(self, rowwise_usage: bool = None, columnwise_usage: bool = None): """Update usage of the tensor.""" diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index ea601397a..1d788148d 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -1243,12 +1243,18 @@ def gather_along_first_dim( final_quantizer = ( None if not needs_quantized_gemm(inp, rowwise=True) else quantizer.parent_quantizer ) + # Temporary fix for TP communication of Float8BlockwiseQTensorBase + if isinstance(rowwise, Float8BlockwiseQTensorBase): + rowwise = inp._original_tensor rowwise_total = gather_along_first_dim(rowwise, process_group, False, final_quantizer)[0] out_obj.rowwise_gemm_tensor = rowwise_total if rowwise is not columnwise: final_quantizer_columnwise = ( None if not needs_quantized_gemm(inp, rowwise=False) else quantizer.parent_quantizer ) + # Temporary fix for TP communication of Float8BlockwiseQTensorBase + if isinstance(columnwise, Float8BlockwiseQTensorBase): + columnwise = inp._original_tensor columnwise_total, _ = gather_along_first_dim( columnwise, process_group, False, final_quantizer_columnwise ) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index a8b110690..d999efa3c 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -1072,7 +1072,12 @@ def grad_output_preprocess( if ( isinstance( grad_output_.get_tensor(True), - (QuantizedTensor, Float8TensorBase, MXFP8TensorBase), + ( + QuantizedTensor, + Float8TensorBase, + MXFP8TensorBase, + Float8BlockwiseQTensorBase, + ), ) and ctx.use_bias ): diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index a31823641..53f399d3d 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -205,6 +205,7 @@ def forward( # or if a gather of ln_out must be in high precision. with_quantized_norm = ( fp8 + and not debug and not return_layernorm_output and not return_layernorm_output_gathered and not force_hp_blockwise_ln_out_gather From 74525d1291a12c6c10b463b059395e6de533a829 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Mon, 19 May 2025 14:25:36 -0700 Subject: [PATCH 02/26] Fix README render for uploading package to PyPI (#1798) * Fix README render on PyPI Signed-off-by: Kirthi Shankar Sivamani * Update README.rst Signed-off-by: Kirthi Shankar Sivamani * Use anonymous hyperlink for duplicate. Fix indent. 
Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- README.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 49e19bd7e..09f204f68 100644 --- a/README.rst +++ b/README.rst @@ -450,7 +450,7 @@ Installation ============ System Requirements -^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^ * **Hardware:** Blackwell, Hopper, Grace Hopper/Blackwell, Ada, Ampere @@ -468,10 +468,10 @@ System Requirements * **Notes:** FP8 features require Compute Capability 8.9+ (Ada/Hopper/Blackwell) Installation Methods -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ Docker (Recommended) -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ The quickest way to get started with Transformer Engine is by using Docker images on `NVIDIA GPU Cloud (NGC) Catalog `_. @@ -496,7 +496,7 @@ Where 25.04 (corresponding to April 2025 release) is the container version. * NGC PyTorch 23.08+ containers include FlashAttention-2 pip Installation -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^ **Prerequisites for pip installation:** @@ -534,7 +534,7 @@ Source Installation `See the installation guide `_ Environment Variables -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^ These environment variables can be set before installation to customize the build process: * **CUDA_PATH**: Path to CUDA installation @@ -545,7 +545,7 @@ These environment variables can be set before installation to customize the buil * **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job Compiling with FlashAttention -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Transformer Engine supports both FlashAttention-2 and FlashAttention-3 in PyTorch for improved performance. FlashAttention-3 was added in release v1.11 and is prioritized over FlashAttention-2 when both are present in the environment. You can verify which FlashAttention version is being used by setting these environment variables: @@ -557,8 +557,9 @@ You can verify which FlashAttention version is being used by setting these envir It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug `_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue. .. 
troubleshooting-begin-marker-do-not-remove + Troubleshooting -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^ **Common Issues and Solutions:** @@ -692,7 +693,7 @@ Papers Videos ====== -* `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 `_ +* `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 `__ * `Blackwell Numerics for AI | GTC 2025 `_ * `Building LLMs: Accelerating Pretraining of Foundational Models With FP8 Precision | GTC 2025 `_ * `From FP8 LLM Training to Inference: Language AI at Scale | GTC 2025 `_ From cea11527603aad6277a6d7fa4fa4b4de500fb433 Mon Sep 17 00:00:00 2001 From: Evgeny Tsykunov Date: Mon, 19 May 2025 23:25:57 +0200 Subject: [PATCH 03/26] Enhance recipe compatibility (#1724) * Check tensor-recipe compatibility Signed-off-by: Evgeny Tsykunov * Tensor class in recipe, checking for *Base Signed-off-by: Evgeny Tsykunov * Extend recipe __repr__ with recipe_type Signed-off-by: Evgeny Tsykunov * Warn about recipe change Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Enable dynamic recipe change: clear fp8 workspace Signed-off-by: Evgeny Tsykunov * TE 1.x checkpoint compatibility Signed-off-by: Evgeny Tsykunov * Disable warning for recipe wrappers Signed-off-by: Evgeny Tsykunov * Test recipe change Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use QuantizedTensorBase Signed-off-by: Evgeny Tsykunov * Fix circular import Signed-off-by: Evgeny Tsykunov * Revert previous circular import fix Signed-off-by: Evgeny Tsykunov * Fix pytorch imports in common Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Let quantizer know about the recipe Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix imports Signed-off-by: Evgeny Tsykunov --------- Signed-off-by: Evgeny Tsykunov Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Przemyslaw Tredak Co-authored-by: Kirthi Shankar Sivamani --- tests/pytorch/test_recipe.py | 105 +++++++++++++++++- .../common/gemm/cublaslt_gemm.cu | 3 +- transformer_engine/common/recipe/__init__.py | 9 +- .../debug/pytorch/debug_quantization.py | 6 +- transformer_engine/pytorch/module/base.py | 63 ++++++++++- .../pytorch/tensor/float8_blockwise_tensor.py | 8 +- .../pytorch/tensor/float8_tensor.py | 13 ++- .../pytorch/tensor/mxfp8_tensor.py | 8 +- .../pytorch/tensor/quantized_tensor.py | 5 + 9 files changed, 210 insertions(+), 10 deletions(-) diff --git a/tests/pytorch/test_recipe.py b/tests/pytorch/test_recipe.py index 02ff9367a..8d379be7c 100644 --- a/tests/pytorch/test_recipe.py +++ b/tests/pytorch/test_recipe.py @@ -8,22 +8,32 @@ import pytest import torch +import warnings import transformer_engine.common.recipe import transformer_engine.pytorch as te +from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockQuantizer +from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Quantizer import transformer_engine_torch as tex from transformer_engine.pytorch.fp8 import ( FP8GlobalStateManager, _amax_and_scale_update, - get_default_fp8_recipe, + fp8_model_init, ) from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer import transformer_engine.pytorch.ops as te_ops from transformer_engine.pytorch.utils import is_fp8_fnuz +from 
transformer_engine.pytorch import Linear +from transformer_engine.pytorch.distributed import fp8_autocast +from transformer_engine.common.recipe import DelayedScaling, Float8BlockScaling, MXFP8BlockScaling import transformer_engine_torch as tex # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() +mxfp8_available, reason_for_no_mxfp8 = FP8GlobalStateManager.is_mxfp8_available() +fp8_block_scaling_available, reason_for_no_fp8_block_scaling = ( + FP8GlobalStateManager.is_fp8_block_scaling_available() +) # FP8 per tensor delayed scaling @@ -370,3 +380,96 @@ def setup_fp8_meta(): ) torch.testing.assert_close(fp8_meta[forward_key].scale, expected_scale) + + @pytest.mark.parametrize( + "model_init_recipe", + [ + pytest.param( + MXFP8BlockScaling(), + marks=pytest.mark.skipif(not mxfp8_available, reason=reason_for_no_mxfp8), + ), + pytest.param( + Float8BlockScaling(), + marks=pytest.mark.skipif( + not fp8_block_scaling_available, reason=reason_for_no_fp8_block_scaling + ), + ), + ], + ) + def test_check_for_weight_tensor_and_recipe_correspondence(self, model_init_recipe): + with fp8_model_init(enabled=True, recipe=model_init_recipe): + linear = Linear(32, 32).cuda() + + x = torch.randn(32, 32, device="cuda") + with fp8_autocast(enabled=True, fp8_recipe=DelayedScaling()): + with pytest.raises(RuntimeError) as excinfo: + _ = linear(x) + assert "Recipe mismatch for " in str(excinfo.value) + + @pytest.mark.parametrize( + "target_recipe_class, expected_quantizer_type, available_flag, reason", + [ + pytest.param( + MXFP8BlockScaling, + MXFP8Quantizer, + mxfp8_available, + reason_for_no_mxfp8, + id="DelayedScaling->MXFP8BlockScaling", + ), + pytest.param( + Float8BlockScaling, + Float8BlockQuantizer, + fp8_block_scaling_available, + reason_for_no_fp8_block_scaling, + id="DelayedScaling->Float8BlockScaling", + ), + ], + ) + def test_dynamic_recipe_update( + self, target_recipe_class, expected_quantizer_type, available_flag, reason + ): + if not available_flag: + pytest.skip(reason) + + in_features = 32 + out_features = 32 + batch_size = 32 + linear = Linear(in_features, out_features).cuda() + initial_recipe = DelayedScaling() + + # Run initial iterations with DelayedScaling + for _ in range(3): + x = torch.randn(batch_size, in_features, device="cuda") + with fp8_autocast(enabled=True, fp8_recipe=initial_recipe): + y = linear(x) + loss = y.mean() + loss.backward() + + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, Float8Quantizer) + + # Change recipe + target_recipe = target_recipe_class() + + # Run subsequent iterations with the target recipe + for i in range(3): + x = torch.randn(batch_size, in_features, device="cuda") + if i == 0: + # Expect a warning on the first iteration with the new recipe + with pytest.warns(UserWarning, match="Recipe type changed"): + with fp8_autocast(enabled=True, fp8_recipe=target_recipe): + y = linear(x) + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, expected_quantizer_type) + else: + # No warning expected on subsequent iterations + with warnings.catch_warnings(): + warnings.simplefilter("error") # Raise error if unexpected warning occurs + with fp8_autocast(enabled=True, fp8_recipe=target_recipe): + y = linear(x) + loss = y.mean() + loss.backward() + + # Final check + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, expected_quantizer_type) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu 
b/transformer_engine/common/gemm/cublaslt_gemm.cu index 36cbcd330..07b256972 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -94,7 +94,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla A.scaling_mode == B.scaling_mode || (A.scaling_mode == NVTE_BLOCK_SCALING_1D && B.scaling_mode == NVTE_BLOCK_SCALING_2D) || (A.scaling_mode == NVTE_BLOCK_SCALING_2D && B.scaling_mode == NVTE_BLOCK_SCALING_1D), - "Inputs A and B to GEMM need to have compatible scaling modes!"); + "Inputs A and B to GEMM need to have compatible scaling modes, but got A.scaling_mode = " + + to_string(A.scaling_mode) + ", B.scaling_mode = " + to_string(B.scaling_mode)); NVTE_CHECK(A.has_data() || A.has_columnwise_data(), "Input A does not hold any data!"); NVTE_CHECK(B.has_data() || B.has_columnwise_data(), "Input B does not hold any data!"); GemmParam ret; diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 9426d1621..1cf974987 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -196,6 +196,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"margin={self.margin}, " f"format={str(self.fp8_format).split('.')[1]}, " f"amax_history_len={self.amax_history_len}, " @@ -261,6 +262,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"format={str(self.fp8_format).split('.')[1]}, " f"fp8_quant_fwd_inp={self.fp8_quant_fwd_inp}, " f"fp8_quant_fwd_weight={self.fp8_quant_fwd_weight}, " @@ -307,7 +309,11 @@ def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." 
def __repr__(self) -> str: - return f"margin={self.margin}, format={str(self.fp8_format).split('.')[1]}," + return ( + f"recipe_type={self.__class__.__name__}, " + f"margin={self.margin}, " + f"format={str(self.fp8_format).split('.')[1]}" + ) @dataclass() @@ -391,6 +397,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"format={str(self.fp8_format).split('.')[1]}, " f"fp8_quant_fwd_inp={self.fp8_quant_fwd_inp}, " f"fp8_quant_fwd_weight={self.fp8_quant_fwd_weight}, " diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py index b725d3ab3..4d61757e1 100644 --- a/transformer_engine/debug/pytorch/debug_quantization.py +++ b/transformer_engine/debug/pytorch/debug_quantization.py @@ -14,7 +14,7 @@ import transformer_engine_torch as tex - +from transformer_engine.common.recipe import Recipe from transformer_engine.pytorch.tensor.quantized_tensor import ( QuantizedTensor, Quantizer, @@ -459,6 +459,10 @@ def any_feature_enabled(self) -> bool: return True return False + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + """Probably not needed for debug quantizer""" + return None + class DebugQuantizedTensor(QuantizedTensorBase): """ diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index d999efa3c..1672bc6bd 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -51,7 +51,7 @@ from ..utils import is_non_tn_fp8_gemm_supported from ..tensor._internal.float8_blockwise_tensor_base import Float8BlockwiseQTensorBase -from ...common.recipe import Recipe +from ...common.recipe import DelayedScaling, Recipe from ...debug.pytorch.debug_state import TEDebugState from ...debug.pytorch.debug_quantization import DebugQuantizer, DebugQuantizedTensor @@ -826,6 +826,14 @@ def set_extra_state(self, state: Optional[torch.Tensor]) -> None: if state is None: return + # TE 1.x checkpoint compatibility: add DelayedScaling recipe if missing + if "recipe" not in state: + # TE 1.x only supported delayed scaling, which was the default recipe + state["recipe"] = DelayedScaling() + # TE 1.x also saved scale_inv, which is not needed with Recipe object + state.pop("scale_inv_fwd", None) + state.pop("scale_inv_bwd", None) + # Load extra items self.fp8_meta.update(state["extra_fp8_variables"]) self.fp8_meta["recipe"] = state["recipe"] @@ -899,6 +907,8 @@ def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: # assume FP8 execution. def init_fp8_metadata(self, num_gemms: int = 1) -> None: """Initialize fp8 related metadata and tensors during fprop.""" + _original_recipe = self.fp8_meta.get("recipe", None) + self.fp8_parameters = FP8GlobalStateManager.with_fp8_parameters() self.fp8 = FP8GlobalStateManager.is_fp8_enabled() self.fp8_calibration = FP8GlobalStateManager.is_fp8_calibration() @@ -937,6 +947,19 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None: self.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe() + _current_recipe = self.fp8_meta["recipe"] + if _original_recipe is not None and not ( + issubclass(_current_recipe.__class__, _original_recipe.__class__) + or issubclass(_original_recipe.__class__, _current_recipe.__class__) + ): + warnings.warn( + f"Recipe type changed from {_original_recipe.__class__.__name__} " + f"to {_current_recipe.__class__.__name__}. " + "This may affect model behavior." 
+ ) + # Clear cached workspaces as they were created with the old recipe/quantizer type + self._fp8_workspaces.clear() + @contextmanager def prepare_forward( self, @@ -961,6 +984,7 @@ def prepare_forward( self.set_activation_dtype(inp) self.init_fp8_metadata(num_gemms=num_gemms) + self._check_weight_tensor_recipe_correspondence() if self.fp8 and self.sequence_parallel and self.fp8_meta["recipe"].delayed(): assert self.fp8_meta["recipe"].reduce_amax, ( @@ -1385,6 +1409,43 @@ def _validate_name(self): ) self.name = f"Layer_{TEDebugState.get_layer_count()}" + def _check_weight_tensor_recipe_correspondence(self) -> None: + """ + Verify that the weight tensor types match their corresponding recipe type. + This is invoked in the forward(). + + This establishes a 1:1 correspondence between recipe types and tensor types: + - DelayedScaling → Float8Tensor + - Float8CurrentScaling → Float8Tensor + - MXFP8BlockScaling → MXFP8Tensor + - Float8BlockScaling → Float8BlockTensor + + Example case to check: recipe is DelayedScaling (DelayedScaling is set in fp8_autocast()), + but the weight tensor is MXFP8Tensor (MXFP8BlockScaling is set in fp8_model_init()). + """ + if not self.fp8 and not self.fp8_calibration: + return + if not hasattr(self, "weight_names") or not self.weight_names: + return + + recipe = self.fp8_meta["recipe"] + weight_tensors = [getattr(self, name) for name in self.weight_names] + for i, tensor in enumerate(weight_tensors): + if isinstance(tensor, QuantizedTensorBase): + quantizer = tensor._get_quantizer() + if quantizer is None: + continue + compatible_recipe_class = quantizer._get_compatible_recipe() + if compatible_recipe_class is None: + continue + if not isinstance(recipe, compatible_recipe_class): + raise RuntimeError( + f"Recipe mismatch for '{self.weight_names[i]}': tensor supports recipe" + f" {compatible_recipe_class.__name__}, but got {recipe.__class__.__name__}." + " Please check the recipes assigned during fp8_model_init() and" + " fp8_autocast() calls." + ) + def _turn_off_unsupported_features_in_debug(self): if ( getattr(self, "ub_bulk_wgrad", False) diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py index ce4137c66..4ab04da83 100644 --- a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py +++ b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py @@ -4,13 +4,14 @@ """Tensor class with FP8 data quantized with NxN tiles""" from __future__ import annotations -from typing import Optional, Tuple, Iterable +from typing import Optional, Tuple, Iterable, Union import math import torch import transformer_engine_torch as tex - from transformer_engine_torch import DType as TE_DType + +from transformer_engine.common.recipe import Float8BlockScaling, Recipe from ._internal.float8_blockwise_tensor_base import Float8BlockwiseQTensorBase from .quantized_tensor import QuantizedTensor, Quantizer, _IdentityFunc from ..utils import devices_match, round_up_to_nearest_multiple @@ -229,6 +230,9 @@ def calibrate(self, tensor: torch.Tensor) -> None: # where state from an estimator influences distribution parameters. pass + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + return Float8BlockScaling + class Float8BlockwiseQTensor(Float8BlockwiseQTensorBase, QuantizedTensor): """Tensor class with FP8 data quantized via NxN blocks or 1xN blocks. 
diff --git a/transformer_engine/pytorch/tensor/float8_tensor.py b/transformer_engine/pytorch/tensor/float8_tensor.py
index b55ac577c..fa8e29283 100644
--- a/transformer_engine/pytorch/tensor/float8_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_tensor.py
@@ -6,15 +6,16 @@
 """Tensor class with FP8 data"""
 
 from __future__ import annotations
 import os
-from typing import Optional, Tuple, Iterable
+from typing import Optional, Tuple, Iterable, Union
 import warnings
 from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
 import torch
 import transformer_engine_torch as tex
-
 from transformer_engine_torch import DType as TE_DType
+
+from transformer_engine.common.recipe import DelayedScaling, Float8CurrentScaling, Recipe
 from ..utils import canonicalize_process_group, devices_match
 from ._internal.float8_tensor_base import Float8TensorBase, _FromFloat8Func
 from .quantized_tensor import QuantizedTensor, Quantizer, _IdentityFunc
@@ -177,6 +182,9 @@ def create_tensor_from_data(
             quantizer=self,
         )
 
+    def _get_compatible_recipe(self) -> Union[type[Recipe], None]:
+        return DelayedScaling
+
 
 class Float8CurrentScalingQuantizer(Quantizer):
     """Builder class for FP8 tensors with per-tensor current scaling
@@ -339,6 +347,9 @@ def _canonicalized_amax_reduction_group(self) -> dist_group_type:
         """Get process group for amax reduction"""
         return canonicalize_process_group(self.amax_reduction_group)
 
+    def _get_compatible_recipe(self) -> Union[type[Recipe], None]:
+        return Float8CurrentScaling
+
 
 class Float8Tensor(Float8TensorBase, QuantizedTensor):
     """Experimental tensor class with FP8 data
diff --git a/transformer_engine/pytorch/tensor/mxfp8_tensor.py b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
index 920b7d6b0..8f3c73eb9 100644
--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -7,16 +7,17 @@
 from collections.abc import Iterable
 import math
 import os
-from typing import Optional, Tuple
 from torch.utils.cpp_extension import IS_HIP_EXTENSION
+from typing import Optional, Tuple, Union
 
 import torch
 if IS_HIP_EXTENSION:
     from ..triton_kernels.cast import te_quantize_triton
 
 import transformer_engine_torch as tex
-
 from transformer_engine_torch import DType as TE_DType
+
+from transformer_engine.common.recipe import MXFP8BlockScaling, Recipe
 from ..constants import MXFP8_BLOCK_SCALING_SIZE
 from ..utils import devices_match, round_up_to_nearest_multiple
@@ -145,6 +146,9 @@ def calibrate(self, tensor: torch.Tensor) -> None:
         # TODO(ksivamani): No calibration needed for mxfp8?
pass + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + return MXFP8BlockScaling + class MXFP8Tensor(MXFP8TensorBase, QuantizedTensor): """Experimental tensor class with FP8 data diff --git a/transformer_engine/pytorch/tensor/quantized_tensor.py b/transformer_engine/pytorch/tensor/quantized_tensor.py index e521d4279..9b0adcc22 100644 --- a/transformer_engine/pytorch/tensor/quantized_tensor.py +++ b/transformer_engine/pytorch/tensor/quantized_tensor.py @@ -17,6 +17,7 @@ from torch.utils._pytree import tree_map import transformer_engine_torch as tex +from transformer_engine.common.recipe import Recipe class QuantizedTensorBase: @@ -242,6 +243,10 @@ def copy(self) -> Quantizer: """Create shallow copy""" return copy.copy(self) + @abc.abstractmethod + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + """Returns recipe class that is compatible with this quantizer""" + class _QuantizeFunc(torch.autograd.Function): """Cast to FP8 from other dtype""" From 610c3937d0af085c296528652404d85b68416ef5 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 20 May 2025 10:47:47 -0600 Subject: [PATCH 04/26] Use an empty torch tensor to indicate no fp8 information in extra_state (#1799) * Use an empty torch tensor to indicate no fp8 information in extra_state Signed-off-by: Peter St. John * Add huggingface from_pretrained / save_pretrained tests Adds integration tests to ensure models containing TransformerLayer objects can be saved and loaded using the from_pretrained and save_pretrained methods. Signed-off-by: Peter St. John --------- Signed-off-by: Peter St. John Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_pytorch_unittest/test.sh | 1 + setup.py | 2 +- tests/pytorch/test_hf_integration.py | 40 +++++++++++++++++++++++ transformer_engine/pytorch/module/base.py | 12 +++---- 4 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 tests/pytorch/test_hf_integration.py diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 79f3c8fb9..ea5236502 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -44,6 +44,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entro NVTE_FLASH_ATTN=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || test_fail "test_fused_attn.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/fused_attn/test_kv_cache.py || test_fail "test_kv_cache.py" +python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py" if [ "$RET" -ne 0 ]; then echo "Error in the following test cases:$FAILED_CASES" diff --git a/setup.py b/setup.py index 41893644c..0012844a8 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ) # Blackwell is not supported as of Triton 3.2.0, need custom internal build # install_reqs.append("triton") - test_reqs.extend(["numpy", "torchvision"]) + test_reqs.extend(["numpy", "torchvision", "transformers"]) if "jax" in frameworks: if rocm_build(): from build_tools.jax import jax_install_requires diff --git a/tests/pytorch/test_hf_integration.py b/tests/pytorch/test_hf_integration.py new file mode 
100644 index 000000000..0b2468510 --- /dev/null +++ b/tests/pytorch/test_hf_integration.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import pytest +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel + +from transformer_engine.pytorch.transformer import TransformerLayer +from transformer_engine.pytorch.utils import is_bf16_compatible + + +class SimpleTEModel(PreTrainedModel): + config_class = PretrainedConfig + + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.my_layer = TransformerLayer( + hidden_size=320, + num_attention_heads=16, + ffn_hidden_size=1024, + layer_number=None, + ) + + def forward(self, hidden_states, attention_mask): + return self.my_layer(hidden_states, attention_mask) + + +def test_save_hf_model(tmp_path): + model = SimpleTEModel(PretrainedConfig()) + model.save_pretrained(tmp_path / "simple_te_model") + + +@pytest.mark.xfail(reason="This test is failing until huggingface/transformers#38155 is merged.") +def test_save_and_load_hf_model(tmp_path): + model = SimpleTEModel(PretrainedConfig()) + model.save_pretrained(tmp_path / "simple_te_model") + del model + model = SimpleTEModel.from_pretrained(tmp_path / "simple_te_model") + assert model is not None diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 1672bc6bd..e86ccd172 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -746,7 +746,7 @@ def reset(key): reset("scaling_fwd") reset("scaling_bwd") - def get_extra_state(self) -> Optional[torch.Tensor]: + def get_extra_state(self) -> torch.Tensor: """Save before checkpointing.""" # This implementation is working around a few issues: @@ -781,7 +781,7 @@ def to_cpu(src: torch.Tensor) -> torch.Tensor: state = None fp8_checkpoint = self.fp8_meta["fp8_checkpoint"] or self.fp8 or self.fp8_calibration if not fp8_checkpoint: - return None + return torch.empty(0, dtype=torch.uint8) # Copy tensors to CPU and store state = {} @@ -807,13 +807,13 @@ def to_cpu(src: torch.Tensor) -> torch.Tensor: state_serialized = torch.frombuffer(state_serialized, dtype=torch.uint8) return state_serialized - def set_extra_state(self, state: Optional[torch.Tensor]) -> None: + def set_extra_state(self, state: torch.Tensor) -> None: """Load previous state.""" - if state is None: - return - # Load state if isinstance(state, torch.Tensor): + # No FP8 is indicated by an empty tensor we don't need to unpickle. 
+ if state.numel() == 0: + return # Default format: byte tensor with pickled data state = pickle.loads(state.detach().cpu().numpy().tobytes()) elif isinstance(state, io.BytesIO): From c5ea9eb7d6500067683dcc4822da389d3ce9408a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Tue, 20 May 2025 22:42:29 +0200 Subject: [PATCH 05/26] =?UTF-8?q?[Pytorch]=20NVIDIA-DL-Framework-Inspect?= =?UTF-8?q?=20support=20=E2=80=93=20part=204=20=E2=80=93=20documentation?= =?UTF-8?q?=20(#1611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs drop Signed-off-by: Pawel Gadzinski * a Signed-off-by: Pawel Gadzinski * fix Signed-off-by: Pawel Gadzinski * Update docs/debug/1_getting_started.rst Co-authored-by: Przemyslaw Tredak Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> * Update docs/debug/1_getting_started.rst Co-authored-by: Przemyslaw Tredak Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> * fixes Signed-off-by: Pawel Gadzinski * fix imgs Signed-off-by: Pawel Gadzinski --------- Signed-off-by: Pawel Gadzinski Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> Co-authored-by: Przemyslaw Tredak --- docs/debug.rst | 14 + docs/debug/1_getting_started.rst | 241 ++++++++++++++++++ docs/debug/2_config_file_structure.rst | 241 ++++++++++++++++++ docs/debug/3_api_debug_setup.rst | 87 +++++++ docs/debug/3_api_features.rst | 14 + docs/debug/3_api_te_calls.rst | 45 ++++ docs/debug/4_distributed.rst | 91 +++++++ docs/debug/api.rst | 13 + docs/debug/img/api_calls1.svg | 1 + docs/debug/img/api_calls2.svg | 1 + docs/debug/img/fake_quant.svg | 1 + docs/debug/img/introduction.svg | 1 + docs/debug/img/names.svg | 1 + docs/debug/img/pipeline_logging.svg | 1 + docs/debug/img/reduction1.svg | 1 + docs/debug/img/reduction2.svg | 1 + docs/debug/img/reduction3.svg | 1 + docs/debug/img/scaling_factors.svg | 1 + docs/debug/img/tensorboard.png | Bin 0 -> 123093 bytes docs/index.rst | 1 + qa/L0_pytorch_lint/test.sh | 2 +- .../debug/features/per_tensor_scaling.py | 1 - 22 files changed, 758 insertions(+), 2 deletions(-) create mode 100644 docs/debug.rst create mode 100644 docs/debug/1_getting_started.rst create mode 100644 docs/debug/2_config_file_structure.rst create mode 100644 docs/debug/3_api_debug_setup.rst create mode 100644 docs/debug/3_api_features.rst create mode 100644 docs/debug/3_api_te_calls.rst create mode 100644 docs/debug/4_distributed.rst create mode 100644 docs/debug/api.rst create mode 100644 docs/debug/img/api_calls1.svg create mode 100644 docs/debug/img/api_calls2.svg create mode 100644 docs/debug/img/fake_quant.svg create mode 100644 docs/debug/img/introduction.svg create mode 100644 docs/debug/img/names.svg create mode 100644 docs/debug/img/pipeline_logging.svg create mode 100644 docs/debug/img/reduction1.svg create mode 100644 docs/debug/img/reduction2.svg create mode 100644 docs/debug/img/reduction3.svg create mode 100644 docs/debug/img/scaling_factors.svg create mode 100644 docs/debug/img/tensorboard.png diff --git a/docs/debug.rst b/docs/debug.rst new file mode 100644 index 000000000..d33568ea3 --- /dev/null +++ b/docs/debug.rst @@ -0,0 +1,14 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. +Precision debug tools +============================================== + +.. 
toctree:: + :caption: Precision debug tools + + debug/1_getting_started.rst + debug/2_config_file_structure.rst + debug/api + debug/4_distributed.rst \ No newline at end of file diff --git a/docs/debug/1_getting_started.rst b/docs/debug/1_getting_started.rst new file mode 100644 index 000000000..bc2b95057 --- /dev/null +++ b/docs/debug/1_getting_started.rst @@ -0,0 +1,241 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Getting started +============== + +.. note:: + + Precision debug tools with `Nvidia-DL-Framework-Inspect `_ for Transformer Engine are currently supported only for PyTorch. + +Transformer Engine provides a set of precision debug tools which allow you to easily: + +- log the statistics for each of the tensors in every matrix multiply (GEMM) operation, +- run selected GEMMs in higher precision, +- run current scaling - with one scaling factor per tensor - for particular GEMMs, +- test new precisions and integrate them with FP8 training, +- ... and many more. + +There are 4 things one needs to do to use Transformer Engine debug features: + +1. Create a configuration YAML file to configure the desired features. +2. Import, and initialize the `Nvidia-DL-Framework-Inspect `_ tool, which is installed as the dependency of the Transformer Engine. +3. One can pass ``name="..."`` when creating TE layers to easier identify layer names. If this is not provided, names will be inferred automatically. +4. Invoke ``debug_api.step()`` at the end of one forward-backward pass. + +To start debugging, one needs to create a configuration YAML file. This file lists the features to be used in particular layers. There are 2 kinds of features: + +- provided by the Transformer Engine - for example, DisableFP8GEMM or LogTensorStats - they are listed in the :doc:`debug features API <3_api_features>` section +- defined by the user. For details on how to create a custom feature - please read the :doc:`calls to Nvidia-DL-Framework-Inspect <3_api_te_calls>` section. + +.. figure:: ./img/introduction.svg + :align: center + + Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 3 TE Linear Layers. + ``config.yaml`` contains the specification of the features used for each Linear layer. Some feature classes are provided by TE, + one - ``UserProvidedPrecision`` - is a custom feature implemented by the user. Nvidia-DL-Framework-Inspect inserts features into the layers according to the config. + +Example training script +---------------------- + +Let's look at a simple example of training a Transformer layer using Transformer Engine with FP8 precision. This example demonstrates how to set up the layer, define an optimizer, and perform a few training iterations using synthetic data. + +.. 
code-block:: python + + # train.py + + from transformer_engine.pytorch import TransformerLayer + import torch + import torch.nn as nn + import torch.optim as optim + import transformer_engine.pytorch as te + + hidden_size = 512 + num_attention_heads = 8 + + transformer_layer = TransformerLayer( + hidden_size=hidden_size, + ffn_hidden_size=hidden_size, + num_attention_heads=num_attention_heads + ).cuda() + + dummy_input = torch.randn(10, 32, hidden_size).cuda() + criterion = nn.MSELoss() + optimizer = optim.Adam(transformer_layer.parameters(), lr=1e-4) + dummy_target = torch.randn(10, 32, hidden_size).cuda() + + for epoch in range(5): + transformer_layer.train() + optimizer.zero_grad() + with te.fp8_autocast(enabled=True): + output = transformer_layer(dummy_input) + loss = criterion(output, dummy_target) + loss.backward() + optimizer.step() + +We will demonstrate two debug features on the code above: + +1. Disabling FP8 precision for specific GEMM operations, such as the FC1 and FC2 forward propagation GEMM. +2. Logging statistics for other GEMM operations, such as gradient statistics for data gradient GEMM within the LayerNormLinear sub-layer of the TransformerLayer. + +Config file +---------- + +We need to prepare the configuration YAML file, as below + +.. code-block:: yaml + + # config.yaml + + fc1_fprop_to_fp8: + enabled: True + layers: + layer_types: [fc1, fc2] # contains fc1 or fc2 in name + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [fprop] + + log_tensor_stats: + enabled: True + layers: + layer_types: [layernorm_linear] # contains layernorm_linear in name + transformer_engine: + LogTensorStats: + enabled: True + stats: [max, min, mean, std, l1_norm] + tensors: [activation] + freq: 1 + start_step: 2 + end_step: 5 + +Further explanation on how to create config files is in the :doc:`next part of the documentation <2_config_file_structure>`. + +Adjusting Python file +-------------------- + +.. code-block:: python + + # (...) + + import nvdlfw_inspect.api as debug_api + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log", + default_logging_enabled=True) + + # initialization of the TransformerLayer with the name + transformer_layer = TransformerLayer( + name="transformer_layer", + # ...) + + # (...) + for epoch in range(5): + # forward and backward pass + # ... + debug_api.step() + +In the modified code above, the following changes were made: + +1. Added an import for ``nvdlfw_inspect.api``. +2. Initialized the Nvidia-DL-Framework-Inspect by calling ``debug_api.initialize()`` with appropriate configuration, specifying the path to the config file, feature directories, and log directory. +3. Added ``debug_api.step()`` after each of the forward-backward pass. + +Inspecting the logs +------------------ + +Let's look at the files with the logs. Two files will be created: + +1. debug logs. +2. statistics logs. + +Let's look inside them! + +In the main log file, you can find detailed information about the transformer layer's GEMMs behavior. You can see that ``fc1`` and ``fc2`` fprop GEMMs are run in high precision, as intended. + +.. code-block:: text + + # log/nvdlfw_inspect_logs/nvdlfw_inspect_globalrank-0.log + + INFO - Default logging to file enabled at ./log + INFO - Reading config from ./config.yaml. + INFO - Loaded configs for dict_keys(['fc1_fprop_to_fp8', 'log_tensor_stats']). 
+ INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: activation, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: weight, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: activation, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: weight, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: activation, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: weight, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: activation, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: weight, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Feature=LogTensorStats, API=look_at_tensor_before_process: activation + .... + +The second log file (``nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``) contains statistics for tensors we requested in ``config.yaml``. + +.. 
code-block:: text + + # log/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log + + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000002 value=4.3188 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000002 value=-4.3386 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000002 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000002 value=0.9998 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000002 value=130799.6953 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000003 value=4.3184 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000003 value=-4.3381 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000003 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000003 value=0.9997 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000003 value=130788.1016 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000004 value=4.3181 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000004 value=-4.3377 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000004 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000004 value=0.9996 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000004 value=130776.7969 + +Logging using TensorBoard +------------------------ + +Precision debug tools support logging using `TensorBoard `_. To enable it, one needs to pass the argument ``tb_writer`` to the ``debug_api.initialize()``. Let's modify ``train.py`` file. + +.. code-block:: python + + # (...) + + from torch.utils.tensorboard import SummaryWriter + tb_writer = SummaryWriter('./tensorboard_dir/run1') + + # add tb_writer to the Debug API initialization + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log", + tb_writer=tb_writer) + + # (...) + +Let's run training and open TensorBoard by ``tensorboard --logdir=./tensorboard_dir/run1``: + +.. figure:: ./img/tensorboard.png + :align: center + + Fig 2: TensorBoard with plotted stats. \ No newline at end of file diff --git a/docs/debug/2_config_file_structure.rst b/docs/debug/2_config_file_structure.rst new file mode 100644 index 000000000..f1069b0c8 --- /dev/null +++ b/docs/debug/2_config_file_structure.rst @@ -0,0 +1,241 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Config File Structure +==================== + +To enable debug features, create a configuration YAML file to specify the desired behavior, such as determining which GEMMs (General Matrix Multiply operations) should run in higher precision rather than FP8 and defining which statistics to log. +Below, we outline how to structure the configuration YAML file. + +General Format +------------- + +A config file can have one or more sections, each containing settings for specific layers and features: + +.. code-block:: yaml + + section_name_1: + enabled: ... + layers: + # Specify layers here... + transformer_engine: + Feature1Name: + enabled: ... + # Feature details... + Feature2Name: + enabled: ... + # Feature details... 
+ + section_name_2: + enabled: ... + layers: + # Specify layers here... + Feature1Name: # If feature has no namespace, then it is in the default namespace. + enabled: ... + # Feature details... + + section_name_3: + enabled: ... + layers: + # Specify layers here... + transformer_engine: + Feature1Name: + enabled: ... + # Feature details... + Feature2Name: + enabled: ... + # Feature details... + +Sections may have any name and must contain: + +1. An ``enabled`` field that specifies whether the features in that section will be active. +2. A ``layers`` field specifying which layers the section applies to. Each layer can belong to only one section. +3. Additional fields describing features for those layers. + +Layer Specification +------------------ + +Debug layers can be identified by a ``name`` parameter: + +.. code-block:: python + + linear = transformer_engine.debug.pytorch.Linear(in_features, out_features, name="linear1") + +This name is used in the config file to identify the layer. To specify the ``layers`` field, you can use one of the following methods: + +1. ``layer_name_regex_pattern``: Use a regular expression to match layer names. This expression must adhere to the Python ``re`` module syntax. +2. ``layer_types``: Provide a list of strings, where a layer will be selected if any string matches part of its name. + +Examples: + +.. code-block:: yaml + + # Example 1: Using regular expression to select layers + my_section: + enabled: ... + layers: + layer_name_regex_pattern: 'self_attn.*' + transformer_engine: + (...) + + # Example 2: Using layer type to select layers + another_section: + enabled: ... + layers: + layer_types: ['fc1', 'layernorm_linear'] + transformer_engine: + (...) + +Names in Transformer Layers +-------------------------- + +There are three ways to assign a name to a layer in the Transformer Engine: + +- Initialize the layer with the ``name=...`` argument. +- Use ``debug_api.infer_and_assign_layer_names(model)``, which assigns names based on class names. +- Rely on the default names assigned during module initialization, such as ``Layer_n``, where ``n`` represents the layer number. + +The ``TransformerLayer`` in Transformer Engine is a composition of multiple sub-layers. We can modify some of these layers using precision debug tools, particularly those that contain exactly one linear layer. To see the names of all such layers, we can inspect log files. For instance, a ``TransformerLayer`` named ``transformer_layer`` might consist of: + +- ``transformer_layer.self_attn.layernorm_linear_qkv`` / ``transformer_layer.self_attn.linear_qkv`` / ``transformer_layer.self_attn.layernorm_linear_q`` / ``transformer_layer.self_attn.linear_q`` / ``transformer_layer.self_attn.linear_kv``, +- ``transformer_layer.self_attn.proj``, +- ``transformer_layer.inter_attn.*`` for ``layer_type="decoder"``, +- ``transformer_layer.layernorm_mlp.fc1``, +- ``transformer_layer.layernorm_mlp.fc2``, + +depending on the configuration. Some layers, like ``LayerNormLinear``, are fusions of two layers: ``LayerNorm`` and ``Linear``. When referring to such layers in precision debug tools, only the ``Linear`` part is affected. + +Below is an example ``TransformerLayer`` with four linear layers that can be influenced by the precision debug tools. + +.. figure:: ./img/names.svg + :align: center + :width: 80% + + Fig 1: Names of layers in an example configuration of TransformerLayer. The most nested blocks represent the most basic layers, each containing one linear layer. 
Layers that do not contain linear layers, such as ``DotProductAttention``, are omitted. + +**Configuration File Example** + +.. code-block:: yaml + + # Disables wgrad in all 4 GEMMs + section1: + enabled: True + layers: + layer_types: [transformer_layer] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [wgrad] + + # Disables all GEMMs in layernorm_mlp layer + section2: + enabled: True + layers: + layer_types: [layernorm_mlp] + transformer_engine: + DisableFP8Layer: + enabled: True + + # Logs wgrad stats in fc1 + section3: + enabled: True + layers: + layer_types: [fc1] + transformer_engine: + LogTensorStats: + enabled: True + stats: [min] + tensors: [wgrad] + freq: 1 + start_step: 0 + end_step: 50 + + +Structured Configuration for GEMMs and Tensors +--------------------------------------------- + +Sometimes a feature is parameterized by a list of tensors or by a list of GEMMs. +There are multiple ways of describing this parameterization. + +We can pass lists, as below. + +.. code-block:: yaml + + Feature: + enabled: ... + gemms: [gemm1, gemm2] + tensors: [tensor1, tensor2] + ... + +We can use struct for tensors. + +.. code-block:: yaml + + Feature: + gemms: [gemm1, gemm2] + tensors_struct: + - tensor: tensor1 + feature_param1: value + - tensor: tensor2 + feature_param1: value + gemm_feature_param1: value + +Similarly, we can use struct for GEMMs. + +.. code-block:: yaml + + Feature: + enabled: ... + tensors: [tensor1, tensor2] + gemms_struct: + - gemm: gemm1 + feature_param1: value + - gemm: gemm2 + feature_param1: value + gemm_feature_param1: value + +We can use both structs for tensors and GEMMs. The tensors_struct should be nested inside gemms_struct. + +.. code-block:: yaml + + Feature: + enabled: ... + gemms_struct: + - gemm: gemm1 + tensors: [tensor1, tensor2] + tensor_feature_param1: value + gemm_feature_param1: value + - gemm: gemm2 + tensors_struct: + - tensor: tensor1 + tensor_feature_param1: value + - tensor: tensor2 + tensor_feature_param2: value + gemm_feature_param1: value + +Enabling or Disabling Sections and Features +------------------------------------------ + +Debug features can be enabled or disabled with the ``enabled`` keyword: + +.. code-block:: yaml + + section1: + enabled: True + layers: + layer_types: [self_attention] + transformer_engine: + LogTensorStats: + enabled: False # Disables the LogTensorStats feature + stats: [max, min, mean, std, l1_norm] + + section2: + enabled: False # Disables entire section2 + transformer_engine: + LogFp8TensorStats: + enabled: True # Does not enable the LogFp8TensorStats feature, because section2 is disabled + stats: [underflows, overflows] + +By organizing your ``config.yaml`` properly, you can easily manage debugging features, ensuring a more streamlined and customizable debugging experience. diff --git a/docs/debug/3_api_debug_setup.rst b/docs/debug/3_api_debug_setup.rst new file mode 100644 index 000000000..bda8f096d --- /dev/null +++ b/docs/debug/3_api_debug_setup.rst @@ -0,0 +1,87 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Setup +===== + +Precision debug tools for the Transformer Engine use `Nvidia-DL-Framework-Inspect `_ package from NVIDIA. +Please refer to the Nvidia-DL-Framework-Inspect `documentation `_ for more details. +Below, we outline the steps for debug initialization. + +initialize() +----------- + +Must be called once on every rank in the global context to initialize Nvidia-DL-Framework-Inspect. 
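+
+For instance, in a multi-GPU job each rank calls ``initialize()`` once with the same arguments. Below is a minimal sketch (assuming a ``torchrun`` launch; the config and feature-directory paths are placeholders):
+
+.. code-block:: python
+
+    # launched with: torchrun --nproc_per_node=8 train.py
+    import torch.distributed as dist
+    import nvdlfw_inspect.api as debug_api
+
+    # torchrun sets the rendezvous environment variables for init_process_group
+    dist.init_process_group(backend="nccl")
+
+    # every rank reads the same YAML config and registers the same feature directories
+    debug_api.initialize(
+        config_file="./config.yaml",  # placeholder path
+        feature_dirs=["/path/to/transformer_engine/debug/features"],  # placeholder path
+        log_dir="./log",
+        default_logging_enabled=True,
+    )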
+ +**Parameters** + +- **config_file** (*str*, default=""): Path to the configuration YAML file containing features to enable and layer names. If one wants to run without the configuration file, pass ``""``. +- **feature_dirs** (*List[str] | str*): List of directories containing features to load and register. One needs to pass ``[/path/to/transformerengine/transformer_engine/debug/features]`` to use TE features. +- **logger** (*Union[BaseLogger, None]*, default=None): Logger for logging tensor statistics. Should adhere to ``BaseLogger`` from the `Nvidia-DL-Framework-Inspect `_ package. +- **log_dir** (*str*, default= "."): Directory path to hold ``debug_logs`` and ``debug_statistics_logs``. +- **tb_writer** (*TensorBoardWriter*, default=None): TensorBoard writer for logging. +- **default_logging_enabled** (*bool*, default=False): Enable default logging to the file. + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log_dir") + +set_tensor_reduction_group() +-------------------------- + +Needed only for logging tensor stats. In multi-GPU training, activation and gradient tensors are distributed across multiple nodes. This method lets you specify the group for the reduction of stats; see the `reduction group section <./4_distributed.rst#reduction-groups>`_ for more details. + +If the tensor reduction group is not specified, then statistics are reduced across all nodes in the run. + +**Parameters** + +- **group** (torch.distributed.ProcessGroup): The process group across which tensors will be reduced to get stats. + + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + + # initialization + # (...) + + pipeline_parallel_group = initialize_pipeline_parallel_group() + + debug_api.set_tensor_reduction_group(pipeline_parallel_group) + + # training + # (...) + # activation/gradient tensor statistics are reduced along pipeline_parallel_group + +set_weight_tensor_tp_group_reduce() +--------------------------------- + +By default, weight tensor statistics are reduced within the tensor parallel group. This function allows you to disable that behavior; for more details, see `reduction group section <./4_distributed.rst#reduction-groups>`_. + +This method is not provided by the ``debug_api``, but by the ``transformer_engine.debug``. + +**Parameters** + +- **enabled** (*bool*, default=True): A boolean flag to enable or disable the reduction of weight tensor statistics within the tensor parallel group. + + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + from transformer_engine.debug import set_weight_tensor_tp_group_reduce + + # initialization + # (...) + + set_weight_tensor_tp_group_reduce(False) + + # training + # (...) + # weight tensor statistics are not reduced diff --git a/docs/debug/3_api_features.rst b/docs/debug/3_api_features.rst new file mode 100644 index 000000000..b31c437b2 --- /dev/null +++ b/docs/debug/3_api_features.rst @@ -0,0 +1,14 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Debug features +========== + +.. autoapiclass:: transformer_engine.debug.features.log_tensor_stats.LogTensorStats +.. autoapiclass:: transformer_engine.debug.features.log_fp8_tensor_stats.LogFp8TensorStats +.. autoapiclass:: transformer_engine.debug.features.disable_fp8_gemm.DisableFP8GEMM +.. 
autoapiclass:: transformer_engine.debug.features.disable_fp8_layer.DisableFP8Layer +.. autoapiclass:: transformer_engine.debug.features.per_tensor_scaling.PerTensorScaling +.. autoapiclass:: transformer_engine.debug.features.fake_quant.FakeQuant diff --git a/docs/debug/3_api_te_calls.rst b/docs/debug/3_api_te_calls.rst new file mode 100644 index 000000000..eb66c8ff2 --- /dev/null +++ b/docs/debug/3_api_te_calls.rst @@ -0,0 +1,45 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Calls to Nvidia-DL-Framework-Inspect +==================================== +Let's look deeper into how Nvidia-DL-Framework-Inspect with Transformer Engine work together. TransformerEngine layers have some hook calls inside each of the GEMMs. Users can define feature classes or use feature classes provided with TE. File ``config.yaml`` describes which hooks need to be used for which layers. Nvidia-DL-Framework-Inspect combines 3 things: TE training, feature classes and ``config.yaml`` and takes care of inserting hooks in the correct places. This process is illustrated in the image below. + +.. figure:: ./img/api_calls1.svg + :align: center + + Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 1 Linear Layer. For tensors mentioned in ``config.yaml``, behavior of ``modify_tensor_enabled()`` and ``modify_tensor()`` calls are substituted with definitions from the feature class. Other calls return default values - in fact they do nothing. + +In this page, all calls from TransformerEngine to the Nvidia-DL-Framework-Inspect for each GEMM are listed. The order of these calls is illustrated in the image below. + +.. figure:: ./img/api_calls2.svg + :align: center + + Fig 2: The calls to Nvidia-DL-Framework-Inspect done for Transformer Engine. There are 2 types of calls: GEMM calls and routing calls. + + +There are 2 categories of API calls, each is used for different purposes: + +- GEMM calls - invoked during every GEMM, used to process or quantize tensors and collect information about them, +- Routing calls - invoked at the beginning of every forward pass - they indicate whether a feature is going to use `modify_tensor()`, etc. + +If all routing calls for the layer return `False`, then the layer is invoked in an optimized version with Transformer Engine fusions. +If any of the routing calls return `True`, layers are run without the fusions. This is necessary because otherwise some tensors cannot be accessed +if fusions happen. An important remark is that if no feature is used for the layer, then it should perform as fast as the layer without initializing `debug_api`. + + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor_enabled + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.fp8_gemm_enabled + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_enabled + +.. 
autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize_enabled diff --git a/docs/debug/4_distributed.rst b/docs/debug/4_distributed.rst new file mode 100644 index 000000000..6f69f2712 --- /dev/null +++ b/docs/debug/4_distributed.rst @@ -0,0 +1,91 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Distributed training +=================== + +Nvidia-Pytorch-Inspect with Transformer Engine supports multi-GPU training. This guide describes how to run it and how the supported features work in the distributed setting. + +To use precision debug tools in multi-GPU training, one needs to: + +1. Run ``debug_api.initialize(...)`` and provide the same configuration YAML file on every node. +2. If one wants to log stats, one may want to invoke ``debug_api.set_tensor_reduction_group`` with a proper reduction group. + +Behavior of the features +----------------------- + +In a distributed setting, **DisableFP8GEMM** and **DisableFP8Layer** function similarly to the single-GPU case, with no notable differences. + +**PerTensorScaling** and **FakeQuant** calculate FP8 scaling factors independently on each node, meaning the number of GPUs may affect results. This differs from the delayed scaling FP8 recipe behavior, in which scaling factors are synchronized. + +.. figure:: ./img/scaling_factors.svg + :align: center + + Fig 1: For **PerTensorScaling** and **FakeQuant** tensor scaling factors are computed separately for each of the tensor shards. This is not the case for delayed scaling FP8 scaling factors, which are synchronized. + +Logging-related features are more complex and will be discussed further in the next sections. + +Reduction groups +-------------- + +In setups with tensor, data, or pipeline parallelism, some tensors are distributed across multiple GPUs, requiring a reduction operation to compute statistics for these tensors. + +The weight tensor is always split among the tensor parallel group, and debug tools automatically reduce statistics within this group by default. To disable this automatic reduction, use: + +.. code-block:: python + + transformer_engine.debug.set_weight_tensor_tp_group_reduce(False) + +In cases of data parallelism, Transformer Engine modules lack the process group needed for reduction. To manually specify the group, use: + +.. code-block:: python + + debug_api.set_tensor_reduction_group(group) + +This command ensures statistics are reduced across the defined group. Activation statistics are logged after the forward pass (immediately after exiting autocast), while gradient (dgrad and wgrad) statistics are logged following the backward pass. + +Below, we illustrate configurations for a 4-node setup with tensor parallelism size 2 and data parallelism size 2, showcasing different reduction configurations. + +.. figure:: ./img/reduction1.svg + :align: center + + Fig 2: There is a single tensor reduction group composed of all nodes. As a result, each node logs the same statistics for the tensors, as they are fully reduced across all nodes. + +.. figure:: ./img/reduction2.svg + :align: center + + Fig 3: Every node is set with a tensor reduction group consisting of itself. Every node prints the same statistics for weights (which are still synchronized within TP groups), but the statistics of activations and gradients are not synchronized. + +.. 
figure:: ./img/reduction3.svg + :align: center + + Fig 4: Weight synchronization is disabled by ``set_weight_tensor_tp_group_reduce(False)``, so every node logs stats for its shard of the weight. + + +Microbatching +----------- + +Let's dive into how statistics collection works with microbatching. By microbatching, we mean invoking multiple ``forward()`` calls for each ``debug_api.step()``. The behavior is as follows: + +- For weight tensors, the stats remain the same for each microbatch because the weight does not change. +- For other tensors, the stats are accumulated. + +Logging to files and TensorBoard +------------------------------ + +In a single-node setup with ``default_logging_enabled=True``, all logs are saved by default to ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``. In multi-GPU training, each node writes its reduced statistics to its unique file, named ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-i.log`` for rank i. Because these logs contain reduced statistics, the logged values are identical for all nodes within a reduction group. + +If certain nodes are given a TensorBoard writer, only those nodes will log to TensorBoard. This is useful in scenarios involving pipeline, data, and tensor parallelism, such as with two transformer layers and settings TP_SIZE = 2, DP_SIZE = 2, and PP_SIZE = 2. To log all stats to TensorBoard, you should pass a TensorBoard writer to one process in each pipeline parallel group. + +.. figure:: ./img/pipeline_logging.svg + :align: center + + Fig 5: Example with pipeline parallelism, where a ``tb_writer`` is assigned to one node within each pipeline parallel group, setting these as tensor reduction groups. + +Alternatively, setting the tensor reduction group to None will yield unreduced statistics for wgrad and dgrad tensors on each node, allowing for post-processing. For weight statistics without reduction in the TP parallel group, use: + +.. code-block:: python + + transformer_engine.debug.set_weight_tensor_tp_group_reduce(False) \ No newline at end of file diff --git a/docs/debug/api.rst b/docs/debug/api.rst new file mode 100644 index 000000000..ac593d353 --- /dev/null +++ b/docs/debug/api.rst @@ -0,0 +1,13 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. +API +============ + +.. 
toctree:: + :caption: Precision debug tools API + + 3_api_debug_setup.rst + 3_api_features.rst + 3_api_te_calls.rst \ No newline at end of file diff --git a/docs/debug/img/api_calls1.svg b/docs/debug/img/api_calls1.svg new file mode 100644 index 000000000..098f384b2 --- /dev/null +++ b/docs/debug/img/api_calls1.svg @@ -0,0 +1 @@ +te.LinearLinear1Nvidia-DLFramework-Inspectconfig.yamlSection1:enabled: Truelayer_names: [Linear1]UserProvidedPrecision:enabled: Truegemms_struct:-gemm: frop-tensors: [activation, output]-gemm: dgrad-tensors: [weight]FeatureclassesUserProvidedPrecisionFPROPWGRADDGRADmodify_tensor_enabledDefaultmodify_tensormodify_tensor_enabledmodify_tensor \ No newline at end of file diff --git a/docs/debug/img/api_calls2.svg b/docs/debug/img/api_calls2.svg new file mode 100644 index 000000000..5df72fc2e --- /dev/null +++ b/docs/debug/img/api_calls2.svg @@ -0,0 +1 @@ +Tensor Ainspect_tensorfp8 castmodify_tensorinspect_tensor_postquantizeGEMMinspect_tensormodify_tensorinspect_tensor_enabledinspect_tensor_postquantize_enabledfp8_gemm_enabledmodify_tensor_enabledTensor Binspect_tensorfp8 castmodify_tensorinspect_tensor_postquantizeRouting callsGEMM calls \ No newline at end of file diff --git a/docs/debug/img/fake_quant.svg b/docs/debug/img/fake_quant.svg new file mode 100644 index 000000000..3ba6973d5 --- /dev/null +++ b/docs/debug/img/fake_quant.svg @@ -0,0 +1 @@ +FP8 GEMMBF16weightBF16inputFP8inputFP8weightBF16activationBF16 GEMMBF16weightBF16inputBF16activationBF16 Inputfake quantizedto FP8FP8inputBF16 Inputfake quantizedto FP8 \ No newline at end of file diff --git a/docs/debug/img/introduction.svg b/docs/debug/img/introduction.svg new file mode 100644 index 000000000..0eae8e820 --- /dev/null +++ b/docs/debug/img/introduction.svg @@ -0,0 +1 @@ +te.LinearLinear1Nvidia-DLFramework-InspectDisableFp8LayerLogTensorStatsconfig.yamlte.LinearLinear2DisableFp8LayerLogTensorStatsSection1:enabled: Truelayer_names: [Linear1, Linear2]DisableFp8Layer:enabled: TrueSection2:enabled: Truelayer_names: [Linear2]LogTensorStats:enabled: TrueotherparamsSection3:enabled: Truelayer_names: [Linear3]UserProvidedPrecision:enabled: Truete.LinearLinear3FeatureclassesDisableFp8LayerUserProvidedPrecisionUserProvidedPrecisionProvidedby the Transformer EngineUser candefinecustomfeatureclasses \ No newline at end of file diff --git a/docs/debug/img/names.svg b/docs/debug/img/names.svg new file mode 100644 index 000000000..3990939e7 --- /dev/null +++ b/docs/debug/img/names.svg @@ -0,0 +1 @@ +Transformer Layer with name transformer_layertransformer_layer.self_attntransformer_layer.self_attn.projtransformer_layer.self_attn.layernorm_linear_qkvtransformer_layer.layernorm_mlptransformer_layer.layernorm_mlp.fc1transformer_layer.layernorm_mlp.fc21 Linear1 Linear1 Linear1 Linear \ No newline at end of file diff --git a/docs/debug/img/pipeline_logging.svg b/docs/debug/img/pipeline_logging.svg new file mode 100644 index 000000000..b87254315 --- /dev/null +++ b/docs/debug/img/pipeline_logging.svg @@ -0,0 +1 @@ +Node 1Node 2Node 3Node 4Node 5Node 6Node 7Node 8TensorBoard logstb_writertb_writertensor reduction group 1=pipeline parallel group 1tensor reduction group 2=pipeline parallel group 2 \ No newline at end of file diff --git a/docs/debug/img/reduction1.svg b/docs/debug/img/reduction1.svg new file mode 100644 index 000000000..184799d53 --- /dev/null +++ b/docs/debug/img/reduction1.svg @@ -0,0 +1 @@ +Node 1Node 2Node 3Node 4TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 
2StatsStatsStatsStats \ No newline at end of file diff --git a/docs/debug/img/reduction2.svg b/docs/debug/img/reduction2.svg new file mode 100644 index 000000000..36f94611e --- /dev/null +++ b/docs/debug/img/reduction2.svg @@ -0,0 +1 @@ +TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 2StatsStatsStatsStatsTensor reduction groupTensor reduction groupTensor reduction groupNode 1Node 2Node 3Node 4 \ No newline at end of file diff --git a/docs/debug/img/reduction3.svg b/docs/debug/img/reduction3.svg new file mode 100644 index 000000000..601fb8502 --- /dev/null +++ b/docs/debug/img/reduction3.svg @@ -0,0 +1 @@ +TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 2StatsStatsStatsStatsNode 1Node 2Node 3Node 4 \ No newline at end of file diff --git a/docs/debug/img/scaling_factors.svg b/docs/debug/img/scaling_factors.svg new file mode 100644 index 000000000..b70b51e66 --- /dev/null +++ b/docs/debug/img/scaling_factors.svg @@ -0,0 +1 @@ +One Scaling FactorScaling Factor No. 1Scaling Factor No. 2Node 1Node 2NodeOne Scaling FactorOne Scaling FactorNode 1Node 2PerTensorScalingandFakeQuantFP8 Delayed Scaling \ No newline at end of file diff --git a/docs/debug/img/tensorboard.png b/docs/debug/img/tensorboard.png new file mode 100644 index 0000000000000000000000000000000000000000..481dbd2eb9844f82e35d9a0446ebcf8df40ca6b7 GIT binary patch literal 123093 zcmeFZXIN9))-VbvqA0?arl5jukq(L!X#qt=K#GJUbWjLAARQs0Du@WEG^wGtkVLw) z03sm0_YwhV2^~U72;A&b-m~v}-DlrF_s4ynkB5~k)|_LOF-IR`5w54B&ce*YOh-q@ zqVf2FJ{=tsoR03qlhdbYPbw=XS?K8K4eiy`^fc7e`1RaeZ0wz&baX-=Jl|W0hCI92 z60j}kcq23R^2+1BHy=X*YQuiF6cM0>}?y8L(`j8p|MFuVSItiw?br&HH- zIR9ycda>dIWqIw$(^J_SB4;4alA|xvpsC>ay7ked`}WzjY9`aNzBe&mO`SpkLec_U zI9uqvi|f+Mfh=z{Om{)YSf}-bTEg<$B;~*&;9=GC&3D4v+SNS>tx{jwEL zOSQ);$^BNlzht|?SHC{MXUW-oldI##Z!b6hzLKN6KInafsQ*|& zKG`(FkRSkaUlBa2?215-sY_zrML}>S?>FC4k=}yw_=xG-Z?V_E?$51*Z z+V=(8^&p+$->ppW^b`NSKk?*`j`x6S8XB~3pp_dG>g;ak;&EQ1U4}N)h`oWKhoSZp z1uGYr#4~G`=THeB*o!|v=#+dEXtyw^$1{E(n3J=+f{*g`fAmnG-T(1e@;d)Nx_CG$ zUpLg&<5zQWgYwHt+>*F;9l*@b&#&ZWZKI(7;L*RpX@4nSxAX9Lp&%*g?d>h$eMiE@ z%~tZZyu7^REh$MUDREj4arc+b9?yKlo!x)`=OF(c=K<8+%FX_ThrNq4{~zN%d+y@t zp?v-N9~1r0&p+1*^|AlYOwR8A+7@ktl7F;F-j=u}`9EXRK$ZS@s-S1@19dWbU=O3A zhc*Y`uC%n$Kl=YaTK+TRpCAqY19@Biw(Orl|J3xqK|$_NH#HX+ZB7rse~#u~z<+N1 z7od{lA6x$uD*lP*e>|lj8o;b1`9FsSU>;R=T%@B@rPFwD&%lR%Ws1?~^3X`@W=&g* zrfb4|S8?;VisEbL*@iRGHHyvi&1h9uZ%K*fa@FG5=JHtM!@ByI$<-Ab@shrEM4EI> z>F?fvJBrJ9Y`d$9o-qI)8RO#sd>qn4;px)rT?~}0K{;nte$tQt}WfoLQN-Do>d(rteN2W-fvIon_mA zxwuw@{Uy}D8_GV=&>$OZE%Vnq`4?7I|IeWRf52%r^SAa~cXGxxJs}#sz5WQT`|l^K zY^)kI{T87<%6bof6(Pdmu{g(QrYNfZ-`O*riTRbK%VsbYw$8bq>eghA!MW+P4%!__ zBw$Lhq7$>(*^HumiW8tZxLSXiR{+p-cDBRm=d2d1M>B$4HG0n7n42JYxZu!=SaE1k znsWfW9IhHCcoBK%@m^UrYJaold;Bt^Cb~7 z7;plPtE-wL(~8A2PLd8dpE2!AfEyEHspL{ajVhw&_mE3u!*$jpQ4C=@LL9Bw^yJB- zzo7;gBV2xVOI+7W0JXYc`t)gkwUxU#L6cOvkcJ_UCg7n2d5lxY3A5#frn9^ok-=p}W}Z zbIv>aOGgSApCHbupIVhKJOljkL*5^bDjzkd{FdqdJ1Mg_2~^QmlA3N&>|?a%wC__; z|7DotuU`^O66PK(XQRON^LGqX5E38K^9kjf{wK`Z+9xRK>ulT>ZU~ENpCj1` zcrIsLQ5jm)+>Vp-<$19TFGJdSz^|ECe0+8(-f*-lfmagq&5^`oR<0_2Lx|sTAwb1J#yF6>p=LaYT z2WBe`-g(s3Clvka=_73|M|FjK4svz9|BLwv7I)H69A>MpbY#Et(ea^7tWM(jK>ALr}}BA|D@}b78NKSTLV#rZSe> z;h<(k*`4I4t4QNBtaP!M@&xRBz1#Dnjm$Nyt5^38#2TMr0foX;!$0CriNa6w$<|0X zZ|2BtP-HqKBo21&K06uR0{JUPRB;AY4 z%U8;iuy~I+R^&`{tDJT<7)O-{O9r&R$q0{l@u^_%b98EF?&POBvMM~>)3o_2grdU5 zT3x`Jc7dxBPn*z#GbAsrB!8RLXQ{T}%C>3$ZkdVzwQ?_jf5e@qCSy16HfAehz-%>| zPY@j~_iJIOcQ>eCX7>fDd@f#Cr#m(w|66+J@SFaNt z>wTMs{Ez*8I@Z0o@{t?Ks7qPMMN?8(bk$oki8eo>!3u)I0hgfBXGr*4;F?1JknAam 
zCbDCT5|OeP0Dym83$IlF4B6}GkX4rT$e8iYwRG{o#J((xN{UA6ycKw0>$2&{cVpyJ zokROdN(N(W06%_$+Ub?wcOO;j=Qh51kH z-Gn}mr|H3ngW_;U6~8`3cn#Wyw`AHM zweygITAy1~0%U$T>elUd0v+3fLBO^XD%t|4lC1L}Iu4%%x3G+nOjn)-{oo0BIP-0EcC?Ck$oPO* zj8R7>$UKgQH`P_P36Q17LH-HOKJVk~^ACPrk|W{fikMZG2{n{s)vooM7R9MBP^;>_ zm(*5k!U@3%?*i+%E2dbgi+ha3^ta$|8Ggkj=3Y^X0pfk^Dl_*}_9r1cdJW1D(dBhy z#TM}m_Dgxh!38djk+a8BBevD2hWsfQYETr#DkBe>JL#LQEt%wZX2fv`4Y;M!4N~^2 zQjyI8e6B+gJ$sd%`vs0vLXR5hd48?SIn2|oR}m?wBqi}56E5sEmB!MoUol%p?9{sY zg5mJl*W?;0ix%)+VrtKDM8$qdvMhYJiOIiq0Fq?AA+nt1QFlUYpFnUpRBn<_-ENa# zo1f---(L}%o&}a3nZz3YI`*FtFLXrm4`To!Dqp2SY=}bsukQO=%p#3C6p4cUe zz3L=ngplh<)8<}@lrT{`c00m(KlIRVG&f0c36+<^SnLfs)?bDqxhX67Oe1o3s@%dz zh9?Ms&=~JAQF!W_kZHvIf~Rz({jMD5abg|w?y&!Mg(Lz|p)C$noodZ$KePp|ZQ!-= zW$$L`c?KZF!b{X_Z5LvI0;~Zd=!Feg+onefILfR|L7CVEt~bqc-6)Hw?_k>?X)XWY?vlcaTTtf8EIo=wI{hc{ zlo%wz$l(N}y4xQ-QbQQ}E&Rx?zP*w(mhWHm{vgauMQKfWr!|LO70)2aa9 zZ^B;+7LMtR&d`rsyzKO8Jm(nn9#D3j?~^z4K6x{PD5xvScNL&ZHM5ZKW4grs;^Cd1 zMZ(cR@r)JUt@0w^u~AWxH{m*quKPh}w<+50D&z7ThD=o#t1;Lnru3`gn*no`3wT?0 zR6Z$jouHM=MXj~YfvprsA^2j;lM2Rww)SMB$#?!xV{ga2)`XqoNkLHAF8J496b3b;yF7`he<^u&tQ6zi_} zUUNO}Zsy6Y$^7ZJa67;PXU(W$npK}?GHELhQQG^c0Xh_0Dx1lWSXi%vCg;lAy4$Pj$-x(Qjqdc%1WN_}R_E?~e-cRrm|_rgKefnr^@$ zaC2LdG@r?5Pnm;sPj;G9Q^sjfziP$$;kKXJC0jI6cT^%q@F#cy;wvO6R175yJ);|W8COrCkkIZ zqK)(V%=(^uPRXJ|H8CcK$WlHQTg)3=7pH6f<7Qg%TgkcVXfqDZ$NHZii%+{)L2xAd zBTya6cZNsqphO(8v3v`nvMFidKf-!)HAK5W`<{K=DUAP|&ap=~P=k+n-(=DoD%ykE zI?L&Yx4dW1h;fLmD_GE-eT#6>v>w-~YGX>r+7-#H9q-2;G?qqi8#da2HVRBB>&N*^ zTJp<>EQgGiZzLHHQ=ymQb50hP>p2{IYzl8%oQ6S+27u1vBVru6JKG8t=-v)!Z7x^O2+0$){nck&3^zjS>RdB*muJC z*G_!yf{8Sy1GEu8pCB_RSpm16hfShBhcOcn_Mb$O<*fZ%5%I+$7xD`{LUvJvq2v|i z<6j4lR9nvsQV&7|=JPK+uK@pyX^0grV9shk6-D{sQf7b7K;Uh`+YFJ%!_>n9f!78H z)*pQN)b)n5rC^Tr#|{3mota?DoZvw2!MUF|yO>G~Rkr4>5Hf?aj1Wz@>8uqG^LIGQ_3kb1)}!#Ess8f4iYI1q{UA%juHeT<{Bi4ERdLgx4G{PDUIP5l>Q7N`yUxcu z5$d>iBB&yy9Z=XlE)P`aB?H5^%k6*cVZLv&F|0cxTP7Z7Dna5-7H-<}C|AD=DC$nU4^y+#Kb_86T-b$5bgZs0Rn)HqK4vyzR?`dNczXheYdB=v1NW; zK59!&$37O&Hivhi1n>1duqW3cdQVq%D9`lq{Sg@(wPHS^jI^IVF#_^+;+qR~25j$= z+BC12>XMW02tYKdyqpb56-GM^SRH0yw4R{1bgdt@o*a&OM%H~Dd1J_ty&%=;)7`b6xryfQ;8hmR4n$L1SDMtR``pK+j_-r(`)pXf_g$5r-|1e~ z1aURlc8tJx%DOU-M}@U4tR(jU_OWrWzQEN2E7*gPR>xllTJ2Qm1#E;)V692YaHk)K z?|8vE4?!Oc7BW88#XbO&CgZ}}gPXS{tU8AGFLd?g-VSlhO>S1N6`}>9 zuSD*)>JHdGM6G@NR0u7Px-Tu-wgySzd9O1Sf3-yj=xL6piDL8HEg87|hj+SQ zMBpV0NB`=U8zBC>WrIhYl+BPKk85Wd&Fe6lXrem;nHnSOfd6Z9yY8@fA8J&L>H z%?lg}h{r51=RY{GRS7BiIv~ea6hL&Y;creb#^>!~9a{D27YY-?C?QJX4JIi57t&TE zD$Sg*9DlJNy#@+(lU@2ijpd!kOI=ztEV{v4Ap`}(}q{W5zS0y^hQiY0QV z5oK{tqwGy*l}u<(-hsRoSivM>a9?mzQ%>Pq8cvJbRJYPN&S69*!_8W)DKwh{6gtP%5tmW64>Cvo&klZQtzN}S`3~KGts=nsFnJAO zgR{MydS~0QW3KPi$Q?VGDIGL{k7YL``

zDpO7}sJMV1AV^}vb6Fu(Re(IoTDw%bIJ||SrzdsI`f3P6VTWQ$% z9J*H+S}&Cl*M(viLTFtD@_VL@zH^_*e*Y|mEWb`o7+~@w8N^0~w=1;z_7DVHQc7AQ zeh;{j1y)GdK?YBvXZD5en$EX^-W;5p{Rm>B9CMs|(JV4Ca|f-4dbvJRWVH_ltabQ7 zv|R;=Ym0AvDcje~3Zg%kwBQpdi2?Qc;4v>3D_nbRj~(W~I-6?8w*9uw#baL4;%CPb zrFDmZqsa>BrOgj$f<{FqpksfIR3nK!s#5_!+~vCyhme>ca3tT!&0B;6{C;vT0Q2Uo?eG2J0KpfiC%y{LQEG zDk&|1ZoQJN3h-AWWejrg{poDYK%G4i-sI4;gYV-!0pJ<^3w~dfa+)MONW@ba#KY8AjrLk0IMQ)p0 zMf7!U3urHDKTr))$2-7{=%60038wgV24YIMj6YXmjgh3*h)M%!#S?Me601^nO|NJ( zJ9&U`OzkQubDRzsZSmI|w$m~(*+mH;^m@I>?k!);!=$5Ns}TJ?VghEE$rqGc``tE| z_}p-vucdbnd~d2(I=?P0P1qbRp3JeA^Tm@M4a%+81PJ*}N#-%BXoAP9Gge+2-W_m3 zY8M@sjmZk`Gc!aI`xIoNK1-Emp}Q@xvGS&*NzNk`&w!eK?WmHs3`u#<_fV5N+;C{e zGNP`2*no0@-GSL*;MXa*^f|`Zlh*g}I8n0LxXeez>>Dw06{Ir?F{XXzN^ADFk|*8= zR;#1@URT@Z=W3=oG$SA-_!&Nd1fd#(y?Z7D4hX2`l!e%)6;o@%c>1%uium=x+zbKG zy!2uA`IrQS0Nt0^bSy(1nq~JCmTd==ULl~ZrVAo5C)BQR>rOS1VtiD zxe8>H)=Lh$^S%j9;aTtXeNMpveZMpYG{1MX{8PQ1L(24^RIB)y&1SGDg>GWy z=u9Xd+5p>5s%Fx0qj@EJjkP3r=sX7FFxZ8=yvBiKFnq?n{XiZ}YPR*^>KbHwhoa7e z#Cu#Te%E4Bo*Nj~!)0*kRC#6J}vN>B(!IO z+OTv&ZRmB8YsprvaywDA3_*w)+wZuJgH zznW@3K-n?fgG=Q)&H=79nE?=&nQo^2xpy>2nS>umNRc|26LntZ^SbELXn_tOWHrI* zNScu1MH%pdQQ|=kYJIJqy`DmBjv0Rah0l8=;MwG$$$2{=&~qT-$)xRcJj|;$U<-C- z{U??l*PM%ND)HyFEw-SgV4Q;d5$mGV-9q!K+ndELgw(ZH39w6vIkMCsEX^^vEc<3&%@_XiTSijij9+^|HcoU9biXG#=@grFaiwa&Mm)X0qv!RcS0C?fiUV zHpi|YYjx-7DcHF!Fx9NhEqPo&-Wao0&LjE7H9&mei)$*)=1o2QVCa=pJt5BpJUf=~ zvSI{UXS$rEwo*<7#!ls4d%qg;)%SWEQ~Q0<&9RJdOOH>ZO#wt7V)$9cX2*a-A=IPd z9+9EVh@)3hU;7vxseJIbW6*9zu)kL)BK-@w+~i|R=&+Q4fA8iJd1aQz%;={li2}GY zjO1(A>>8FLmC94wD~^WYvZORC)1hsMk!r>B@8zgkkram*7H1)niXOzMU$)9zjLY%-1{R<$@MdFEk6%#vvL7!ld)A?%8vScc9~1i*nTKH5B*b1x@X<@3Oqn9 z9!ry%HfbfYAPnSEsR%DR!iN^9bgs@qmPF*VsG+I$?|P(Cdh$N@!0wH;jjXQS5yO|S z?kMa$vv~eeo;Pcx1jbv5;n$nhIInCp7xHLW%#hd7XLUq*xgQM&FF-CP8{R}&Fad2LXyhF8wf&iM% z+<CdA7;`+DW27AOYeM0VP+cAe3=Cg*0l(EzNeh^NnXTfgRDF8= zB5^%I^mnXXOBMX%C<3#FM4Hk9l3z0Axk?NlI9Tdz7e2SACA8%xiKN?1kyw*#R~5u{ zL+aqMX==2L-RXM?`U3ii-!zX3eGhz*J5urmivF2d82Y^C4iQj;~`>x0ql38G%$xz=}SZi~MS%$mMlj_bKG-5q*IyT}~+0SLc< ze9bMq{iKd!2cfQ=m>H0zqTe9|3fdkgj)vdpYC*rULi(Nq@t9jFiH3W&cQNhz;J?eR zLOh@>%b9(`sXxml^qC*f0*D`3Jc=%SsTLZB?`ERlKdgBx%jC^K;OT6wN{{&oq*Vxp zcpZ~_TN!2kMb`DHff+@E+hw-B^2!1ckHz2Wtq7QXRNjTM;7L2ZyHE!^e!ZdIZ!^+t zERf?!eF)OsdQi%`8h#Dms&C;uT;hayVM4JWo|uV?Mmjc{qqBEgqo-mg&hJtcn}Yjk zVSZhx0RRym_hm@w;Zbx918!_QwkFzb{%9Hb@n}NEsmJyr$_CM=q-9tT?$ohm#LGMg zE}v;1e*vQ9%byi3JNZHBXPoYMeC_$D>EZqLs88UR5sK+B6uG9ab$!aRGM$E%K?a{T*MButE+gFa<@ z0)nRVXi4tWE&7f&fTa;`z76xuHZA_#Xq$hMM1>PHrhA$8*wp%2RnlVyf5 z?Y4c6b{!>=(Iu*%8G1Cj0_qLWm2TOz(g7=|qus-}SHlX-Lmc^W>dVz?;(;ayOJf*9 zJ#!i{Y9WCC}ow7F1Cn1F#pXOZC*LrjVZA7^>H@SP`WAL!JGl2NT zZsBF!YIHbzX*2QpyEM^%8^9D}Ak5(MNLpGT4@2%Qr5i5RhCqyn#)=LJYxCCPbh)Val{xk)a{c zsHNn~a}n4fy=x7WLZcst;oE1g-d(>yZ^1sp_urO7t0Acl3=BOskjKw5QUpmAG?S%B z?1h86h$H3r~^1RmBEkwH>D&#yyv_WvIAucqvcXSha7 zfEmzz>YT0^gqqz8{Mdaj82>k5WyOKPIERyd6Bpx$1WpPqr=`89sA_s8 zOxef;7?hbC^C|eR7WAKD$X?Eqr}w>HXFcHi+hWQfX@-;Vf%IEver<#PNu1LPpH9MS zzCO*2{+lh*N)&l$rD*ngpI-g#Qau=HJ`qcl1lGs%FBWab3(n3$}|M%LYKd3I4 z))Vt0cIo%O!R%@N_m2VtU3fTdUi%O7{^E{yzJV@~yP7+{{Vk>*|E}5~$HM#r5g;f9&0z4f}Q_| ztp9Q&U7YFJ?(WT>_~=h=aiv>pP8)P3ebB$&HT&G=uZg6(obD2Cio8+c{+HwatD!fQ z@wu)F)uYa3>LuTp++Pp*Z>eg+{4L@w&qw$fvRB7khtA)TimDvfn}qmyuem9!kw3A| zudAAa)z>*_UF!IE0KG8A?;EQXVewbg{^p2+ z)+8D(;-;vlO9TF23_9D~++5)v>$z6Z`swe7b{1h{rEo?xf7SB;{V$m7PjhFE*MCTsp-dBO|j$CSog*1OnlvpMk#qZ0Pwb4l0{>d$y(N9kR~E#l@}F z?M=I*usB~{^oJpuIxB3bS;rB%Oq_1ZHJ!VY?f=d1*EoOt8%=UCpLU^X-Joufbjx)R z#B82GBu)V+D^^Ch_1f(y>ad^o$YPQtes>6BAQN=@k*ufF{G zoA+AP;yBk%Z#!+^)uV$&B!lw@&@UOTSNY^)dg9gA4|TM4I!mNWbu<4IIqlZOBKPHp 
zN)g;OoKGQ@4jVgGA@-(57ED~~NLwQ7hvSY|d9+_{qK#&eWbNql2ys&}F865V-A}|J zRDy|XGiQ@}IKExMf95r}6J@(w(dD_XNuAzJ^th%ddJv3q8uGpBzyC~*SX5f=Ibv1` zB0igmV&DH6qdPgEs{&heH!?D6^~_Fos?r+?oKLWrrfxK`u5~M}doW$`(39(XF851e zD)de@j1k@%b%ppS>hf|+B=1P3GI1a;npfKHy@^NPLz6P4vwuizGYwU%-NJXJQZKU_ zrM;&D4)v|$1K3lkQ>t**Zzeg)c%`G;Cvbc+Ir(%Rp?2p+Z%L84n zG2qtgk8zBtifRpev^%dX_2`OoiymB0h&&9mEk(u6b$jS|N7gMV)rtL-u?%66ePNsG z8BbYAcA30?R($ZKS5n&lrp|xzQQOJ4x=Neh1{5=O+w~}`m3ZS?`#5eYjoK%Z95G|F zo}c+?`neZ$(qMciTIJ}4V9JYNV4y5;z$amE${UHOZ91=Log{wSjJ{JCnEPc2Y$ASJ zoP2+72zNv#I#(~}mh#0(`;&$sQ*IsNi$^qW*xPL7tHIVEjY~y)nweV$F`VO2oJ#}1czY6XRMc<9Td&O1FTv0X|XVx>1KvO2#>)*tpS0FPtm zO$}mSW{2dZ`L>gbX{!{Jjk8xVFgFRvDr5EiT;RPvy^vtoTi?4mhzr< z-G(49=Q-mJ7D}ZmjJUpZQQmIY12XOhdK6iD0UhhyjJ{;KHq^#FhTHOfiHMZjJ=8dyW zb8WfA{^;2hyP!@Pz@NrQUfoNV&uCL28e2F7B~-&JN_fDGvO$BIsAAjIzSFik;ZpCZ z5G@_uZNKX1QI{k+;YE7s(BXAHbX~Q=U6tDJt-AQCJK?rf%cwk0-J_p(qRE`yMytqO z(9lV1UgK(a^~J}zj;e+h06hH_K?IXE)?H(WdC(*6XlFpzt2-BUi;djk@=J;1Tdf)f z-6!147^m?&KGW*Oq-*O~*Aka_O3C|{dj>7V&Nm66R$F<6Mg zBylJK(~W}U8D&6VV=T9k^>4)15LSSl)p^v{5QavP9>o_wRP&rvWf=Wr;){bLlho;T zU=~3)4uoD5sm^dhOrHfwf!=+gtx#i^;~I>jVDz(aav=xc%AP7csP| z-12FZu+`JVQRYCAQ8e+rQkhDLp2a-w+Xbx~49(2v9g>t5lN=>XbO|a|P8|6LpTXLV z?u=q;$V_6p*P(~@<=f9+F%rF}JRxT}+I3q+lwRdfL2UX6@Iqxbd)(2q-+JBZT?Z5K z;sG@V1?&!@w^%Q5`-rmcEuwNY_!V=e%1*D^?nV%KW7556%oS4QPK_Kn*w~+oR)mrZ zg&+IP$3e#Zmk#wdo_3tOB=zm+$o^`T@Z-b7?VjYsT@a;oIX77$P8ZTqZk8;(rHgUD zh5UnZ)L0a}J#?O=DUH?b^lLe)LD&CImlb1IxIYKB# z!%9*FqqcCQiZ`8ro9^-BOopMTBmMO6CV%t<`&>b?-vt zNOHkzRc!F516f9Ij*}6xj1Pe)YWsRfz72u&I&)5G@U1qkY*T0)$cYO|p7tf8xje?5 zO1qqrf|1Of_Guf`ryFarK%L^i=q`=}^+wc`*F+Z!A7GLJ=Znq_Q|0iPa&5f=n@GwH z#Flt-HUL%a2J@eCy;EBG+$3fK-WJj;R5i3??iQ{A0qNFPN_#5Kjnf1%pfi3S z-H;=dG*Feh7KYn4KF$cc>aZQgKc5v1ND^W~kF;}v_&~;nv@&LOM!3oNR&if-Io5Br z9MW<-ZCl=d0#=#Ii10jbSJVD{7~f~7s?#`pOkEGC2w?td;k%q8dv)hG^~)bdn-|7~ zzvNq%1urt6ETpX;#-#-QY{beW#D+TO!I(u0JPj!+v~ttzk7O>Yy45!k-UFI^t#rS) z__|v$?x_kPKkEW(uCIaHQKM76u%lng(9J+R~(NxAQ3)_UO8!Cu)+ z?d^PF)vI0B(cMkIE-s=8qJ{CesmG7TRI9@ac;7MugsfISSw*>ud>A)NDzV`BlFg~3 zYNXB_oMXes=^RmXDafAT+ILiOPigr^Rmsj4jCoYJfs^{Qw=73{48{CD)JYeue(#!D zNLCP(aXwSPe|s!VQ%baw&YnZ%g2qRN)Mzm|A;ET~zJD=&DtU5JIkOGbsEz8(qZW84jMK5^bmkSg6k%UGUxA4T@&e#7|JuS>% zv@)l6LDADFjtT-=-h~VtiL#-YBJ`8VT;xPlbV0|$HXuP8K>WV9U!YVORV-qapY4Uo zwVc+Hth1IC(MrY4mjEq+i8Uf^McZaXgWkJjgdW{K&C+%|-U~Rky@X-j$Kc3>-3A?k zEGNfsRZ`LcZ}2)Z`B6xE-+UAP8T2HIc7RFsRg3BIZF$QXCsXqjnC_GhzWkX+W((Bx zcLZFLuAEnhY7`sT6R)m&ff0KA8@S*rQ?>Wc>Y_@^>#jbw z`I#@S-jh1ykEs}Ua7H3_i;J|N+2-zDGT|VhQO>#wLBN|@Jw4NrpIvhJ$r89sC}udr z@r%?AV6qhN5XXKO2n?oRU0r2xjt&F+mDiT8NErYZ?UyNVr!6CIT&wQL18O9QUq>Ky zo5hm2u9X|#q0@o>{FOO=1|62-(KUi+g&u~6UEZnGn9m`}=>?d8bPj2nrJ}#Fy+5+z z{ZO*6*v>JX476ooBB|62U0SthiKF)7b+05ibr)=Zzc!>A;2sA;-Of#(aJ$mV_72<} zXDWU<;{9^6qd(x6@9d@vGJ8-ycF%+sYF-spyc26XsBOV$o+Y;yr|jMLklTn&r%@j2 z|JtifAmMHt=CYLa8%8ik3h?u_cL`%y3zfaC4o_^G<-h+IfMnPh>?%y_(qo7A>-}Fs zL_T~P=XhlxZxaXnhr#5<=@BUl#S07NyPkHz9oW!J47`A*KCv#CgJ)ac%35Rz6%R{X zs%~U>1=&}qgfsUf_(T(4hts+w6|`%+W2mk?XN<pUf_naXX7F z?M5>Mx7_3lwX?f~(!ul(7^27k>fruasCSc}S%S240fSjP+Yrf=I_N23Hh@3JSiIFH zHCS#qVh3V8O~K&D17Nz1R*c{3$plok_HssQFULi2M0|0i|J8+vqwdz53IVwz2bWbC zaU?llP6}t>5WZvZ=pnUS{0ci&TZg6k5#>XNPEK&SnOPXOp(VX&Bbq5?$6Jk+ZFzrb zJ)NP~ZMX@jP*IH$QB589c;_L;aVx4q@4P(F%f-XI&UwwN{0Ys-#Jlqk8I6^bkL>aH zJ7l7zUaHZlh-ERI-tm^O2>G1izo-z!9Mz#r`jqXjTP{Jxt7~`k{?ZLs<*!q%WZieD zj4StD$_N|38^Y^STg*6QPBS8JUT5AVwA8ozirmqn*OHkEG)M=HsWD$5+I*3c@fE8#{=nm!8KBfMg38>U>|Q;!VsPIaY>b zwC*L}mxcZ&@;TSi{Im`s<2-`N6E;w;+xVGP<*>KqDa<*5;$BH`X&Zx$&ClW90Iw`jU4ZSqArv&=!qP z8MiogdJmVAoPUbeJZh}()$d4`s6X8DiZP+LfvL^2l`E4(fnzNTVp-|g&LxlTydMg) z^O{=f%Mx$kCFkRr3q^PhBiJ%Pb 
From: Charlene Yang
Date: Wed, 21 May 2025 05:59:03 +0800
Subject: [PATCH 06/26] [PyTorch] Add docstring for CP load balancing (#1802)

add docstring for CP

Signed-off-by: Charlene Yang
---
 .../dot_product_attention/context_parallel.py | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
index b52d1003f..2b8d332f4 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
@@ -3487,7 +3487,64 @@ def attn_forward_func_with_cp(
     use_flash_attn_3=False,
 ) -> torch.Tensor:
     """
-    Attention implementation with context parallelism.
+    Attention implementation with context parallelism (CP). CP partitions tensors along the sequence
+    dimension, and by reducing the memory and computational pressure on each GPU, it enables long-context
+    LLMs in a distributed fashion. Transformer Engine's PyTorch CP implementation currently utilizes
+    the DualChunkSwap strategy to ensure load balancing across CP ranks. It is applied to all `attn_mask_type`s
+    and all `qkv_format`s, and it requires sequence lengths to be, or to be padded to be, divisible by
+    (cp_size * 2). It also requires tokens to be re-ordered before entering this function.
+
+    For qkv_format = {'bshd', 'sbhd'}, the token re-ordering is illustrated below, for an example
+    use case of s = 12, attn_mask_type = 'causal', and cp_size = 2. seq_pos indicates each token's position
+    in its corresponding sequence.
+ + GPU0 | GPU1 GPU0 | GPU1 + seq_pos | 0 1 2 3 4 5 | 6 7 8 9 10 11 seq_pos | 0 1 2 9 10 11 | 3 4 5 6 7 8 + ---------------------------|----------------- ---------------------------|------------------ + 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + G 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 G 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + P 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0 P 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + U 3 | 1, 1, 1, 1, 0, 0,| 0, 0, 0, 0, 0, 0 U 9 | 1, 1, 1, 1, 0, 0,| 1, 1, 1, 1, 1, 1, + 0 4 | 1, 1, 1, 1, 1, 0,| 0, 0, 0, 0, 0, 0 -> 0 10 | 1, 1, 1, 1, 1, 0,| 1, 1, 1, 1, 1, 1, + 5 | 1, 1, 1, 1, 1, 1,| 0, 0, 0, 0, 0, 0 11 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 1, + ---------------------------|----------------- ---------------------------|------------------ + 6 | 1, 1, 1, 1, 1, 1,| 1, 0, 0, 0, 0, 0 3 | 1, 1, 1, 0, 0, 0,| 1, 0, 0, 0, 0, 0, + G 7 | 1, 1, 1, 1, 1, 1,| 1, 1, 0, 0, 0, 0 G 4 | 1, 1, 1, 0, 0, 0,| 1, 1, 0, 0, 0, 0, + P 8 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 0, 0, 0, P 5 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 0, 0, 0, + U 9 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 0, 0, U 6 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + 1 10 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 0, 1 7 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 1, 0, + 11 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 1, 8 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 1, 1, + + For qkv_format = 'thd', multiple sequences may be packed into the batch, and they may be of different + lengths. DualChunkSwap divides each sequence into (cp_size * 2) chunks and distributes 2 chunks of + every sequence onto a CP rank. The token matrix transformation is shown as follows, for an example of + batch_size = 2, seq_ids = [0, 1], seq_lens = [8, 4], t = 12, attn_mask_type = 'padding_causal', and + cp_size = 2. + + GPU0 | GPU1 GPU0 | GPU1 + seq_id | 0 0 0 0 0 0 | 0 0 1 1 1 1 seq_id | 0 0 0 0 1 1 | 0 0 0 0 1 1 + seq_pos | 0 1 2 3 4 5 | 6 7 0 1 2 3 seq_pos | 0 1 6 7 0 3 | 2 3 4 5 1 2 + ---------------------------|----------------- ---------------------------|------------------ + 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + G 0 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 G 0 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + P 0 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0 P 0 6 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + U 0 3 | 1, 1, 1, 1, 0, 0,| 0, 0, 0, 0, 0, 0 U 0 7 | 1, 1, 1, 1, 0, 0,| 1, 1, 1, 1, 0, 0, + 0 0 4 | 1, 1, 1, 1, 1, 0,| 0, 0, 0, 0, 0, 0 -> 0 1 0 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 0, 0, + 0 5 | 1, 1, 1, 1, 1, 1,| 0, 0, 0, 0, 0, 0 1 3 | 0, 0, 0, 0, 2, 2,| 0, 0, 0, 0, 2, 2, + ---------------------------|----------------- ---------------------------|------------------ + 0 6 | 1, 1, 1, 1, 1, 1,| 1, 0, 0, 0, 0, 0 0 2 | 1, 1, 0, 0, 0, 0,| 1, 0, 0, 0, 0, 0, + G 0 7 | 1, 1, 1, 1, 1, 1,| 1, 1, 0, 0, 0, 0 G 0 3 | 1, 1, 0, 0, 0, 0,| 1, 1, 0, 0, 0, 0, + P 1 0 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 0, 0, 0 P 0 4 | 1, 1, 0, 0, 0, 0,| 1, 1, 1, 0, 0, 0, + U 1 1 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 0, 0 U 0 5 | 1, 1, 0, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + 1 1 2 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 2, 0 1 1 1 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 2, 0, + 1 3 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 2, 2 1 2 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 2, 2, + + When all transformer layers in a model share the same CP configuration, i.e. cp_group, cp_global_ranks, + cp_comm_type and cp_stream, token re-ordering can take place in the dataloader, i.e. only once for + all the layers. An example of the re-ordering code is `get_batch_on_this_cp_rank + `_ + in Megatron-LM. 
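A minimal sketch of the re-ordering described above, for qkv_format = 'bshd' with the sequence on dimension 1 (the helper name reorder_for_cp is hypothetical and not part of this patch; it only illustrates the chunk selection that Megatron-LM's get_batch_on_this_cp_rank performs):

    import torch

    def reorder_for_cp(tokens: torch.Tensor, cp_size: int, cp_rank: int) -> torch.Tensor:
        # Split the sequence dimension into (cp_size * 2) equal chunks and keep
        # chunk `cp_rank` together with its mirror chunk `2 * cp_size - 1 - cp_rank`.
        # For s = 12 and cp_size = 2, rank 0 keeps seq_pos [0 1 2, 9 10 11] and
        # rank 1 keeps seq_pos [3 4 5, 6 7 8], matching the tables above.
        chunks = tokens.chunk(2 * cp_size, dim=1)
        return torch.cat([chunks[cp_rank], chunks[2 * cp_size - 1 - cp_rank]], dim=1)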
+ """ if cp_comm_type == "a2a+p2p": From 90458e773e7a05683040eebda482d7e429e4f8eb Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 20 May 2025 17:26:11 -0700 Subject: [PATCH 07/26] Add missing docs for C API (#1803) * Add missing docs for C API Signed-off-by: Kirthi Shankar Sivamani * Grammar, typos, copy-paste errors Signed-off-by: Kirthi Shankar Sivamani * remove contiguous word Signed-off-by: Kirthi Shankar Sivamani * Better wording Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/api/c/cast_transpose_noop.rst | 9 + docs/api/c/comm_gemm_overlap.rst | 9 + docs/api/c/cudnn.rst | 9 + docs/api/c/index.rst | 4 + docs/api/c/multi_tensor.rst | 9 + .../transformer_engine/cast_transpose_noop.h | 16 +- .../include/transformer_engine/fused_attn.h | 143 ++++++++++++ .../include/transformer_engine/multi_tensor.h | 204 ++++++++++++++++++ 8 files changed, 394 insertions(+), 9 deletions(-) create mode 100644 docs/api/c/cast_transpose_noop.rst create mode 100644 docs/api/c/comm_gemm_overlap.rst create mode 100644 docs/api/c/cudnn.rst create mode 100644 docs/api/c/multi_tensor.rst diff --git a/docs/api/c/cast_transpose_noop.rst b/docs/api/c/cast_transpose_noop.rst new file mode 100644 index 000000000..ae80c5d2d --- /dev/null +++ b/docs/api/c/cast_transpose_noop.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +cast_transpose_noop.h +===================== + +.. doxygenfile:: cast_transpose_noop.h diff --git a/docs/api/c/comm_gemm_overlap.rst b/docs/api/c/comm_gemm_overlap.rst new file mode 100644 index 000000000..090551f60 --- /dev/null +++ b/docs/api/c/comm_gemm_overlap.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +comm_gemm_overlap.h +=================== + +.. doxygenfile:: comm_gemm_overlap.h diff --git a/docs/api/c/cudnn.rst b/docs/api/c/cudnn.rst new file mode 100644 index 000000000..5d93c4d6e --- /dev/null +++ b/docs/api/c/cudnn.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +cudnn.h +======= + +.. doxygenfile:: cudnn.h diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index 7bc864dcc..27ba553d6 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -14,10 +14,14 @@ directly from C/C++, without Python. transformer_engine.h activation.h + cast_transpose_noop.h cast.h + comm_gemm_overlap.h + cudnn.h fused_attn.h fused_rope.h gemm.h + multi_tensor.h normalization.h padding.h permutation.h diff --git a/docs/api/c/multi_tensor.rst b/docs/api/c/multi_tensor.rst new file mode 100644 index 000000000..8ba2d274c --- /dev/null +++ b/docs/api/c/multi_tensor.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +multi_tensor.h +============== + +.. doxygenfile:: multi_tensor.h diff --git a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h index 678ffe919..649b5ced5 100644 --- a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h +++ b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h @@ -17,23 +17,21 @@ extern "C" { #endif -/*! 
\brief Transposes the input, providing the option to immediately exit the kernel - * based on the value of the 'noop' tensor. +/*! \brief Transposes the input. * - * \param[in] input Input tensor. - * \param[in] noop Noop tensor. + * \param[in] input Input tensor to be cast. + * \param[in] noop If this single element tensor has non-zero value, kernel will exit immediately. * \param[in,out] output Output tensor. * \param[in] stream CUDA stream used for the operation. */ void nvte_transpose_with_noop(const NVTETensor input, const NVTETensor noop, NVTETensor output, cudaStream_t stream); -/*! \brief Casts and transposes the input, providing the option to immediately exit the kernel - * based on the value of the 'noop' tensor. +/*! \brief Casts and transposes the input. * - * \param[in] input Input tensor. - * \param[in] noop Noop tensor. - * \param[in,out] output Output tensor. + * \param[in] input Input tensor to be cast. + * \param[in] noop If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] output Output quantized tensor. * \param[in] stream CUDA stream used for the operation. */ void nvte_cast_transpose_with_noop(const NVTETensor input, const NVTETensor noop, NVTETensor output, diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 3400eaaeb..f63ee636d 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -634,6 +634,8 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso #ifndef __HIP_PLATFORM_AMD__ /*! \brief Update the RNG state with the seed and calculated offset. + * + * \warning This API is **experimental** and subject to change. * * \param[in] rng_state_dst RNG state to store seed and offset. * \param[in] seed Seed for RNG state. @@ -666,6 +668,8 @@ void nvte_populate_rng_state_async(void *rng_state_dst, const void *const seed, #endif /*! \brief Get KV format for a given QKV layout. + * + * \warning This API is **experimental** and subject to change. * * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. * \param[in] workspace Workspace tensor. @@ -675,48 +679,187 @@ void nvte_populate_rng_state_async(void *rng_state_dst, const void *const seed, uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlen, NVTETensor workspace, size_t len, cudaStream_t stream); +/*! \brief Set the seed and offset for RNG state. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] rng_state_ptr A size 2 array storing the RNG's seed and offset respectively. + * \param[in] captured Whether a CUDA graph is being captured. + * \param[in] seed_ptr Seed pointer. + * \param[in] seed_val Seed value. + * \param[in] offset_ptr Offset pointer. + * \param[in] offset_val Offset value. + * \param[in] offset_intragraph Intragraph offset in RNG states. For use with CUDA Graphs. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_extract_seed_and_offset(int64_t *rng_state_ptr, int captured, int64_t *seed_ptr, uint64_t seed_val, int64_t *offset_ptr, uint64_t offset_val, uint32_t offset_intragraph, cudaStream_t stream); +/*! \brief Copy keys and values into the KV cache. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] new_k Key tensor. + * \param[in] new_v Value tensor. + * \param[out] k_cache Key cache. 
+ * \param[out] v_cache Value cache. + * \param[in] page_table Page table for K cache, [batch_size, max_pages_per_seq]. + * \param[in] cu_new_lens Cumulative sequence lengths. + * \param[in] cu_cached_lens Cached cumulative sequence lengths. + * \param[in] qkv_format QKV format, e.g. sbhd. + * \param[in] b Batch size. + * \param[in] max_ctx_len Maximum context length. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] max_pages_per_seq Maximum number of pages per sequence. + * \param[in] is_non_paged Whether the cache is paged or not. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_copy_to_kv_cache(NVTETensor new_k, NVTETensor new_v, NVTETensor k_cache, NVTETensor v_cache, NVTETensor page_table, NVTETensor cu_new_lens, NVTETensor cu_cached_lens, NVTE_QKV_Format qkv_format, int b, int max_ctx_len, int max_seq_len, int max_pages_per_seq, int is_non_paged, cudaStream_t stream); +/*! \brief Extract the first half (half_idx=0) or second half (half_idx=1) of a THD tensor. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] half Output tensor. + * \param[in] half_idx Whether to read first or second half of input tensor. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_read_half_tensor(const NVTETensor &tensor, const NVTETensor &cu_seqlens, NVTETensor half, int half_idx, cudaStream_t stream); +/*! \brief Correct the second half of the softmax LSE (LogSumExp) for context parallelism. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] lse Output tensor. + * \param[in] lse_per_step Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] lse_packed Whether or not lse_per_step is packed. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_second_half_lse_correction(NVTETensor lse, const NVTETensor &lse_per_step, const NVTETensor &cu_seqlens, int lse_packed, cudaStream_t stream); +/*! \brief Read the second half of the softmax LSE (LogSumExp) for context parallelism. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] lse Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] half_lse Output tensor. + * \param[in] lse_packed Whether or the softmax LSE is in packed format. + * \param[in] second_half_lse_seqlen Sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_read_second_half_lse(const NVTETensor &lse, const NVTETensor &cu_seqlens, NVTETensor half_lse, int lse_packed, int second_half_lse_seqlen, cudaStream_t stream); +/*! \brief Correct the THD format output of context parallelism in forward pass. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] out Output tensor. + * \param[in] out_per_step THD format output of context parallelism in forward pass. + * \param[in] lse Softmax LSE. + * \param[in] lse_per_step Softmax LSE per step. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] only_second_half Whether or not to correct only second half. + * \param[in] lse_packed Whether or the softmax LSE is in packed format. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_cp_thd_out_correction(NVTETensor out, const NVTETensor &out_per_step, const NVTETensor &lse, const NVTETensor &lse_per_step, const NVTETensor &cu_seqlens, int only_second_half, int lse_packed, cudaStream_t stream); +/*! \brief Correct the THD format output of context parallelism in forward pass. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] grad Output tensor. + * \param[in] grad_per_step THD format gradient of context parallelism. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] first_half One of ("add", "copy", "none") correction op for first half. + * \param[in] second_half One of ("add", "copy", "none") correction op for second half. + Must be different from first_half. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_grad_correction(NVTETensor grad, const NVTETensor &grad_per_step, const NVTETensor &cu_seqlens, const char *first_half, const char *second_half, cudaStream_t stream); +/*! \brief Generate partitioned indices for inputs in THD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] output Output tensor. + * \param[in] total_tokens Total number of tokens. + * \param[in] world_size Total number of devices for context parallelism. + * \param[in] rank Device ID for current device. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_get_partitioned_indices(const NVTETensor &cu_seqlens, NVTETensor output, int total_tokens, int world_size, int rank, cudaStream_t stream); +/*! \brief Convert tensor from THD to BSHD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] new_tensor Output tensor. + * \param[in] b Batch size. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_convert_thd_to_bshd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor, int b, int max_seq_len, cudaStream_t stream); +/*! \brief Convert tensor from BSHD to THD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] new_tensor Output tensor. + * \param[in] b Batch size. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_convert_bshd_to_thd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor, int t, cudaStream_t stream); +/*! \brief Prepare QKV tensor for Flash Attention forward kernel. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] qkvi Input tensor. + * \param[out] qkv Output tensor. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_prepare_flash_attn_fwd(NVTETensor qkvi, NVTETensor qkv, cudaStream_t stream); +/*! \brief Prepare QKV tensor for Flash Attention backward kernel. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] q Input query tensor. + * \param[in] k Input key tensor. + * \param[in] v Input value tensor. + * \param[out] qkv Output tensor. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_prepare_flash_attn_bwd(NVTETensor q, NVTETensor k, NVTETensor v, NVTETensor qkv, cudaStream_t stream); diff --git a/transformer_engine/common/include/transformer_engine/multi_tensor.h b/transformer_engine/common/include/transformer_engine/multi_tensor.h index e78b31d77..c21fd2627 100644 --- a/transformer_engine/common/include/transformer_engine/multi_tensor.h +++ b/transformer_engine/common/include/transformer_engine/multi_tensor.h @@ -17,6 +17,25 @@ extern "C" { #endif +/*! \brief Computes L2 norm for a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] output Scratch space. Required size grows with number of inputs. + * \param[in] output_per_tensor Fixed size auxilliary scratch space. + * \param[out] ret L2 norm of all inputs. + * \param[out] ret_per_tensor L2 norm for each tensor. + * \param[in] per_tensor Whether to calculate per tensor or cumulative norm. + * \param[in] max_chunks_per_tensor Maximum number of chunks in any input tensor. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor output, NVTETensor output_per_tensor, NVTETensor ret, @@ -24,6 +43,28 @@ void nvte_multi_tensor_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETen int max_chunks_per_tensor, const int device_id, cudaStream_t stream); +/*! \brief Computes L2 norm for a list of tensors after unscaling. + * + * Unscaling is only done for computing the L2 norm. The tensors themselves are not updated. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] output Scratch space. Required size grows with number of inputs. + * \param[in] output_per_tensor Fixed size auxilliary scratch space. + * \param[out] ret L2 norm of all inputs. + * \param[out] ret_per_tensor L2 norm for each tensor. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] per_tensor Whether to calculate per tensor or cumulative norm. + * \param[in] max_chunks_per_tensor Maximum number of chunks in any input tensor. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_unscale_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor output, @@ -32,6 +73,27 @@ void nvte_multi_tensor_unscale_l2norm_cuda(int chunk_size, NVTETensor noop_flag, int per_tensor, int max_chunks_per_tensor, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, const float beta1, const float beta2, @@ -39,12 +101,57 @@ void nvte_multi_tensor_adam_cuda(int chunk_size, NVTETensor noop_flag, NVTETenso const int bias_correction, const float weight_decay, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * where the master parameters only store the remainder bits. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_adam_param_remainder_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, const float beta1, const float beta2, const float epsilon, const int step, const int mode, const int bias_correction, const float weight_decay, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * when model parameters are in Float8 precision. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] fp8_dtype FP8 data type for model parameters. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_fp8_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, @@ -53,28 +160,125 @@ void nvte_multi_tensor_adam_fp8_cuda(int chunk_size, NVTETensor noop_flag, const float weight_decay, const NVTEDType fp8_dtype, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * with CUDA graph support and LR scheduling. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_adam_capturable_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor lr, const float beta1, const float beta2, const float epsilon, NVTETensor step, const int mode, const int bias_correction, const float weight_decay, NVTETensor inv_scale, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * with CUDA graph support, LR scheduling, and FP32 master weights. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_capturable_master_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor lr, const float beta1, const float beta2, const float epsilon, NVTETensor step, const int mode, const int bias_correction, const float weight_decay, NVTETensor inv_scale, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for SGD optimizer. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] wd Weight decay (L2 penalty). + * \param[in] momentum Momentum factor. + * \param[in] dampening Dampening factor. + * \param[in] lr Learning rate. + * \param[in] nesterov Whether or not to enable nesterov momentum. + * \param[in] first_run Whether momentum buffers have been initialized. + * \param[in] wd_after_momentum Whether to applied weight decay after momentum update. + * \param[in] scale Scalar for the scaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_sgd_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float wd, float momentum, float dampening, float lr, int nesterov, int first_run, int wd_after_momentum, float scale, const int device_id, cudaStream_t stream); +/*! \brief Check overflow and scale a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] scale Scalar for the scaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_scale_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float scale, const int device_id, cudaStream_t stream); +/*! \brief Check overflow and scale a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] max_fp8 Maximum representible value in underlying FP8 format. + * \param[in] force_pow_2_scales Ensure scaling factors are a power of 2. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_compute_scale_and_scale_inv_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float max_fp8, int force_pow_2_scales, float epsilon, From 3a5ca57fd68854f4f6145ef0278a1e56a1f63b0e Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 22 May 2025 12:20:14 -0700 Subject: [PATCH 08/26] Remove `comm_gemm_overlap` doc (#1815) Remove comm_gemm_overlap docs Signed-off-by: Kirthi Shankar Sivamani --- docs/api/c/comm_gemm_overlap.rst | 9 --------- docs/api/c/index.rst | 1 - 2 files changed, 10 deletions(-) delete mode 100644 docs/api/c/comm_gemm_overlap.rst diff --git a/docs/api/c/comm_gemm_overlap.rst b/docs/api/c/comm_gemm_overlap.rst deleted file mode 100644 index 090551f60..000000000 --- a/docs/api/c/comm_gemm_overlap.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. - Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - - See LICENSE for license information. - -comm_gemm_overlap.h -=================== - -.. 
doxygenfile:: comm_gemm_overlap.h diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index 27ba553d6..0499f52f0 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -16,7 +16,6 @@ directly from C/C++, without Python. activation.h cast_transpose_noop.h cast.h - comm_gemm_overlap.h cudnn.h fused_attn.h fused_rope.h From 9b80ea92914ccbf13e86d09d0fd2eaf37ab00549 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 22 May 2025 15:26:54 -0700 Subject: [PATCH 09/26] Add docs for missing FP8 recipes. (#1816) Document all recipes Signed-off-by: Kirthi Shankar Sivamani --- docs/api/common.rst | 4 ++ transformer_engine/common/recipe/__init__.py | 58 +++----------------- 2 files changed, 11 insertions(+), 51 deletions(-) diff --git a/docs/api/common.rst b/docs/api/common.rst index 95d4b50f3..541118985 100644 --- a/docs/api/common.rst +++ b/docs/api/common.rst @@ -11,3 +11,7 @@ Common API .. autoapiclass:: transformer_engine.common.recipe.DelayedScaling(margin=0, fp8_format=Format.HYBRID, amax_history_len=1024, amax_compute_algo="max", scaling_factor_compute_algo=None) .. autoapiclass:: transformer_engine.common.recipe.MXFP8BlockScaling(fp8_format=Format.E4M3) + +.. autoapiclass:: transformer_engine.common.recipe.Float8CurrentScaling(fp8_format=Format.HYBRID) + +.. autoapiclass:: transformer_engine.common.recipe.Float8BlockScaling(fp8_format=Format.E4M3) diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 1cf974987..466c2e605 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -209,42 +209,12 @@ def __repr__(self) -> str: class Float8CurrentScaling(Recipe): """ Use the per-tensor current scaling factor strategy. + Parameters ---------- fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.HYBRID Controls the FP8 data format used during forward and backward pass. - fp8_quant_fwd_inp: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of input tensor x - fp8_quant_fwd_weight: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of weight tensor w - fp8_quant_bwd_grad: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of gradient tensor dY - fp8_gemm_fprop: MMParams, default MMParams.use_split_accumulator=False - used for calculating output y in forward pass - fp8_gemm_dgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_gemm_wgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_dpa: bool, default = `False` - Whether to enable FP8 dot product attention (DPA). When the model is placed in an - `fp8_autocast(enabled=True)` region and `fp8_dpa` is set to `True`, DPA casts the - inputs from higher precision to FP8, performs attention in FP8, and casts tensors - back to higher precision as outputs. FP8 DPA currently is only supported in the - `FusedAttention` backend. - fp8_mha: bool, default = `False` - Whether to enable FP8 multi-head attention (MHA). When `True`, it removes the casting - operations mentioned above at the DPA boundaries. Currently only standard MHA modules - i.e. `LayerNormLinear/Linear + DPA + Linear`, are supported for this feature. When - `fp8_mha = False, fp8_dpa = True`, a typical MHA module works as - `LayerNormLinear (BF16 output) -> (cast to FP8 ) FP8 DPA (cast to BF16) -> Linear`. 
- When `fp8_mha = True, fp8_dpa = True`, it becomes - `LayerNormLinear (FP8 output) -> FP8 DPA -> Linear`. - - Notes - ----- - * `fp8_dpa` and `fp8_mha` are Beta features, and their API and functionality are - subject to change in future Transformer Engine releases. """ fp8_format: Format = Format.HYBRID @@ -259,6 +229,9 @@ class Float8CurrentScaling(Recipe): def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert ( + not self.fp8_dpa and not self.fp8_mha + ), "FP8 attention is not supported for Float8CurrentScaling." def __repr__(self) -> str: return ( @@ -335,32 +308,12 @@ class Float8BlockScaling(Recipe): NOTE: To relax the default constraint that scales be powers of 2, set env variable NVTE_FP8_BLOCK_SCALING_FP32_SCALES=1 to override it for the recipe defaults. - export NVTE_FP8_BLOCK_SCALING_FP32_SCALES=1 - Or initialize the Recipe with non-default QParams in code for increased control. Parameters ---------- fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.E4M3 Controls the FP8 data format used during forward and backward pass. - fp8_quant_fwd_inp: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of input tensor x - fp8_quant_fwd_weight: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of weight tensor w - fp8_quant_bwd_grad: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of gradient tensor dY - x_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for x. - w_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for w. - grad_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for grad. - fp8_gemm_fprop: MMParams, default MMParams.use_split_accumulator=False - used for calculating output y in forward pass - fp8_gemm_dgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_gemm_wgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass """ use_f32_scales: bool = os.getenv("NVTE_FP8_BLOCK_SCALING_FP32_SCALES", "0") == "1" @@ -394,6 +347,9 @@ def __post_init__(self) -> None: assert self.fp8_gemm_fprop.use_split_accumulator, "Split accumulator required for fprop." assert self.fp8_gemm_dgrad.use_split_accumulator, "Split accumulator required for dgrad." assert self.fp8_gemm_wgrad.use_split_accumulator, "Split accumulator required for wgrad." + assert ( + not self.fp8_dpa and not self.fp8_mha + ), "FP8 attention is not supported for Float8BlockScaling." 
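As a usage sketch for the recipes documented above (illustrative only, not part of this patch; it assumes an FP8-capable GPU, and the layer and tensor sizes are arbitrary), a recipe instance is passed to fp8_autocast to select the scaling strategy:

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common.recipe import Float8CurrentScaling

    layer = te.Linear(1024, 1024)
    inp = torch.randn(32, 1024, device="cuda")

    # Per-tensor current scaling; Float8BlockScaling() or DelayedScaling() can be
    # substituted here to pick a different scaling recipe.
    with te.fp8_autocast(enabled=True, fp8_recipe=Float8CurrentScaling()):
        out = layer(inp)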
def __repr__(self) -> str: return ( From 7558c445aa891428d96a5b4c0a2e6ce57cd289f2 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 23 May 2025 12:55:08 -0700 Subject: [PATCH 10/26] Fix the failing test cases in the CI (#1806) * Modify the test cases Signed-off-by: Przemek Tredak * Make the tests reproducible on different machines Signed-off-by: Przemek Tredak * Fixed the cache of the gamma_in_weight_dtype setting Signed-off-by: Przemek Tredak * Reinstate the tests Signed-off-by: Przemek Tredak * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * More verbose code and comments Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../operator/test_cast_mxfp8_gated_swiglu.cu | 2 +- tests/cpp/test_common.cu | 23 +++++++++------ tests/pytorch/distributed/run_numerics.py | 2 +- .../common/normalization/common.cpp | 29 ++++++++++--------- .../common/normalization/common.h | 7 +++-- .../common/normalization/layernorm/ln_api.cpp | 22 +++++++++----- .../normalization/rmsnorm/rmsnorm_api.cpp | 19 ++++++++---- 7 files changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu index f93c8c9e0..0e43c2c9d 100644 --- a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu @@ -382,7 +382,7 @@ std::vector> matrix_sizes = { {256, 256}, {993, 512}, {768, 1024}, - {65536, 128}, + {65504, 128}, {16384, 1632}, }; diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 32eb1d63a..e11b32689 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -731,13 +731,19 @@ std::pair getTolerances(const DType type) { template void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { -#ifdef __HIP_PLATFORM_AMD__ - // TODO: Introduce a parallel RNG library (Random123, PCG, rocRAND) - std::uniform_real_distribution<> dis(-2.0, 1.0); - for (int i = 0; i < size; i++) { - data[i] = static_cast(dis(*gen)); + // Check how many RNG calls are required to generate one uniform random value + int rng_calls_per_val = 0; + { + std::mt19937 gen1 = *gen, gen2 = *gen; + std::uniform_real_distribution<> dis(-2.0, 1.0); + const float _ = dis(gen1); + while (gen2 != gen1) { + auto _ = gen2(); + ++rng_calls_per_val; + } } -#else + + // Generate uniform random values in parallel #pragma omp parallel proc_bind(spread) { std::mt19937 gen_local = *gen; @@ -746,15 +752,14 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { const int chunk_size = (size + threads_num - 1) / threads_num; const int idx_min = chunk_size * thread_ID; const int idx_max = std::min(chunk_size * (thread_ID + 1), static_cast(size)); - gen_local.discard(idx_min); + gen_local.discard(idx_min * rng_calls_per_val); std::uniform_real_distribution<> dis(-2.0, 1.0); for (int i = idx_min; i < idx_max; ++i) { data[i] = static_cast(dis(gen_local)); } } -#endif - gen->discard(size); + gen->discard(size * rng_calls_per_val); } void fillUniform(Tensor *t) { diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index a505d0179..b7af78832 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -207,7 +207,7 @@ def _get_tolerances(dtype): if dtype == torch.bfloat16: return {"rtol": 1.6e-2, "atol": 
1e-5} if dtype == torch.float32: - return {"rtol": 1.3e-6, "atol": 4e-5} + return {"rtol": 1e-4, "atol": 1e-4} raise ValueError(f"Unsupported dtype ({dtype})") diff --git a/transformer_engine/common/normalization/common.cpp b/transformer_engine/common/normalization/common.cpp index 3be7d5004..029c89e2a 100644 --- a/transformer_engine/common/normalization/common.cpp +++ b/transformer_engine/common/normalization/common.cpp @@ -41,9 +41,6 @@ Compute always in FP32 namespace transformer_engine { namespace normalization { -#ifndef __HIP_PLATFORM_AMD__ -bool& use_zero_centered_gamma_in_weight_dtype(); - cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { return training ? cudnn_frontend::NormFwdPhase_t::TRAINING : cudnn_frontend::NormFwdPhase_t::INFERENCE; @@ -53,13 +50,17 @@ cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { TupleKeyType get_key(NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, DType ctype, uint64_t batch_size, uint64_t hidden_size, bool zero_centered_gamma, - bool is_tuned, NVTEScalingMode mode, bool training) { - // TODO: Add scaling_mode to general_key is needed - uint64_t general_key = static_cast(itype) | (static_cast(otype) << 3) | - (static_cast(ctype) << 6) | (static_cast(wtype) << 9) | - (uint32_t(NormType) << 12) | (uint32_t(NormStage)) << 14 | - (uint32_t(NormBackend) << 16) | (uint32_t(zero_centered_gamma) << 18) | - (uint32_t(mode) << 19) | (uint32_t(training) << 22); + bool is_tuned, NVTEScalingMode mode, bool training, + bool gamma_in_weight_dtype) { + static_assert(NVTE_INVALID_SCALING < 1024, + "This function assumes at most 10 bits used in the scaling mode."); + static_assert(kNVTENumTypes < 32, "This function assumes at most 5 bits used in the NVTEDType"); + uint64_t general_key = static_cast(itype) | (static_cast(otype) << 5) | + (static_cast(ctype) << 10) | + (static_cast(wtype) << 15) | (uint64_t(NormType) << 20) | + (uint64_t(NormStage)) << 22 | (uint64_t(NormBackend) << 24) | + (uint64_t(zero_centered_gamma) << 26) | (uint64_t(mode) << 27) | + (uint64_t(training) << 37) | (uint64_t(gamma_in_weight_dtype) << 38); return std::make_tuple(general_key, batch_size, hidden_size, is_tuned); } @@ -502,11 +503,12 @@ NormalizationPlanBase* NormalizationPlanRegistry::getNormalizationPlan( NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, const size_t batch_size, const size_t hidden_size, const size_t sm_count, const bool zero_centered_gamma, const bool is_aligned, - const NVTEScalingMode mode, const bool training) { + const NVTEScalingMode mode, const bool training, const bool gamma_in_weight_dtype) { const DType ctype = DType::kFloat32; bool is_tuned = is_aligned && (batch_size % 4 == 0); - auto key = get_key(NormBackend, NormType, NormStage, wtype, itype, otype, ctype, batch_size, - hidden_size, zero_centered_gamma, is_tuned, mode, training); + auto key = + get_key(NormBackend, NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, + zero_centered_gamma, is_tuned, mode, training, gamma_in_weight_dtype); auto it = normalizationPlanMap.find(key); if (it != normalizationPlanMap.end()) { @@ -578,6 +580,7 @@ void nvte_enable_cudnn_norm_bwd(bool enable) { transformer_engine::normalization::_cudnn_norm_bwd_flag() = enable; } +// Only for testing, not thread-safe void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable) { 
NVTE_API_CALL(nvte_enable_zero_centered_gamma_in_weight_dtype); transformer_engine::normalization::_zero_centered_gamma_in_weight_dtype() = enable; diff --git a/transformer_engine/common/normalization/common.h b/transformer_engine/common/normalization/common.h index 241a3b77b..d1fe6868e 100644 --- a/transformer_engine/common/normalization/common.h +++ b/transformer_engine/common/normalization/common.h @@ -196,7 +196,7 @@ TupleKeyType get_key(NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, DType ctype, uint64_t batch_size, uint64_t hidden_size, bool zero_centered_gamma, bool is_tuned, NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, - bool training = true); + bool training = true, bool gamma_in_weight_dtype = false); template class TeNormalizationRegistry { @@ -350,7 +350,8 @@ class NormalizationPlanRegistry { NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, const size_t batch_size, const size_t hidden_size, const size_t sm_count, const bool zero_centered_gamma, const bool is_aligned, - const NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, const bool training = true); + const NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, const bool training = true, + const bool gamma_in_weight_dtype = false); private: NormalizationPlanRegistry() {} @@ -471,6 +472,8 @@ void rocm_norm_mxfp8_quantize(LaunchParams &launch_params) } #endif +bool& use_zero_centered_gamma_in_weight_dtype(); + } // namespace normalization } // namespace transformer_engine diff --git a/transformer_engine/common/normalization/layernorm/ln_api.cpp b/transformer_engine/common/normalization/layernorm/ln_api.cpp index f660ca5b7..e3cdfaf45 100644 --- a/transformer_engine/common/normalization/layernorm/ln_api.cpp +++ b/transformer_engine/common/normalization/layernorm/ln_api.cpp @@ -17,6 +17,7 @@ #include "../../common.h" #include "../common.h" +#include "transformer_engine/transformer_engine.h" namespace transformer_engine { @@ -67,12 +68,15 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size #ifndef __HIP_PLATFORM_AMD__ bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp_scaling(z->scaling_mode); + bool gamma_in_weight_dtype = false; if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#else { + norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, beta.data.dptr, mu->data.dptr, rsigma->data.dptr); @@ -88,7 +92,8 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size z->data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training); + multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); @@ -155,12 +160,14 @@ void layernorm_bwd(const Tensor& dz, const Tensor& x, const Tensor& mu, const Te NVTE_Norm_Backend norm_backend; bool is_aligned = true; -#ifndef __HIP_PLATFORM_AMD__ + bool gamma_in_weight_dtype = false; + #ifndef __HIP_PLATFORM_AMD__ if (use_cudnn_norm_bwd()) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#endif 
{ norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, mu.data.dptr, rsigma.data.dptr, @@ -173,7 +180,8 @@ void layernorm_bwd(const Tensor& dz, const Tensor& x, const Tensor& mu, const Te gamma.data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned); + multiprocessorCount, zero_centered_gamma, is_aligned, NVTE_DELAYED_TENSOR_SCALING, true, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); diff --git a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp index eabed2bd5..c783e1550 100644 --- a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp +++ b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp @@ -15,6 +15,7 @@ #include "../../common.h" #include "../common.h" #include "transformer_engine/normalization.h" +#include "transformer_engine/transformer_engine.h" #include "transformer_engine/transpose.h" namespace transformer_engine { @@ -57,12 +58,14 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens bool training = is_delayed_tensor_scaling(z->scaling_mode) || (z->columnwise_data).dptr != nullptr; -#ifndef __HIP_PLATFORM_AMD__ + bool gamma_in_weight_dtype = false; + #ifndef if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); } else -#endif +#endif { norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, rsigma->data.dptr); @@ -75,7 +78,8 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens z->data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training); + multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); @@ -133,12 +137,14 @@ void rmsnorm_bwd(const Tensor &dz, const Tensor &x, const Tensor &rsigma, const NVTE_Norm_Backend norm_backend; bool is_aligned = true; + bool gamma_in_weight_dtype = false; #ifndef __HIP_PLATFORM_AMD__ if (use_cudnn_norm_bwd()) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#endif { norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, rsigma.data.dptr, dx->data.dptr, @@ -151,7 +157,8 @@ void rmsnorm_bwd(const Tensor &dz, const Tensor &x, const Tensor &rsigma, const gamma.data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned); + multiprocessorCount, zero_centered_gamma, is_aligned, NVTE_DELAYED_TENSOR_SCALING, true, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); From d82f67b32b5068a3d5b9038d6ce101059ec1b220 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 27 May 2025 21:30:09 -0700 Subject: [PATCH 11/26] Fix multi-framework runtime lib loading (#1825) * Fix single FW build with multi FW available Signed-off-by: Kirthi Shankar Sivamani * Some fixes Signed-off-by: Kirthi Shankar 
Sivamani * Fixes Signed-off-by: Kirthi Shankar Sivamani * sug Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/common/__init__.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 871723a0e..49395fa23 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -113,9 +113,10 @@ def _get_shared_object_file(library: str) -> Path: # Case 1: Typical user workflow: Both locations are the same, return any result. if te_install_dir == site_packages_dir: - assert ( - so_path_in_install_dir is not None - ), f"Could not find shared object file for Transformer Engine {library} lib." + if so_path_in_install_dir is None: + raise FileNotFoundError( + f"Could not find shared object file for Transformer Engine {library} lib." + ) return so_path_in_install_dir # Case 2: ERR! Both locations are different but returned a valid result. @@ -123,13 +124,12 @@ def _get_shared_object_file(library: str) -> Path: # editable builds. In case developers are executing inside a TE directory via # an inplace build, and then move to a regular build, the local shared object # file will be incorrectly picked up without the following logic. - if so_path_in_install_dir is not None and so_path_in_default_dir is not None: - raise RuntimeError( - f"Found multiple shared object files: {so_path_in_install_dir} and" - f" {so_path_in_default_dir}. Remove local shared objects installed" - f" here {so_path_in_install_dir} or change the working directory to" - "execute from outside TE." - ) + assert so_path_in_install_dir is None or so_path_in_default_dir is None, ( + f"Found multiple shared object files: {so_path_in_install_dir} and" + f" {so_path_in_default_dir}. Remove local shared objects installed" + f" here {so_path_in_install_dir} or change the working directory to" + "execute from outside TE." + ) # Case 3: Typical dev workflow: Editable install if so_path_in_install_dir is not None: @@ -139,7 +139,9 @@ def _get_shared_object_file(library: str) -> Path: if so_path_in_default_dir is not None: return so_path_in_default_dir - raise RuntimeError(f"Could not find shared object file for Transformer Engine {library} lib.") + raise FileNotFoundError( + f"Could not find shared object file for Transformer Engine {library} lib." 
+ ) @functools.lru_cache(maxsize=None) @@ -207,6 +209,7 @@ def load_framework_extension(framework: str): @functools.lru_cache(maxsize=None) def _get_sys_extension(): system = platform.system() + if system == "Linux": extension = "so" elif system == "Darwin": From b1d2539a8ee6603b107aa444e0d2ed7844e26368 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Mon, 6 Oct 2025 12:23:39 -0500 Subject: [PATCH 12/26] Release v2.4_rocm --- transformer_engine/common/normalization/common.cpp | 1 + transformer_engine/common/normalization/layernorm/ln_api.cpp | 4 +++- .../common/normalization/rmsnorm/rmsnorm_api.cpp | 2 +- transformer_engine/pytorch/tensor/float8_tensor.py | 4 ---- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/normalization/common.cpp b/transformer_engine/common/normalization/common.cpp index 029c89e2a..442e11216 100644 --- a/transformer_engine/common/normalization/common.cpp +++ b/transformer_engine/common/normalization/common.cpp @@ -41,6 +41,7 @@ Compute always in FP32 namespace transformer_engine { namespace normalization { +#ifndef __HIP_PLATFORM_AMD__ cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { return training ? cudnn_frontend::NormFwdPhase_t::TRAINING : cudnn_frontend::NormFwdPhase_t::INFERENCE; diff --git a/transformer_engine/common/normalization/layernorm/ln_api.cpp b/transformer_engine/common/normalization/layernorm/ln_api.cpp index e3cdfaf45..9b689ec88 100644 --- a/transformer_engine/common/normalization/layernorm/ln_api.cpp +++ b/transformer_engine/common/normalization/layernorm/ln_api.cpp @@ -67,14 +67,16 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size bool is_aligned = true; #ifndef __HIP_PLATFORM_AMD__ bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp_scaling(z->scaling_mode); +#endif bool gamma_in_weight_dtype = false; +#ifndef __HIP_PLATFORM_AMD__ if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); } else -#else +#endif { norm_backend = NVTE_Norm_Backend::Te; diff --git a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp index c783e1550..4eb5f7496 100644 --- a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp +++ b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp @@ -59,7 +59,7 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens is_delayed_tensor_scaling(z->scaling_mode) || (z->columnwise_data).dptr != nullptr; bool gamma_in_weight_dtype = false; - #ifndef +#ifndef __HIP_PLATFORM_AMD__ if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; diff --git a/transformer_engine/pytorch/tensor/float8_tensor.py b/transformer_engine/pytorch/tensor/float8_tensor.py index fa8e29283..f43a6dd28 100644 --- a/transformer_engine/pytorch/tensor/float8_tensor.py +++ b/transformer_engine/pytorch/tensor/float8_tensor.py @@ -6,12 +6,8 @@ """Tensor class with FP8 data""" from __future__ import annotations -<<<<<<< HEAD import os -from typing import Optional, Tuple, Iterable -======= from typing import Optional, Tuple, Iterable, Union ->>>>>>> 6f5af6ae (Enhance recipe compatibility (#1724)) import warnings from torch.utils.cpp_extension import IS_HIP_EXTENSION From 0e1c8fe6d4e30791ed157527f5b5ef7e437115e7 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Tue, 7 Oct 2025 16:07:30 -0500 Subject: [PATCH 13/26] readd HIP data 
generation --- tests/cpp/test_common.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index e11b32689..9f4c9c3cb 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -731,6 +731,14 @@ std::pair getTolerances(const DType type) { template void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { +#ifdef __HIP_PLATFORM_AMD__ + // TODO: Introduce a parallel RNG library (Random123, PCG, rocRAND) + std::uniform_real_distribution<> dis(-2.0, 1.0); + for (int i = 0; i < size; i++) { + data[i] = static_cast(dis(*gen)); + } + gen->discard(size) +#else // Check how many RNG calls are required to generate one uniform random value int rng_calls_per_val = 0; { @@ -760,6 +768,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { } } gen->discard(size * rng_calls_per_val); +#endif } void fillUniform(Tensor *t) { From 758ed7e3159ba29a25fca12157291cd2633a8428 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Wed, 8 Oct 2025 09:50:33 -0500 Subject: [PATCH 14/26] Missing ; in test_common --- tests/cpp/test_common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 9f4c9c3cb..ccc8ae681 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -737,7 +737,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { for (int i = 0; i < size; i++) { data[i] = static_cast(dis(*gen)); } - gen->discard(size) + gen->discard(size); #else // Check how many RNG calls are required to generate one uniform random value int rng_calls_per_val = 0; From d1b8dba9514c3adcc74a8e839df7fb15e46bfda9 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 31 Oct 2025 16:19:28 -0500 Subject: [PATCH 15/26] [CI] Removed Jax jit workaround, replaced with XLA_FLAGS=--xla_gpu_enable_nccl_comm_splitting=false (#346) --- ci/jax.sh | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/ci/jax.sh b/ci/jax.sh index 80c61ce9b..cc080916c 100755 --- a/ci/jax.sh +++ b/ci/jax.sh @@ -66,33 +66,23 @@ run_test_config() { run_test_config_mgpu() { echo ==== Run mGPU with Fused attention backend: $_fus_attn ==== - _JAX_DISABLE_JIT_FLAG=${JAX_DISABLE_JIT:-0} _ver=$(pip show jaxlib | grep Version) case "$_ver" in *0.4.35*) - # Workaround for distributed tests hang with JIT enabled - JAX_DISABLE_JIT=1 run 3 test_distributed_fused_attn.py -k 'not (test_context_parallel_allgather_attn[BALANCED or test_context_parallel_ring_attn)' - _JAX_DISABLE_JIT_FLAG=1 - - # Run tests that fail with JIT disabled - #run_lbl "allgather_balanced" 3 test_distributed_fused_attn.py -k 'test_context_parallel_allgather_attn[BALANCED' - + # Workaround for distributed tests hang with xla_flag + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_ring_attn' + # Test ring attention with xla_flag --xla_experimental_ignore_channel_id only - # TODO: remove this flag after jax/xla update - XLA_FLAGS="--xla_experimental_ignore_channel_id" run_lbl "parallel_ring" 3 test_distributed_fused_attn.py -k test_context_parallel_ring_attn - ;; - *0.6.*) - # Workaround for distributed tests hang with JIT enabled - JAX_DISABLE_JIT=1 run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_allgather_attn[BALANCED' - _JAX_DISABLE_JIT_FLAG=1 + XLA_FLAGS="--xla_experimental_ignore_channel_id" run_lbl "parallel_ring" 3 
test_distributed_fused_attn.py -k test_context_parallel_ring_attn ;; *) - run 3 test_distributed_fused_attn.py + # Workaround for distributed tests hang with xla_flag + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py ;; esac run_default_fa 3 test_distributed_layernorm.py - JAX_DISABLE_JIT=$_JAX_DISABLE_JIT_FLAG run_default_fa 3 test_distributed_layernorm_mlp.py + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run_default_fa 3 test_distributed_layernorm_mlp.py run_default_fa 3 test_distributed_softmax.py run_default_fa 3 test_sanity_import.py From fa8615df0ab6ae89f36d1577ce5bdffda8253024 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 10 Oct 2025 16:37:32 -0400 Subject: [PATCH 16/26] CI hotfix: IFU test update (#329) --- ci/pytorch.sh | 2 +- tests/pytorch/test_cpu_offloading.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/pytorch.sh b/ci/pytorch.sh index e4f8380f5..207949ee5 100755 --- a/ci/pytorch.sh +++ b/ci/pytorch.sh @@ -58,7 +58,7 @@ run_test_config(){ run_default_fa 1 test_deferred_init.py run_default_fa 1 test_float8tensor.py run_default_fa 1 test_float8_current_scaling_exact.py - run_default_fa 1 test_cpu_offloading.py + test $_fus_attn = auto -o $_fus_attn = ck -o $_fus_attn = aotriton && NVTE_FLASH_ATTN=0 run 1 test_cpu_offloading.py run_default_fa 1 test_fused_rope.py run_default_fa 1 test_fusible_ops.py run_default_fa 3 test_gemm_autotune.py diff --git a/tests/pytorch/test_cpu_offloading.py b/tests/pytorch/test_cpu_offloading.py index ab4b7634b..816df12f6 100644 --- a/tests/pytorch/test_cpu_offloading.py +++ b/tests/pytorch/test_cpu_offloading.py @@ -29,7 +29,7 @@ # Flash attention saves some internal tensor for the backward pass # that cannot be offloaded to CPU. -assert os.getenv("NVTE_FLASH_ATTN") == "0" +assert os.getenv("NVTE_FLASH_ATTN", "1") == "0" # Offloading is supported for attention only for fused and flash attention backends, # so the use of bfloat16 is required. From 08bf8fc836eda32b7064dad2ec2ed7fe89b28b58 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Sat, 18 Oct 2025 23:20:02 -0400 Subject: [PATCH 17/26] Fix and add MXFP8 GEMM test failures (#326) * Fix MXFP8 GEMM test * Fix uninitialized var in GEMM code * Add Dequantize+GEMM test to check MXFP8 scaling tensor layout --- ci/core.sh | 4 +- tests/cpp/CMakeLists.txt | 3 +- tests/cpp/operator/test_cublaslt_gemm.cu | 375 ++++++++++++-------- tests/cpp/test_common.cu | 3 + tests/pytorch/test_gemm_autotune.py | 2 +- transformer_engine/common/gemm/rocm_gemm.cu | 2 +- 6 files changed, 239 insertions(+), 150 deletions(-) diff --git a/ci/core.sh b/ci/core.sh index 0953d7bde..35b4000e9 100755 --- a/ci/core.sh +++ b/ci/core.sh @@ -31,14 +31,14 @@ fi check_test_filter "nongemm" if [ $? -eq 0 ]; then echo ===== Run non GEMM tests ===== - ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "OperatorTest/GEMMTestSuite" + ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite" test $? -eq 0 || test_run_error "non-GEMM" fi check_test_filter "gemm" if [ $? -eq 0 ]; then echo ===== Run GEMM tests ===== - ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "OperatorTest/GEMMTestSuite" + ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite" test $? 
-eq 0 || test_run_error "GEMM" fi diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index da8a37ba8..4ab5fd237 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -64,8 +64,7 @@ else() project(transformer_engine_tests LANGUAGES HIP CXX) # Ask hcc to generate device code during compilation so we can use # host linker to link. - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fno-gpu-rdc -Wno-defaulted-function-deleted") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_FLAGS}") + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fno-gpu-rdc -Wno-defaulted-function-deleted -Wno-unused-result") endif() add_subdirectory(../../3rdparty/googletest ${PROJECT_BINARY_DIR}/googletest) diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu index 7d0597ef7..b731cc701 100644 --- a/tests/cpp/operator/test_cublaslt_gemm.cu +++ b/tests/cpp/operator/test_cublaslt_gemm.cu @@ -3,17 +3,15 @@ * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/ +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "../test_common.h" using namespace transformer_engine; @@ -30,29 +28,17 @@ std::vector> test_case_sizes = { {29, 29, 17389}, //primes }; +std::vector> test_case_sizes_mxfp8 = { + {2304, 768, 4096}, +}; + // A, B, Bias, Gelu, D // Bias type choose as bf16 in use_fp8, D_type otherwise // Gelu type the same as Bias_Type -// {DType::kFloat32, DType::kFloat32, DType::kFloat32, DType::kFloat32, DType::kFloat32}, -// {DType::kFloat16, DType::kFloat16, DType::kFloat16, DType::kFloat16, DType::kFloat16}, -// {DType::kBFloat16, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -} // namespace +using fp32=float; +using fp8=fp8e4m3; +using bf8=fp8e5m2; using Layout = std::pair;// {transa, transb} static const Layout kNN{false,false}; @@ -61,10 
+47,9 @@ static const Layout kNT{false,true }; static const std::vector kLayouts = { kNN, kTN, kNT }; -// , -class GEMMTestSuite - : public ::testing::TestWithParam< - std::tuple, bool, bool, Layout, NVTEScalingMode>> {}; +using TShape = std::vector; +} // namespace + float ref_gelu(float x){ float cdf = 0.5f * (1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x)))); @@ -81,12 +66,14 @@ void compute_ref( const float d_scale, size_t m, size_t k, size_t n, D_Type* ref_d_data, - float* ref_d_amax, + float* ref_d_amax_ptr, Gelu_Type* ref_gelu_data, bool transa, bool transb){ - *ref_d_amax = 0; + float ref_d_amax = 0; + + #pragma omp parallel for schedule(static) collapse(2) reduction(max: ref_d_amax) proc_bind(spread) for(size_t ii = 0; ii < m; ii++){ for(size_t jj = 0; jj < n; jj++){ float val = 0; @@ -106,41 +93,45 @@ void compute_ref( // update ref_d_amax if in fp8 DType dtype = TypeInfo::dtype; if(isFp8Type(dtype)){ - *ref_d_amax = std::max(*ref_d_amax, std::fabs(val)); + ref_d_amax = std::max(ref_d_amax, std::fabs(val)); } } } + if (ref_d_amax_ptr) + { + *ref_d_amax_ptr = ref_d_amax; + } } template void compute_mxfp8_ref( const A_Type* a_data, const B_Type* b_data, - const NVTEShape& a_scale_inv_shape, const fp8e8m0* a_scale_inv_data, - const NVTEShape& b_scale_inv_shape, const fp8e8m0* b_scale_inv_data, const Bias_Type* bias_data, //bias is of dim m const float d_scale, size_t m, size_t k, size_t n, D_Type* ref_d_data, - float* ref_d_amax, + float* ref_d_amax_ptr, Gelu_Type* ref_gelu_data, bool transa, bool transb){ - *ref_d_amax = 0; + float ref_d_amax = 0; + + #pragma omp parallel for schedule(static) collapse(2) reduction(max: ref_d_amax) proc_bind(spread) for(size_t ii = 0; ii < m; ii++){ for(size_t jj = 0; jj < n; jj++){ float val = 0; for(size_t kk = 0; kk < k; kk++){ - float a_val = a_data[ii*k + kk]; - float b_val = b_data[kk + jj*k]; - float a_scale_inv_val = - (float)std::pow(2, a_scale_inv_data[ii * a_scale_inv_shape.data[1] + kk / 32] - 127); - float b_scale_inv_val = - (float)std::pow(2, b_scale_inv_data[kk / 32 + jj * b_scale_inv_shape.data[1]] - 127); - val += a_scale_inv_val * a_val * b_scale_inv_val * b_val; + size_t a_idx = transa ? (ii*k + kk) : (kk*m + ii); + size_t b_idx = transb ? (kk*n + jj) : (jj*k + kk); + float a_scale_inv_val = (float)std::pow(2, + a_scale_inv_data[transa ? a_idx/32 : (kk/32 * m + ii)] - 127); + float b_scale_inv_val = (float)std::pow(2, + b_scale_inv_data[transb ? (kk/32 * n + jj) : b_idx/32] - 127); + val += a_scale_inv_val * (float)a_data[a_idx] * b_scale_inv_val * (float)b_data[b_idx]; } if(bias_data){ val += (float)bias_data[ii]; @@ -153,10 +144,14 @@ void compute_mxfp8_ref( // update ref_d_amax if in fp8 DType dtype = TypeInfo::dtype; if(isFp8Type(dtype)){ - *ref_d_amax = std::max(*ref_d_amax, std::fabs(val)); + ref_d_amax = std::max(ref_d_amax, std::fabs(val)); } } } + if (ref_d_amax_ptr) + { + *ref_d_amax_ptr = ref_d_amax; + } } template @@ -172,6 +167,36 @@ void cpu_rowwise_to_columnwise( } } +std::pair getTestTolerances(const DType type, bool use_fp8, bool use_mxfp8) { + auto [atol, rtol] = getTolerances(type); + + //relax for certain prime number gemm + if (type == DType::kFloat32) { + atol = 1e-5; + } + // relax for certain FP8 gemm with hipblaslt + if (use_mxfp8) { + atol = 5e-4; + /*During hipifying std::max is converted to ::max + to w/a HIP bug with using std:: in device functions. 
+ W/o explicitlit , compiler uses non-templated int method variant from HIP headers + TODO: remove when switch to new hipify version after fixing HIP bug */ + rtol = std::max(rtol, 1e-3); + } + else if (use_fp8) { + atol = 1e-3; + //TODO: remove (see comment above) + rtol = std::max(rtol, 5e-3); + } + else if (type == DType::kBFloat16) { + //relax for certain prime number TN gemm + rtol = 5e-2; + } + else if (type == DType::kFloat32) { + rtol = 1e-5; + } + return {atol, rtol}; +} struct TestParams { size_t m; @@ -258,8 +283,13 @@ void performTest(const TestParams& params) { if (params.use_gelu && dtype == DType::kBFloat16) { GTEST_SKIP() << "BF16 GEMM with GELU is not supported in current config"; } - if (has_fp8 && params.use_bias && dtype == DType::kFloat32) { - GTEST_SKIP() << "FP8 GEMM with bias and FP32 output is not supported in current config"; + if constexpr ((std::is_same::value || std::is_same::value) && + std::is_same::value) + { + //GEMM with bias and fp32 output is not supported with bf8 A/B + if (params.use_bias) { + GTEST_SKIP() << "FP8 GEMM with bias is not supported in current config"; + } } } if (prop.major == 9 && prop.minor == 4) //gfx942 specific hipblasLt limitations @@ -273,49 +303,39 @@ void performTest(const TestParams& params) { } #endif - // pytorch tensor storage is row-major while cublas/hipblaslt is column-major - Tensor A; - if (params.transa){ - A = Tensor("A", std::vector{ params.m, params.k }, atype, true, false, params.scaling_mode); - }else { - // hipblaslt path need fp8-gemm with TN layout - A = Tensor("A", std::vector{ params.k, params.m }, atype, true, isFp8Type(atype), params.scaling_mode); - } - Tensor B; - if (params.transb){ - //hipblaslt path need fp8-gemm with TN layout - B = Tensor("B", std::vector{ params.k, params.n }, btype, true, isFp8Type(btype), params.scaling_mode); - }else { - B = Tensor("B", std::vector{ params.n, params.k }, btype, true, false, params.scaling_mode); - } - Tensor D("D", std::vector{ params.n, params.m }, dtype); + // FP8 GEMM path needs columnwise data for A/B tensor with non TN layout + const bool a_colwise = !params.transa && isFp8Type(atype); + const bool b_colwise = params.transb && isFp8Type(btype); + Tensor A("A", params.transa ? TShape{ params.m, params.k } : TShape{ params.k, params.m }, + atype, (!a_colwise || !use_mxfp8), a_colwise, params.scaling_mode); + Tensor B("B", params.transb ? 
TShape{ params.k, params.n } : TShape{ params.n, params.k }, + btype, (!b_colwise || !use_mxfp8), b_colwise, params.scaling_mode); + + Tensor D("D", TShape{ params.n, params.m }, dtype); Tensor bias; if(params.use_bias){ - bias = Tensor("bias", std::vector{params.m}, bias_type); + bias = Tensor("bias", TShape{params.m}, bias_type); } Tensor pre_gelu_out; if(params.use_gelu){ - pre_gelu_out = Tensor("pre_gelu_out", std::vector{ params.n, params.m }, gelu_type); + pre_gelu_out = Tensor("pre_gelu_out", TShape{ params.n, params.m }, gelu_type); } //initialize the data and scale inv of A, B + //fillUniform does not initialize columnwise data if rowwise data exist fillUniform(&A); - if (isFp8Type(atype) && !params.transa && !use_mxfp8) { + if (a_colwise && !use_mxfp8) { // A must be of shape k, m - cpu_rowwise_to_columnwise( - params.k, params.m, - A.rowwise_cpu_dptr(), - A.columnwise_cpu_dptr()); + cpu_rowwise_to_columnwise(params.k, params.m, + A.rowwise_cpu_dptr(), A.columnwise_cpu_dptr()); // sync the columnwise data on GPU as well A.from_cpu(); } fillUniform(&B); - if (isFp8Type(btype) && params.transb && !use_mxfp8) { - // B must be of shape k, m - cpu_rowwise_to_columnwise( - params.k, params.n, - B.rowwise_cpu_dptr(), - B.columnwise_cpu_dptr()); + if (b_colwise && !use_mxfp8) { + // B must be of shape k, n + cpu_rowwise_to_columnwise(params.k, params.n, + B.rowwise_cpu_dptr(), B.columnwise_cpu_dptr()); // sync the columnwise data on GPU as well B.from_cpu(); } @@ -335,7 +355,7 @@ void performTest(const TestParams& params) { workspace_size = 67108864; } #endif - Tensor Workspace("Workspace", std::vector{ workspace_size }, DType::kByte); + Tensor Workspace("Workspace", TShape{ workspace_size }, DType::kByte); //perform the gemm in GPU nvte_cublas_gemm(A.data(), @@ -370,28 +390,23 @@ void performTest(const TestParams& params) { const A_Type *a_data; const B_Type *b_data; const fp8e8m0 *a_scale_inv_data, *b_scale_inv_data; - NVTEShape a_scale_inv_shape, b_scale_inv_shape; if (params.transa) { a_data = A.rowwise_cpu_dptr(); a_scale_inv_data = A.rowwise_cpu_scale_inv_ptr(); - a_scale_inv_shape = A.rowwise_scale_inv_shape(); } else { a_data = A.columnwise_cpu_dptr(); a_scale_inv_data = A.columnwise_cpu_scale_inv_ptr(); - a_scale_inv_shape = A.columnwise_scale_inv_shape(); } if (params.transb) { b_data = B.columnwise_cpu_dptr(); b_scale_inv_data = B.columnwise_cpu_scale_inv_ptr(); - b_scale_inv_shape = B.columnwise_scale_inv_shape(); } else { b_data = B.rowwise_cpu_dptr(); b_scale_inv_data = B.rowwise_cpu_scale_inv_ptr(); - b_scale_inv_shape = B.rowwise_scale_inv_shape(); } compute_mxfp8_ref( - a_data, b_data, a_scale_inv_shape, a_scale_inv_data, b_scale_inv_shape, b_scale_inv_data, + a_data, b_data, a_scale_inv_data, b_scale_inv_data, params.use_bias ? bias.rowwise_cpu_dptr() : nullptr, D.scale(), params.m, params.k, params.n, ref_D.get(), &ref_amax_d, params.use_gelu ? ref_pre_gelu_out.get() : nullptr, @@ -416,49 +431,91 @@ void performTest(const TestParams& params) { compareResults("D_amax", D.amax(), ref_amax_d, atol_amax, rtol_amax); } - auto [atol, rtol] = getTolerances(dtype); - //relax for certain prime number gemm - if (dtype == DType::kFloat32) { - atol = 1e-5; - } -#ifdef __HIP_PLATFORM_AMD__ - // relax for certain FP8 gemm with hipblaslt - if (use_mxfp8) { - atol = 5e-4; - /*During hipifying std::max is converted to ::max - to w/a HIP bug with using std:: in device functions. 
- W/o explicitlit , compiler uses non-templated int method variant from HIP headers - TODO: remove when switch to new hipify version after fixing HIP bug */ - rtol = std::max(rtol, 1e-3); - } - else if (has_fp8) { - atol = 1e-3; - //TODO: remove (see comment above) - rtol = std::max(rtol, 5e-3); - } - else if (dtype == DType::kBFloat16) { - //relax for certain prime number TN gemm - rtol = 5e-2; - } - else if (dtype == DType::kFloat32) { - rtol = 1e-5; - } -#endif + auto [atol, rtol] = getTestTolerances(dtype, has_fp8, use_mxfp8); compareResults("D", D, ref_D.get(), true, atol, rtol); if(params.use_gelu){ - auto [atol, rtol] = getTolerances(gelu_type); - //relax for certain prime number gemm - if (dtype == DType::kFloat32) { - atol = 1e-5; - } + auto [atol, rtol] = getTestTolerances(gelu_type, false, false); compareResults("gelu", pre_gelu_out, ref_pre_gelu_out.get(), true, atol, rtol); } } -using fp32=float; -using fp8=fp8e4m3; -using bf8=fp8e5m2; +#ifdef __HIP_PLATFORM_AMD__ +template +void performDqTest(const TestParams ¶ms) { + DType atype = TypeInfo::dtype; + DType btype = TypeInfo::dtype; + DType dtype = TypeInfo::dtype; + + GTEST_ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input datatype is expected"; + GTEST_ASSERT_FALSE(isFp8Type(dtype)) << "Non FP8/BF8 output datatype is expected"; + + if (params.m % 32 != 0 || params.n % 32 != 0 || params.k % 32 != 0) { + GTEST_SKIP() << "MXFP8 requires M, N, K to be multiples of 32"; + } + + cudaDeviceProp prop; + (void)cudaGetDeviceProperties(&prop, 0); + + bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5); + if (!mxfp8_supported) { + GTEST_SKIP() << "MXFP8 is not supported in current config"; + } + + DType ref_type = dtype; + TShape a_shape = params.transa ? TShape{params.m, params.k} : TShape{params.k, params.m}; + TShape b_shape = params.transb ? 
TShape{params.k, params.n} : TShape{params.n, params.k}; + + Tensor A_src("A", a_shape, ref_type); + Tensor B_src("B", b_shape, ref_type); + //initialize A, B + fillUniform(&A_src); + fillUniform(&B_src); + + // FP8 GEMM path needs columnwise data for A/B tensor with non TN layout + Tensor A_fp8("A_fp8", a_shape, atype, params.transa, !params.transa, + NVTEScalingMode::NVTE_MXFP8_1D_SCALING); + Tensor B_fp8("B_fp8", b_shape, btype, !params.transb, params.transb, + NVTEScalingMode::NVTE_MXFP8_1D_SCALING); + nvte_quantize(A_src.data(), A_fp8.data(), 0); + nvte_quantize(B_src.data(), B_fp8.data(), 0); + + Tensor A_ref("A_ref", a_shape, ref_type); + Tensor B_ref("B_ref", b_shape, ref_type); + nvte_dequantize(A_fp8.data(), A_ref.data(), 0); + nvte_dequantize(B_fp8.data(), B_ref.data(), 0); + + Tensor bias; + Tensor pre_gelu_out; + + size_t workspace_size = 67108864; + Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte); + + //perform FP8 gemm and copy the output results from GPU memory to CPU memory + Tensor D("D", TShape{params.n, params.m}, dtype); + nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(), bias.data(), pre_gelu_out.data(), + params.transa, params.transb, false, Workspace.data(), false, false, + prop.multiProcessorCount, 0); + D.to_cpu(); + + + //perform non-FP8 gemm and copy the output results from GPU memory to CPU memory + Tensor D_ref("D", TShape{params.n, params.m}, dtype); + nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(), bias.data(), pre_gelu_out.data(), + params.transa, params.transb, false, Workspace.data(), false, false, + prop.multiProcessorCount, 0); + D_ref.to_cpu(); + + // check if error message happens in running + (void)cudaDeviceSynchronize(); + auto err = cudaGetLastError(); + ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); + + //compare results + auto [atol, rtol] = getTestTolerances(dtype, true, true); + compareResults("D", D, D_ref.rowwise_cpu_dptr(), true, atol, rtol); +} +#endif // __HIP_PLATFORM_AMD__ #define MAKE_TEST_PARAMS(P_) \ TestParams P_ = {.m = std::get<0>(std::get<0>(GetParam())), \ @@ -472,10 +529,13 @@ using bf8=fp8e5m2; ? NVTEScalingMode::NVTE_MXFP8_1D_SCALING \ : NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING} +// , use_bias, use_gelu, Layout, fp8_scalinig +class GEMMTestSuite + : public ::testing::TestWithParam< + std::tuple, bool, bool, Layout, NVTEScalingMode>> {}; + #define MAKE_GEMM_TEST(NAME_, A_, B_, BIAS_, GELU_, D_) \ TEST_P(GEMMTestSuite, NAME_) { \ - using namespace transformer_engine; \ - using namespace test; \ MAKE_TEST_PARAMS(test_params); \ using A_Type = A_; \ using B_Type = B_; \ @@ -523,24 +583,51 @@ MAKE_GEMM_TEST(Testbf8xfp8xbf16xbf16xbf8, bf8, fp8, bf16, bf16, bf8); MAKE_GEMM_TEST(Testfp8xfp8xfp16xfp16xfp8, fp8, fp8, fp16, fp16, fp8); -INSTANTIATE_TEST_SUITE_P( - OperatorTest, - GEMMTestSuite, - ::testing::Combine( - ::testing::ValuesIn(test_case_sizes), - ::testing::Values(false, true), //use bias - ::testing::Values(false, true), //use_gelu - ::testing::ValuesIn(kLayouts), //transa,transb - ::testing::Values(false, true)), //use mxfp8 - [](const testing::TestParamInfo& info) { - auto TN = [](bool v){ return v ? 
"T" : "N"; }; - const auto layout = std::get<3>(info.param); - std::string name = std::to_string(std::get<0>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<1>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<2>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<1>(info.param)) + "X" + - std::to_string(std::get<2>(info.param)) + "X" + - TN(layout.first) + TN(layout.second) + "X" + - (std::get<4>(info.param) ? "M" : "S"); - return name; - }); +static inline auto TN(const Layout& layout) { + static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}}; + return std::string(map[layout.first][layout.second]); +} + +static inline auto MKN(const std::tuple& shape) { + return std::to_string(std::get<0>(shape)) + "x" + std::to_string(std::get<1>(shape)) + "x" + + std::to_string(std::get<2>(shape)); +} + +INSTANTIATE_TEST_SUITE_P(OperatorTest, GEMMTestSuite, + ::testing::Combine(::testing::ValuesIn(test_case_sizes), + ::testing::Values(false, true), //use bias + ::testing::Values(false, true), //use_gelu + ::testing::ValuesIn(kLayouts), //transa,transb + ::testing::Values(false, true)), //use mxfp8 + [](const testing::TestParamInfo& info) { + return MKN(std::get<0>(info.param)) + "x" + + std::to_string(std::get<1>(info.param)) + "x" + + std::to_string(std::get<2>(info.param)) + "x" + + TN(std::get<3>(info.param)) + "x" + + (std::get<4>(info.param) ? "M" : "S"); + }); + +#ifdef __HIP_PLATFORM_AMD__ +class DqGEMMTestSuite: public GEMMTestSuite {}; + +#define MAKE_DQ_GEMM_TEST(NAME_, A_, B_, D_) \ + TEST_P(DqGEMMTestSuite, NAME_) { \ + MAKE_TEST_PARAMS(test_params); \ + using A_Type = A_; \ + using B_Type = B_; \ + using D_Type = D_; \ + performDqTest(test_params); \ + } + +MAKE_DQ_GEMM_TEST(Testfp8xfp8xfp16, fp8, fp8, fp16) + +INSTANTIATE_TEST_SUITE_P(OperatorTest, DqGEMMTestSuite, + ::testing::Combine(::testing::ValuesIn(test_case_sizes_mxfp8), + ::testing::Values(false), // bias - unused + ::testing::Values(false), // gelu - unused + ::testing::ValuesIn(kLayouts), //transa,transb + ::testing::Values(true)), //use mxfp8 + [](const testing::TestParamInfo& info) { + return MKN(std::get<0>(info.param)) + "x" + TN(std::get<3>(info.param)); + }); +#endif // __HIP_PLATFORM_AMD__ diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index ccc8ae681..d37900a1f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -454,6 +454,9 @@ void Tensor::set_scale_inv(float scale_inv) { columnwise_cpu_scale_inv_ptr()[0] = scale_inv; } else { std::uniform_int_distribution dis(0, 127); + if (rowwise_) { + from_cpu(); //Need it because scale_inv_ptr getting does to_cpu() + } auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr(); for (size_t i = 0; i < num_scales; i++) { scale_inv_ptr[i] = dis(gen_); diff --git a/tests/pytorch/test_gemm_autotune.py b/tests/pytorch/test_gemm_autotune.py index 562581364..1b54e8464 100644 --- a/tests/pytorch/test_gemm_autotune.py +++ b/tests/pytorch/test_gemm_autotune.py @@ -34,7 +34,7 @@ def analyse_storage(fname): next(reader) head = reader.fieldnames assert ("m" in head and "algo_id" in head and "ws_min" in head and "ws_max" in head - and "aidx" in head), "Invalid CSV format" + ), "Invalid CSV format" return head def read_storage(fname): diff --git a/transformer_engine/common/gemm/rocm_gemm.cu b/transformer_engine/common/gemm/rocm_gemm.cu index dcba674e4..574e8ab7e 100644 --- a/transformer_engine/common/gemm/rocm_gemm.cu +++ b/transformer_engine/common/gemm/rocm_gemm.cu @@ -1089,7 +1089,7 @@ void hipblaslt_gemm(const Tensor 
*inputA, // Note: gelu fusion is available for certain config from rocm 7.0 // amax(D) either (next op is high precision). #if HIPBLASLT_VERSION_MAJOR > 0 || HIPBLASLT_VERSION_MINOR >= 15 - hipblasLtMatmulMatrixScale_t scaling_mode; + hipblasLtMatmulMatrixScale_t scaling_mode = (hipblasLtMatmulMatrixScale_t)0; #else constexpr int scaling_mode = 0; #endif From c6a2c65c2f6a99f1c61f9b7541a1fad933e01549 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:05:35 -0400 Subject: [PATCH 18/26] Fix FFI import. Add distributed tests hang workaround (#347) --- build_tools/jax.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build_tools/jax.py b/build_tools/jax.py index ae8e696c8..4e587b965 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -21,7 +21,12 @@ def xla_path() -> str: Throws FileNotFoundError if XLA source is not found.""" try: - from jax.extend import ffi + import jax + from packaging import version + if version.parse(jax.__version__) >= version.parse("0.5.0"): + from jax import ffi + else: + from jax.extend import ffi except ImportError: if os.getenv("XLA_HOME"): xla_home = Path(os.getenv("XLA_HOME")) From 499d2d86eda09def6b977ef92fddd820bf29a6b1 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:30:36 -0400 Subject: [PATCH 19/26] Make TE ROCm wheels building image directly from manylinix image (#340) * Build ROCm wheels directly from manylinix image * Fix build on top of the latest Manylinix image * Fix build after switching to AITER --- .../wheel_utils/Dockerfile.rocm.manylinux.x86 | 34 ++++++++++++------- build_tools/wheel_utils/build_wheels.sh | 20 +++++++---- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 b/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 index 2b78544df..cf5dbb3bc 100644 --- a/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 +++ b/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 @@ -2,22 +2,32 @@ # # See LICENSE for license information. -# This Dockerfile is used to build TransformerEngine wheels for ROCm on x86_64 architecture. -# It is based on the manylinux_2_28_x86_64 based image with ROCm installed. -ARG BASE_IMAGE=quay.io/pypa/manylinux_2_28_x86_64:non_existent_rocm_tag +# This Dockerfile is used to build TransformerEngine wheels for ROCm on x86_64 architecture +# on top of the manylinux_2_28_x86_64 base image. + +# Build args: +# BASE_IMAGE - Base manylinux image to use. Default: quay.io/pypa/manylinux_2_28_x86_64 +# ROCM_REPO_URL - ROCm repository URL. Default: https://repo.radeon.com/rocm/rhel8/latest/main/ +# GPU_TARGETS - Semicolon separated list of target GPU architectures. Default: "gfx942;gfx950" +# TARGET_BRANCH - Target branch for TransformerEngine. Default: none (use git default) +# GPU_TARGETS and TARGET_BRANCH can be overriden when start a container with NVTE_ROCM_ARCH and TARGET_BRANCH environment variables. 
+ +# Set base image +ARG BASE_IMAGE=quay.io/pypa/manylinux_2_28_x86_64 FROM $BASE_IMAGE -# Setup the build_system repo -RUN echo -e "[build_system]\nname=ROCm\nbaseurl=https://repo.almalinux.org/build_system/8/x86_64/\nenabled=1\ngpgcheck=0" >/etc/yum.repos.d/build_system.repo +ARG ROCM_REPO_URL=https://repo.radeon.com/rocm/rhel8/latest/main/ -# Add and enable repos -RUN dnf update -y || true -RUN dnf install -y epel-release elrepo-release -RUN dnf config-manager --set-enabled build_system powertools extras epel elrepo +# Set up ROCm repo +RUN echo -e "[rocm]\nname=ROCm\nbaseurl=${ROCM_REPO_URL}\nenabled=1\ngpgcheck=0" > /etc/yum.repos.d/rocm.repo + +# Setup packages +RUN dnf install -y --disablerepo=epel rocm-dev hipblaslt hipblaslt-devel hipcub hipcub-devel +RUN dnf group install -y "Development Tools" && dnf install -y git cmake llvm-toolset gcc-toolset-12 + +#Uncomment the next line for ROCm 6.4 cmake workaround: remove newer incomnpatible cmake preinstalled on base image +#RUN rm /usr/local/bin/cmake || true -# Setup dev packages -RUN dnf group install -y "Development Tools" && \ - dnf install -y git cmake llvm-toolset hipblaslt hipblaslt-devel gcc-toolset-12 RUN dnf clean all RUN rm -rf /var/cache/dnf/* diff --git a/build_tools/wheel_utils/build_wheels.sh b/build_tools/wheel_utils/build_wheels.sh index 5320b8a39..5d37ae1d9 100644 --- a/build_tools/wheel_utils/build_wheels.sh +++ b/build_tools/wheel_utils/build_wheels.sh @@ -30,11 +30,17 @@ fi ROCM_BUILD=`${PYBINDIR}python -c "import build_tools.utils as u; print(int(u.rocm_build()))"` +if [ "$LOCAL_TREE_BUILD" != "1" ]; then + if [ "$ROCM_BUILD" = "1" ]; then + git pull + fi + git checkout $TARGET_BRANCH + git submodule update --init --recursive +fi + if [ "$ROCM_BUILD" = "1" ]; then - git pull + ${PYBINDIR}pip install setuptools wheel fi -git checkout $TARGET_BRANCH -git submodule update --init --recursive if $BUILD_METAPACKAGE ; then cd /TransformerEngine @@ -50,10 +56,10 @@ if $BUILD_COMMON ; then WHL_BASE="transformer_engine-${VERSION}" if [ "$ROCM_BUILD" = "1" ]; then TE_CUDA_VERS="rocm" - ${PYBINDIR}pip install ninja dataclasses - if [ -n "$PYBINDIR" ]; then - PATH="$PYBINDIR:$PATH" #hipify expects python in PATH - fi + #dataclasses, psutil are needed for AITER + ${PYBINDIR}pip install ninja dataclasses psutil + #hipify expects python in PATH, also ninja may be installed to python bindir + test -n "$PYBINDIR" && PATH="$PYBINDIR:$PATH" || true else TE_CUDA_VERS="cu12" PYBINDIR=/opt/python/cp38-cp38/bin/ From 235b9b6525eb57ab6b1b8f34d1c36b05c0746de5 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar <125210283+VeeraRajasekhar@users.noreply.github.com> Date: Fri, 31 Oct 2025 10:42:03 -0500 Subject: [PATCH 20/26] [CI] Hotfix test_gemm_autotune update (#353) --- transformer_engine/common/gemm/rocm_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/rocm_gemm.cu b/transformer_engine/common/gemm/rocm_gemm.cu index 574e8ab7e..9de4cfad7 100644 --- a/transformer_engine/common/gemm/rocm_gemm.cu +++ b/transformer_engine/common/gemm/rocm_gemm.cu @@ -750,8 +750,8 @@ protected: std::getline(is, type_b, csv_sep); std::getline(is, type_d, csv_sep); std::getline(is, bias_type, csv_sep); - is >> cfg.lda >> c >> cfg.ldb >> c >> cfg.ldd >> c >> cfg.scaling_mode >> c; std::getline(is, aux_type, csv_sep); + is >> cfg.lda >> c >> cfg.ldb >> c >> cfg.ldd >> c >> cfg.scaling_mode >> c; std::getline(is, epi, csv_sep); std::getline(is, comp, csv_sep); std::getline(is, scale, csv_sep); From 
bcae45934fd5f2133801d5c3094e5001d7a41131 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Fri, 31 Oct 2025 10:43:35 -0500 Subject: [PATCH 21/26] MXFP8 test scale off by 1 fix (#338) * MXFP8 test scale off by 1 fix --- tests/cpp/operator/test_cast_mxfp8.cu | 47 ++++++++++++- .../operator/test_cast_mxfp8_gated_swiglu.cu | 44 +++++++++++- tests/cpp/test_common.cu | 68 +++++++++++++++++++ tests/cpp/test_common.h | 9 +++ 4 files changed, 163 insertions(+), 5 deletions(-) diff --git a/tests/cpp/operator/test_cast_mxfp8.cu b/tests/cpp/operator/test_cast_mxfp8.cu index 33a9b8629..6855c9487 100644 --- a/tests/cpp/operator/test_cast_mxfp8.cu +++ b/tests/cpp/operator/test_cast_mxfp8.cu @@ -76,12 +76,12 @@ void scale_block(const ProcessingMethod processing_method, continue; } amax = std::max(amax, std::abs(elt)); -#else +#else // #ifdef __HIP_PLATFORM_AMD__ if (std::isinf(elt) || std::isnan(elt)) { continue; } amax = fmaxf(amax, fabsf(elt)); -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ } } @@ -312,6 +312,23 @@ void performTest_x1(const ProcessingMethod processing_method, block_size_cols, scales_stride); + +#ifdef __HIP_PLATFORM_AMD__ + if (processing_method != ProcessingMethod::CAST_ONLY) { + std::vector> mismatch_idx; + compare_e8m0_scaling_factors("scales", output_c, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, rowwise, mismatch_idx); + + if (mismatch_idx.size()) { + adjust_ref(mismatch_idx, ref_output_c.get(), unpadded_blocks_Y, unpadded_blocks_X, rows, cols, otype); + } + + auto [atol, rtol] = getTolerances(otype); + compareResults("output_c", output_c, ref_output_c.get(), rowwise, atol, rtol); + } + else +#endif // #ifdef __HIP_PLATFORM_AMD__ + { auto [atol, rtol] = getTolerances(otype); compareResults("output_c", output_c, ref_output_c.get(), rowwise, atol, rtol); @@ -321,6 +338,7 @@ void performTest_x1(const ProcessingMethod processing_method, compare_e8m0_scaling_factors("scales", gpu_scales_ptr, ref_output_scales.get(), unpadded_blocks_Y, unpadded_blocks_X, scales_stride); + } if (processing_method == ProcessingMethod::CAST_DBIAS || processing_method == ProcessingMethod::CAST_DBIAS_DACT) { auto [atol_dbias, rtol_dbias] = getTolerances(itype); @@ -454,7 +472,29 @@ void performTest_x2(const ProcessingMethod processing_method, block_size_cols, scales_stride_rowwise, scales_stride_colwise); +#ifdef __HIP_PLATFORM_AMD__ + if (processing_method != ProcessingMethod::CAST_ONLY) { + std::vector> mismatch_idx_r; + compare_e8m0_scaling_factors("scales_rowwise", output, ref_scales_rowwise.get(), + unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, scales_stride_rowwise, 0.01, true, mismatch_idx_r); + + if (mismatch_idx_r.size()) { + adjust_ref(mismatch_idx_r, ref_output_c_rowwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, rows, cols, otype); + } + std::vector> mismatch_idx_c; + compare_e8m0_scaling_factors("scales_colwise", output, ref_scales_colwise.get(), + unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise, 0.01, false, mismatch_idx_c); + + if (mismatch_idx_c.size()) { + adjust_ref(mismatch_idx_c, ref_output_c_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, rows, cols, otype); + } + auto [atol, rtol] = getTolerances(otype); + compareResults("output_c_rowwise", output, ref_output_c_rowwise.get(), true, atol, rtol); + compareResults("output_c_colwise", output, ref_output_c_colwise.get(), false, atol, rtol); + } else +#endif // #ifdef __HIP_PLATFORM_AMD__ + { auto [atol, rtol] = 
getTolerances(otype); compareResults("output_c_rowwise", output, ref_output_c_rowwise.get(), true, atol, rtol); compareResults("output_c_colwise", output, ref_output_c_colwise.get(), false, atol, rtol); @@ -464,6 +504,7 @@ void performTest_x2(const ProcessingMethod processing_method, compare_e8m0_scaling_factors("scales_colwise", output.columnwise_cpu_scale_inv_ptr(), ref_scales_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise); + } if (processing_method == ProcessingMethod::CAST_DBIAS || processing_method == ProcessingMethod::CAST_DBIAS_DACT) { auto [atol_dbias, rtol_dbias] = getTolerances(itype); @@ -563,7 +604,7 @@ TEST_P(FusedCastMXFP8TestSuite, TestFusedCastMXFP8) { if (getDeviceComputeCapability() < blackwellComputeCapability) { GTEST_SKIP(); } -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ using namespace transformer_engine; using namespace test; diff --git a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu index 0e43c2c9d..96663e752 100644 --- a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu @@ -262,9 +262,24 @@ void performTest_x1(const size_t rows, block_size_rows, block_size_cols, scales_stride); +#ifdef __HIP_PLATFORM_AMD__ + std::vector> mismatch_idx; + if (rowwise) { + compare_e8m0_scaling_factors("rowwise scales", output, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, true, mismatch_idx); + } else { + compare_e8m0_scaling_factors("colwise scales", output, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, false, mismatch_idx); + } + if (mismatch_idx.size()) { + adjust_ref(mismatch_idx, ref_output.get(), unpadded_blocks_Y, unpadded_blocks_X, rows, cols, otype); + } auto [atol, rtol] = getTolerances(otype); compareResults("output", output, ref_output.get(), rowwise, atol, rtol); +#else // #ifdef __HIP_PLATFORM_AMD__ + auto [atol, rtol] = getTolerances(otype); + compareResults("output", output, ref_output.get(), rowwise, atol, rtol); const uint8_t * const gpu_scales_ptr = rowwise ? 
output.rowwise_cpu_scale_inv_ptr() @@ -276,6 +291,7 @@ void performTest_x1(const size_t rows, compare_e8m0_scaling_factors("colwise scales", gpu_scales_ptr, ref_output_scales.get(), unpadded_blocks_Y, unpadded_blocks_X, scales_stride); } +#endif // #ifdef __HIP_PLATFORM_AMD__ } /** @@ -361,17 +377,41 @@ void performTest_x2(const size_t rows, block_size_cols, scales_stride_rowwise, scales_stride_colwise); +#ifdef __HIP_PLATFORM_AMD__ + std::vector> mismatch_idx_r; + compare_e8m0_scaling_factors("scales_rowwise", output, + ref_scales_rowwise.get(), unpadded_blocks_Y_rowwise, + unpadded_blocks_X_rowwise, scales_stride_rowwise, 0.01, true, mismatch_idx_r); + + if (mismatch_idx_r.size()) { + adjust_ref(mismatch_idx_r, ref_output_colwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, rows, cols, otype); + } + + std::vector> mismatch_idx_c; + compare_e8m0_scaling_factors("scales_colwise", output, + ref_scales_colwise.get(), unpadded_blocks_Y_colwise, + unpadded_blocks_X_colwise, scales_stride_colwise, 0.01, false, mismatch_idx_c); + + if (mismatch_idx_c.size()) { + adjust_ref(mismatch_idx_c, ref_output_rowwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, rows, cols, otype); + } auto [atol, rtol] = getTolerances(otype); auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("output_c_rowwise", output, ref_output_rowwise.get(), true, atol, rtol); compareResults("output_c_colwise", output, ref_output_colwise.get(), false, atol, rtol); +#else // #ifdef __HIP_PLATFORM_AMD__ + auto [atol, rtol] = getTolerances(otype); + auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); + compareResults("output_c_rowwise", output, ref_output_rowwise.get(), true, atol, rtol); + compareResults("output_c_colwise", output, ref_output_colwise.get(), false, atol, rtol); compare_e8m0_scaling_factors("scales_rowwise", output.rowwise_cpu_scale_inv_ptr(), ref_scales_rowwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, scales_stride_rowwise); compare_e8m0_scaling_factors("scales_colwise", output.columnwise_cpu_scale_inv_ptr(), ref_scales_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise); +#endif // #ifdef __HIP_PLATFORM_AMD__ } std::vector> matrix_sizes = { @@ -418,12 +458,12 @@ class CastMXFP8_GatedActTestSuite : public ::testing::TestWithParam TEST_P(CastMXFP8_GatedActTestSuite, TestCastMXFP8Swiglu) { #ifdef __HIP_PLATFORM_AMD__ omp_set_num_threads(std::min(128, omp_get_max_threads())); // Using threads = # of vcpus causes occasional errors. -#else +#else // #ifdef __HIP_PLATFORM_AMD__ // Skip tests for pre-Blackwell architectures if (getDeviceComputeCapability() < blackwellComputeCapability) { GTEST_SKIP(); } -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ using namespace transformer_engine; diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index d37900a1f..d3dd6e95f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -714,6 +714,74 @@ void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, } } +#ifdef __HIP_PLATFORM_AMD__ +void compare_e8m0_scaling_factors(const std::string &name, Tensor &output, const uint8_t *ref, + const size_t row_blocks, const size_t col_blocks, const size_t stride, + double tol, bool rowwise, std::vector> &mismatch_idx) { + const uint8_t *const test = rowwise ? 
output.rowwise_cpu_scale_inv_ptr() + : output.columnwise_cpu_scale_inv_ptr(); + + const float scale_tol = std::max(1.f, row_blocks * col_blocks * tol); + + for (int i = 0; i < row_blocks; i++) { + for (int j = 0; j < col_blocks; j++) { + const int idx = i * stride + j; + if (test[idx] != ref[idx]) { + int t_scale = static_cast(test[idx]); + int r_scale = static_cast(ref[idx]); + if (std::abs(t_scale - r_scale) == 1) { + mismatch_idx.emplace_back(i, j, r_scale-t_scale); + } else { + GTEST_FAIL() << "Error in " << name << std::endl + << "Mismatch: " << t_scale << " vs " + << r_scale << " at index " << idx; + } + } + } + } + const size_t scale_mismatches = mismatch_idx.size(); + + ASSERT_FALSE(scale_mismatches > scale_tol) + << "Error in " << name << std::endl << std::setprecision(4) + << "Total scale mismatches: " << scale_mismatches << " (" << 100.*(double)scale_mismatches/(double)(row_blocks*col_blocks) + << "%) Exceeds tolerance of " << scale_tol << " (" << 100.*tol << "%) mismatches"; + + if (scale_mismatches) { + std::cout << "\x1b[33mWARNING:\x1b[0m " << scale_mismatches + << " scale mismatches were found. This does not imply an accuracy issue." << std::endl; + } +} + +void adjust_ref(std::vector> mismatch_idx, void *ref, const size_t row_blocks, + const size_t col_blocks, const size_t rows, const size_t cols, DType otype) { + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( otype, T, + T *ref_data = reinterpret_cast(ref); + double scale_val; + const size_t col_blocks_size = cols / col_blocks; + const size_t row_blocks_size = rows / row_blocks; + for (const auto &[i, j, scale_diff] : mismatch_idx) { + if (scale_diff == 1) { + scale_val = 2.; + } else if (scale_diff == -1) { + scale_val = .5; + } else { // Shouldn't ever reach this + GTEST_FAIL() << "Error in adjust_ref, |scale_diff| > 1"; + } + size_t ii_min = i * row_blocks_size; + const size_t ii_max = std::min(ii_min + row_blocks_size, rows); + for (; ii_min < ii_max; ii_min++) { + size_t jj_min = j * col_blocks_size; + const size_t jj_max = std::min(jj_min + col_blocks_size, cols); + for (; jj_min < jj_max; jj_min++) { + const size_t data_idx = ii_min * cols + jj_min; + ref_data[data_idx] = static_cast(static_cast(ref_data[data_idx]) * scale_val); + } + } + } + ); // NOLINT(*) +} +#endif // #ifdef __HIP_PLATFORM_AMD__ + std::pair getTolerances(const DType type) { switch(type) { case DType::kFloat32: diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 7ac2b75a6..6b9514d38 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -19,6 +19,7 @@ #else #include #include "amd_detail/hip_float8.h" +#include #endif #include @@ -461,6 +462,14 @@ void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const size_t row_blocks, const size_t col_blocks, const size_t stride); void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref, const size_t N); +#ifdef USE_ROCM +void compare_e8m0_scaling_factors(const std::string &name, Tensor &output, const uint8_t *ref, + const size_t row_blocks, const size_t col_blocks, const size_t stride, + double tol, bool rowwise, std::vector> &mismatch_idx); + +void adjust_ref(std::vector> mismatch_idx, void *ref, const size_t row_blocks, + const size_t col_blocks, const size_t rows, const size_t cols, DType otype); +#endif std::array get_scale_tensor_dims(const size_t rows, const size_t cols, const size_t block_size_rows, const size_t block_size_cols); From 34b1a3495da1f0f57a16707ec5bbd1018ee773d7 Mon Sep 17 00:00:00 2001 From: 
ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Fri, 7 Nov 2025 12:46:55 -0500 Subject: [PATCH 22/26] CI: allow numpy 2.0 (#366) (cherry picked from commit 6b8a47d16a6b5b7ba162238ef05ca0214621ef3d) --- ci/pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pytorch.sh b/ci/pytorch.sh index 207949ee5..93b9ded7f 100755 --- a/ci/pytorch.sh +++ b/ci/pytorch.sh @@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch #: ${TEST_WORKERS:=4} install_prerequisites() { - pip install 'numpy>=1.22.4,<2.0' pandas + pip install 'numpy>=1.22.4' pandas rc=$? if [ $rc -ne 0 ]; then script_error "Failed to install test prerequisites" From 736ab30857d9e26510fcffa77a05a7360b7831c9 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:48:05 -0500 Subject: [PATCH 23/26] Relax tolerance to pass 29x29x17389NT GEMM on MI350 (#365) (cherry picked from commit 9a987f8d391a8b3dbad21d279899b53cbcbe55b7) --- tests/cpp/operator/test_cublaslt_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu index b731cc701..1ef3f7ee3 100644 --- a/tests/cpp/operator/test_cublaslt_gemm.cu +++ b/tests/cpp/operator/test_cublaslt_gemm.cu @@ -186,7 +186,7 @@ std::pair getTestTolerances(const DType type, bool use_fp8, bool else if (use_fp8) { atol = 1e-3; //TODO: remove (see comment above) - rtol = std::max(rtol, 5e-3); + rtol = std::max(rtol, 1e-2); } else if (type == DType::kBFloat16) { //relax for certain prime number TN gemm From baed0d1e146ddfc3246e104868d2a9cfd596597f Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Sun, 12 Oct 2025 10:17:59 -0400 Subject: [PATCH 24/26] Bring back aiter solib with aiter update (#327) * AITER solib with commit fc3c0420 * [ROCm] api call fix and disable v3 fwd with swa (#331) * [ROCm] update aiter commit with gfx950 fix and swa fwd fix --------- Co-authored-by: Ye Wang (cherry picked from commit b08a1ed9273ccf641d58fe0a7093e1e1dcf6c2b2) --- .gitignore | 2 - 3rdparty/aiter | 2 +- setup.py | 30 +-- transformer_engine/common/CMakeLists.txt | 13 +- .../common/ck_fused_attn/CMakeLists.txt | 176 +++--------------- .../ck_fused_attn/src/ck_fused_attn_bwd.cpp | 4 +- .../ck_fused_attn/src/ck_fused_attn_fwd.cpp | 25 ++- .../common/fused_attn_rocm/fused_attn_ck.cpp | 1 + 8 files changed, 55 insertions(+), 198 deletions(-) diff --git a/.gitignore b/.gitignore index 44de0a19e..874eed018 100644 --- a/.gitignore +++ b/.gitignore @@ -52,7 +52,5 @@ compile_commands.json **/profiler_outputs/ **/times.csv tensor_dumps/ -aiter/ transformer_engine/build_info.txt transformer_engine/common/util/hip_nvml.* -transformer_engine/aiter/ diff --git a/3rdparty/aiter b/3rdparty/aiter index a2ca1b460..74e71eb8e 160000 --- a/3rdparty/aiter +++ b/3rdparty/aiter @@ -1 +1 @@ -Subproject commit a2ca1b460f097a309ee5a128c7454b1c419dc331 +Subproject commit 74e71eb8ee8a663d5e33c0cfd8b4dad7708ae84b diff --git a/setup.py b/setup.py index 0012844a8..b7b234ba3 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ from setuptools.command.build_ext import build_ext as BuildExtension -from setuptools.command.develop import develop as _develop os.environ["NVTE_PROJECT_BUILDING"] = "1" @@ -48,26 +47,6 @@ if not rocm_build(): archs = cuda_archs() -# A custom develop command only used for ROCm builds -class develop(_develop): - def run(self): - super().run() - if ( - int(os.getenv("NVTE_FUSED_ATTN_CK", "1")) and - 
int(os.getenv("NVTE_FUSED_ATTN", "1")) - ): - # Ensure that the AITER ASM kernels are properly available at runtime - # by creating a symlink to them. This is only necessary for editable - # mode since our C++ code assumes the AITER ASM kernel paths relative - # to trasnformer_engine.so, which is different in editable installs. - project_dir = Path(__file__).parent - asm_src_dir = project_dir / 'transformer_engine' / 'aiter' - # Must be synced with - # TransformerEngine/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp - asm_target_dir = project_dir / 'aiter' - if asm_src_dir.is_dir() and not asm_target_dir.is_dir(): - asm_target_dir.symlink_to(asm_src_dir) - class TimedBdist(bdist_wheel): """Helper class to measure build time""" @@ -89,7 +68,7 @@ def setup_common_extension() -> CMakeExtension: cmake_flags.append(f"-DCK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT={os.getenv('NVTE_CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT', 3)}") if os.getenv("NVTE_CK_FUSED_ATTN_PATH"): ck_path = Path(os.getenv("NVTE_CK_FUSED_ATTN_PATH")) - cmake_flags.append(f"-DCK_FUSED_ATTN_PATH={ck_path}") + cmake_flags.append(f"-DAITER_MHA_PATH={ck_path}") if int(os.getenv("NVTE_FUSED_ATTN_AOTRITON", "1"))==0 or int(os.getenv("NVTE_FUSED_ATTN", "1"))==0: cmake_flags.append("-DUSE_FUSED_ATTN_AOTRITON=OFF") if int(os.getenv("NVTE_FUSED_ATTN_CK", "1"))==0 or int(os.getenv("NVTE_FUSED_ATTN", "1"))==0: @@ -192,7 +171,6 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: with open("README.rst", encoding="utf-8") as f: long_description = f.read() - cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist} # Settings for building top level empty package for dependency management. if bool(int(os.getenv("NVTE_BUILD_METAPACKAGE", "0"))): assert bool( @@ -200,6 +178,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ), "NVTE_RELEASE_BUILD env must be set for metapackage build." 
te_cuda_vers = "rocm" if rocm_build() else "cu12" ext_modules = [] + cmdclass = {} package_data = {} include_package_data = False setup_requires = [] @@ -211,8 +190,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: else: setup_requires, install_requires, test_requires = setup_requirements() ext_modules = [setup_common_extension()] - if rocm_build(): - cmdclass["develop"] = develop + cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist} package_data = {"": ["VERSION.txt"]} include_package_data = True extras_require = {"test": test_requires} @@ -255,7 +233,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: long_description=long_description, long_description_content_type="text/x-rst", ext_modules=ext_modules, - cmdclass=cmdclass, + cmdclass={"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist}, python_requires=">=3.8, <3.13", classifiers=[ "Programming Language :: Python :: 3.8", diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a9e2e056e..f70c9f8bb 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -351,18 +351,7 @@ else() endif() if(USE_FUSED_ATTN_CK) - if(NOT DEFINED CK_FUSED_ATTN_PATH) - set(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT ${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT} CACHE STRING "ck float to bf16 conversion rounding") - add_subdirectory(ck_fused_attn ${CMAKE_CURRENT_BINARY_DIR}/ck_fused_attn) - else() - # Use CK built during initial TE building/installation - # When only need rebuild TE library itself - unset(CK_FUSED_ATTN_LIB CACHE) - find_library(CK_FUSED_ATTN_LIB NAMES ck_fused_attn PATHS ${CK_FUSED_ATTN_PATH}/lib REQUIRED NO_DEFAULT_PATH) - add_library( ck_fused_attn STATIC IMPORTED ) - set_target_properties( ck_fused_attn PROPERTIES IMPORTED_LOCATION ${CK_FUSED_ATTN_LIB} ) - target_include_directories(ck_fused_attn INTERFACE ${CK_FUSED_ATTN_PATH}/include) - endif() + add_subdirectory(ck_fused_attn ${CMAKE_CURRENT_BINARY_DIR}/ck_fused_attn) endif() find_package(hip) diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt index 2a2afa328..c44a930e6 100644 --- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt +++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt @@ -1,20 +1,15 @@ # Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: MIT -#TODO: compile to a shared library -cmake_minimum_required(VERSION 3.28) -set(CMAKE_CXX_STANDARD 20) -#TODO: remove after figuring out how to install clang-scan-deps -set(CMAKE_CXX_SCAN_FOR_MODULES OFF) +cmake_minimum_required(VERSION 3.21) +set(CMAKE_CXX_STANDARD 17) project(ck_fused_attn LANGUAGES HIP CXX) -# remove files that should be regenerated -file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp ${CMAKE_CURRENT_BINARY_DIR}/gen_src/blob_list.txt) -# create gen_src and gen_src/tmp directories if needed -file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp) +set(AITER_MHA_INSTALL_PREFIX "transformer_engine" CACHE STRING "aiter mha shared lib install prefix in TE") set(__AITER_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/aiter") +set(__AITER_TEST_DIR "${__AITER_SOURCE_DIR}/op_tests/cpp/mha") set(__CK_SOURCE_DIR "${__AITER_SOURCE_DIR}/3rdparty/composable_kernel") # so far, there are only gfx942 and gfx950 v3 kernels @@ -37,82 +32,22 @@ message(STATUS "AITER V3_ASM_ARCHS: ${V3_ASM_ARCHS}") list(JOIN V3_ASM_ARCHS ";" V3_ASM_ARCHS_STR) set(ENV{GPU_ARCHS} "${V3_ASM_ARCHS_STR}") -# generate v2 (CK) kernels -# fwd kernels list -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_blob_list.txt --receipt 600 -) -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd_splitkv --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_splitkv_blob_list.txt --receipt 600 -) -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api batch_prefill --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_batch_prefill_blob_list.txt --receipt 600 -) - -# bwd kernels list -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/bwd_blob_list.txt --receipt 600 -) - -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_splitkv_blob_list.txt FMHA_FWD_SPLITKV_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_batch_prefill_blob_list.txt FMHA_FWD_BATCH_PREFILL_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS) - -# generate the actual fwd kernel cpp files -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd_splitkv --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api batch_prefill --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -# generate the aiter fwd interface cpp file -execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/csrc/cpp_itfs/mha_fwd_generate.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 5 -) - -# generate the actual bwd kernel cpp files -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -# generate the aiter bwd interface cpp file -execute_process( - COMMAND python3 
${__AITER_SOURCE_DIR}/csrc/py_itfs_cu/fmha_bwd_pre_post_kernel_generate.py - --filter *@*_ndeterministic@*_nbias*_dropout*_ndeterministic* --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp -) - -execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/csrc/cpp_itfs/mha_bwd_generate.py - --receipt 3 --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp -) - -# generate fwd/bwd v3 kernels for each requested rocm arch -foreach(CK_TARGET_ARCH IN LISTS V3_ASM_ARCHS) - execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/hsa/${CK_TARGET_ARCH}/fmha_v3_fwd/codegen.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp - ) +if(NOT DEFINED AITER_MHA_PATH) + # delete the existing aiter/jit/build dir for a clean build + file(REMOVE_RECURSE "${__AITER_SOURCE_DIR}/aiter/jit/build") + # compile the libmha_fwd.so and libmha_bwd.so + set(ENV{AITER_LOG_MORE} 1) + # fp32 to bf16 cvt env still required for MI300X + set(ENV{CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT} ${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT}) execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/hsa/${CK_TARGET_ARCH}/fmha_v3_bwd/codegen.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp + COMMAND python3 ${__AITER_TEST_DIR}/compile.py ) -endforeach() + # libmha_fwd.so and libmha_bwd.so will be under 3rdparty/aiter/op_tests/cpp/mha + set(__AITER_MHA_PATH ${__AITER_TEST_DIR}) +else() + # use pre-built libmha_fwd.so libmha_bwd.so + set(__AITER_MHA_PATH ${AITER_MHA_PATH}) +endif() set(ck_fused_attn_SOURCES) list(APPEND ck_fused_attn_SOURCES @@ -120,75 +55,18 @@ list(APPEND ck_fused_attn_SOURCES src/ck_fused_attn_bwd.cpp src/ck_fused_attn_utils.cpp) -foreach(blob ${FMHA_FWD_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_GEN_BLOBS}) - -foreach(blob ${FMHA_FWD_SPLITKV_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_SPLITKV_GEN_BLOBS}) - -foreach(blob ${FMHA_FWD_BATCH_PREFILL_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_BATCH_PREFILL_GEN_BLOBS}) - -foreach(blob ${FMHA_BWD_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_BWD_GEN_BLOBS}) - -# add generated cpp files into ck_fused_attn_sources -set(MHA_BWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/mha_bwd.cpp") -set(MHA_FWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/mha_fwd.cpp") - -file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${MHA_BWD_SRC}) -file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${MHA_BWD_SRC} ONLY_IF_DIFFERENT) - -file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${MHA_FWD_SRC}) -file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${MHA_FWD_SRC} ONLY_IF_DIFFERENT) - -list(APPEND ck_fused_attn_SOURCES ${MHA_BWD_SRC} ${MHA_FWD_SRC}) - -foreach(CK_TARGET_ARCH IN LISTS V3_ASM_ARCHS) - set(ASM_MHA_FWD_SRC 
"${CMAKE_CURRENT_BINARY_DIR}/gen_src/asm_fmha_fwd_v3_${CK_TARGET_ARCH}.cpp") - set(ASM_MHA_BWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/asm_fmha_bwd_v3_${CK_TARGET_ARCH}.cpp") - - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${ASM_MHA_BWD_SRC}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${ASM_MHA_BWD_SRC} ONLY_IF_DIFFERENT) - - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${ASM_MHA_FWD_SRC}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${ASM_MHA_FWD_SRC} ONLY_IF_DIFFERENT) - list(APPEND ck_fused_attn_SOURCES ${ASM_MHA_BWD_SRC} ${ASM_MHA_FWD_SRC}) -endforeach() - -# remove all previously generated temporary files -file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp) - message(STATUS "Found the following fused attention files:") foreach(file ${ck_fused_attn_SOURCES}) message(STATUS " ${file}") endforeach() -add_library(ck_fused_attn STATIC ${ck_fused_attn_SOURCES}) +add_library(ck_fused_attn SHARED ${ck_fused_attn_SOURCES}) set(CK_FUSED_ATTN_COMPILE_OPTIONS) list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS - -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -DCK_TILE_FMHA_FWD_SPLITKV_API=1-DCK_TILE_FMHA_FWD_APPENDKV_API=0 - -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT} - -fgpu-flush-denormals-to-zero -ftemplate-backtrace-limit=0 -fPIC - -Wno-undefined-func-template -Wno-float-equal -Wno-gnu-line-marker -Wunused-variable -Wuninitialized - "SHELL:-mllvm -enable-post-misched=0" "SHELL:-mllvm -amdgpu-early-inline-all=true" - "SHELL:-mllvm -amdgpu-function-calls=false" "SHELL:-mllvm -amdgpu-coerce-illegal-types=1" - "SHELL:-mllvm --amdgpu-kernarg-preload-count=16") + -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT}) -foreach(CK_TARGET_ARCH IN LISTS CMAKE_HIP_ARCHITECTURES) - list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS --offload-arch=${CK_TARGET_ARCH}) +foreach(ARCH IN LISTS V3_ASM_ARCHS) + list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS --offload-arch=${ARCH}) endforeach() set(CK_INCLUDE_DIR "${__CK_SOURCE_DIR}/include") @@ -216,18 +94,22 @@ target_include_directories(ck_fused_attn PRIVATE ${CK_INCLUDE_DIR} ${__CK_SOURCE target_include_directories(ck_fused_attn PRIVATE ${AITER_INCLUDE_DIR}) find_package(hip) -list(APPEND ck_fused_attn_LINKER_LIBS hip::host hip::device roctx64) +list(APPEND ck_fused_attn_LINKER_LIBS hip::host hip::device roctx64 ${__AITER_MHA_PATH}/libmha_fwd.so ${__AITER_MHA_PATH}/libmha_bwd.so) target_link_libraries(ck_fused_attn PUBLIC ${ck_fused_attn_LINKER_LIBS}) target_compile_options(ck_fused_attn PRIVATE ${CK_FUSED_ATTN_COMPILE_OPTIONS}) +set_target_properties(ck_fused_attn PROPERTIES INSTALL_RPATH "$ORIGIN") +install(FILES ${__AITER_MHA_PATH}/libmha_fwd.so ${__AITER_MHA_PATH}/libmha_bwd.so DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) +install(TARGETS ck_fused_attn DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) # copy v3 kernels to destination foreach(ARCH IN LISTS V3_ASM_ARCHS) install(DIRECTORY ${__AITER_SOURCE_DIR}/hsa/${ARCH}/fmha_v3_fwd - DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine/aiter/${ARCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib/aiter/${ARCH}/ PATTERN "codegen.py" EXCLUDE) install(DIRECTORY ${__AITER_SOURCE_DIR}/hsa/${ARCH}/fmha_v3_bwd - DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine/aiter/${ARCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib/aiter/${ARCH}/ PATTERN "codegen.py" EXCLUDE) endforeach() + diff --git 
a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp index 840db7b86..2b717ace0 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp @@ -920,8 +920,8 @@ hipError_t ck_attn_varlen_bwd( cu_seqlen_q_ptr,//cu_seqlen_q cu_seqlen_kv_ptr,//cu_seqlen_kv nullptr, /* seqlen_k_ptr */ - 0, //seqlen_q, unused in group mode - 0, //seqlen_kv, unused in group mode + max_seqlen_q, //seqlen_q, unused in group mode + max_seqlen_k, //seqlen_kv, unused in group mode batch, max_seqlen_q, max_seqlen_k, diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp index 2829175ab..c87a3db6c 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp @@ -209,9 +209,13 @@ hipError_t ck_attn_fwd( nullptr,//rand_val_ptr lse_ptr, o_ptr, - nullptr,//cu_seqlen_q - nullptr,//cu_seqlen_kv - nullptr, /* seqlen_k_ptr */ + nullptr, //cu_seqlen_q + nullptr, //cu_seqlen_kv + nullptr, //seqstart_q_ptr + nullptr, //seqstart_k_ptr + nullptr, //seqlen_k_ptr + nullptr, //seqstart_padded_q_ptr + nullptr, //seqstart_padded_k_ptr max_seqlen_q, max_seqlen_k, batch, @@ -308,6 +312,7 @@ hipError_t ck_attn_varlen_fwd( ck_tile::index_t nhead_k = hg; ck_tile::index_t hdim_v = d_v; ck_tile::index_t max_seqlen_q = s_q; + ck_tile::index_t max_seqlen_kv = s_kv; float scale_s = scaling_factor; float scale_p = 1.f; @@ -379,11 +384,15 @@ hipError_t ck_attn_varlen_fwd( nullptr,//rand_val_ptr lse_thd_ptr, o_ptr, - cu_seqlen_q_ptr,//cu_seqlen_q - cu_seqlen_kv_ptr,//cu_seqlen_kv - nullptr, /* seqlen_k_ptr */ - 0, //seqlen_q, unused in group mode - 0, //seqlen_kv, unused in group mode + nullptr, //cu_seqlen_q + nullptr, //cu_seqlen_kv + cu_seqlen_q_ptr, //seqstart_q_ptr + cu_seqlen_kv_ptr, //seqstart_k_ptr + nullptr, //seqlen_k_ptr + nullptr, //seqstart_padded_q_ptr + nullptr, //seqstart_padded_k_ptr + max_seqlen_q, //seqlen_q, unused in group mode + max_seqlen_kv, //seqlen_kv, unused in group mode batch, max_seqlen_q, hdim_q, diff --git a/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp b/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp index 72696fbd9..b38249f5b 100644 --- a/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp +++ b/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp @@ -557,6 +557,7 @@ void fused_attn_ck_fwd_impl( nvte_log_ck_config = true; } bool nvte_ck_uses_fwd_v3 = getenv("NVTE_CK_USES_FWD_V3", 0); + bool is_ragged = nvte_get_qkv_format(layout)==NVTE_QKV_Format::NVTE_THD; // extract the qkv and o storage bytes to allocate buffer for padding removing From cc5b35667ec290eed362fd0890e74e89faa9282a Mon Sep 17 00:00:00 2001 From: Ye Wang Date: Wed, 22 Oct 2025 16:33:41 -0500 Subject: [PATCH 25/26] [ROCm] update AITER to support aiter shared lib for multi-gpu (PRs 1196,1230) (#337) * [ROCm] include AITER PR 1196 to support aiter shared lib for multi-gpu * [ROCm] update aiter commit to remove pandas requirement (cherry picked from commit 63b4ce9339ca54deb0a13bee67270854031139cc) --- 3rdparty/aiter | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/aiter b/3rdparty/aiter index 74e71eb8e..1b00a0e8a 160000 --- a/3rdparty/aiter +++ b/3rdparty/aiter @@ -1 +1 @@ -Subproject commit 74e71eb8ee8a663d5e33c0cfd8b4dad7708ae84b 
+Subproject commit 1b00a0e8a54be0411490a69a5d7042abd33a56d9 From 08344fe062340c2f93e2f265ed2dc1fe0085fb6f Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Wed, 12 Nov 2025 17:40:18 -0500 Subject: [PATCH 26/26] Use .info/version for ROCm version (#368) (cherry picked from commit e9c736190c3080db9202b872d96f7171f1f93aa5) --- transformer_engine/common/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index f70c9f8bb..9a4187378 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -487,8 +487,16 @@ install(TARGETS transformer_engine DESTINATION .) set_target_properties(transformer_engine PROPERTIES INSTALL_RPATH "$ORIGIN/lib;$ORIGIN/transformer_engine/lib") if (USE_ROCM) + if("$ENV{ROCM_PATH}" STREQUAL "") + set(ROCM_PATH "/opt/rocm") + else() + set(ROCM_PATH "$ENV{ROCM_PATH}") + endif() + file(READ "${ROCM_PATH}/.info/version" ROCM_VER) + string(STRIP "${ROCM_VER}" ROCM_VER) + string(REGEX MATCH "^[0-9]+\\.[0-9]+" ROCM_VER "${ROCM_VER}") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/build_info.txt" - "ROCM_VERSION: ${hip_VERSION_MAJOR}.${hip_VERSION_MINOR}\n" + "ROCM_VERSION: ${ROCM_VER}\n" "GPU_TARGETS: ${CMAKE_HIP_ARCHITECTURES}\n" ) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/build_info.txt" DESTINATION "transformer_engine/")
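With this change, build_info.txt records the ROCm release found on disk rather than the HIP runtime version CMake reports. For illustration only (the values below are hypothetical and depend on the installed ROCm and the requested architectures): in a container whose ${ROCM_PATH}/.info/version reads something like "7.0.0-<build>", with CMAKE_HIP_ARCHITECTURES set to gfx942;gfx950, the installed build_info.txt would look roughly like:

ROCM_VERSION: 7.0
GPU_TARGETS: gfx942;gfx950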