From 02bc8525ba31e1365cf8feca00f6867567b730f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Mon, 19 May 2025 18:15:22 +0200 Subject: [PATCH 01/26] =?UTF-8?q?[Pytorch]=20NVIDIA-DL-Framework-Inspect?= =?UTF-8?q?=20support=20=E2=80=93=20part=203=20=E2=80=93=20tests=20(#1612)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tests drop Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move dir Signed-off-by: Pawel Gadzinski * tests fox Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Pawel Gadzinski * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Pawel Gadzinski Signed-off-by: Przemek Tredak Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Przemek Tredak Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_pytorch_debug_unittest/test.sh | 26 + qa/L1_pytorch_distributed_unittest/test.sh | 14 + tests/pytorch/debug/conftest.py | 27 + tests/pytorch/debug/run_distributed.py | 647 ++++++++++++++++ tests/pytorch/debug/test_api_features.py | 398 ++++++++++ tests/pytorch/debug/test_config.py | 151 ++++ .../debug/test_configs/disable_fp8_gemms.yaml | 8 + .../debug/test_configs/disable_fp8_layer.yaml | 7 + .../debug/test_configs/dummy_feature.yaml | 9 + .../fake_quantization_config.yaml | 14 + .../test_configs/per_tensor_scaling.yaml | 19 + .../stats_collection_test_config.yaml | 59 ++ ...ensor_manipulation_transformer_engine.yaml | 45 ++ tests/pytorch/debug/test_distributed.py | 39 + tests/pytorch/debug/test_numerics.py | 718 ++++++++++++++++++ tests/pytorch/debug/test_sanity.py | 107 +++ tests/pytorch/debug/utils.py | 22 + tests/pytorch/distributed/run_numerics.py | 12 + tests/pytorch/test_numerics.py | 26 + transformer_engine/debug/features/api.py | 6 +- .../debug/features/fake_quant.py | 2 +- .../debug/features/log_fp8_tensor_stats.py | 1 - .../debug/features/per_tensor_scaling.py | 5 +- .../debug/features/utils/stats_computation.py | 7 +- .../debug/pytorch/debug_quantization.py | 18 +- transformer_engine/pytorch/distributed.py | 6 + transformer_engine/pytorch/module/base.py | 7 +- .../pytorch/module/layernorm_linear.py | 1 + 28 files changed, 2385 insertions(+), 16 deletions(-) create mode 100644 qa/L0_pytorch_debug_unittest/test.sh create mode 100644 tests/pytorch/debug/conftest.py create mode 100644 tests/pytorch/debug/run_distributed.py create mode 100644 tests/pytorch/debug/test_api_features.py create mode 100644 tests/pytorch/debug/test_config.py create mode 100644 tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml create mode 100644 tests/pytorch/debug/test_configs/disable_fp8_layer.yaml create mode 100644 tests/pytorch/debug/test_configs/dummy_feature.yaml create mode 100644 tests/pytorch/debug/test_configs/fake_quantization_config.yaml 
create mode 100644 tests/pytorch/debug/test_configs/per_tensor_scaling.yaml create mode 100644 tests/pytorch/debug/test_configs/stats_collection_test_config.yaml create mode 100644 tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml create mode 100644 tests/pytorch/debug/test_distributed.py create mode 100644 tests/pytorch/debug/test_numerics.py create mode 100644 tests/pytorch/debug/test_sanity.py create mode 100644 tests/pytorch/debug/utils.py diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh new file mode 100644 index 000000000..9339777f4 --- /dev/null +++ b/qa/L0_pytorch_debug_unittest/test.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + + + +: ${TE_PATH:=/opt/transformerengine} +: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features} +: ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/} + +# Config with the dummy feature which prevents nvinspect from being disabled. +# Nvinspect will be disabled if no feature is active. +: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml} + +FAIL=0 + +pip install pytest==8.2.1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1 +NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1 + +# standard numerics tests with initialized debug +NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1 + +exit $FAIL diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh index 4319e96c7..09ef661c4 100644 --- a/qa/L1_pytorch_distributed_unittest/test.sh +++ b/qa/L1_pytorch_distributed_unittest/test.sh @@ -20,6 +20,7 @@ FAILED_CASES="" : ${XML_LOG_DIR:=/logs} mkdir -p "$XML_LOG_DIR" + pip3 install pytest==8.2.1 || error_exit "Failed to install pytest" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py" @@ -30,6 +31,19 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_use python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn_with_cp.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py || test_fail "test_fused_attn_with_cp.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py" + +# debug tests + + +# Config with the dummy feature which prevents nvinspect from being disabled. +# Nvinspect will be disabled if no feature is active. 
+: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml} +: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features} + +pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py" +# standard numerics tests with initialized debug +NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py" + if [ "$RET" -ne 0 ]; then echo "Error in the following test cases:$FAILED_CASES" exit 1 diff --git a/tests/pytorch/debug/conftest.py b/tests/pytorch/debug/conftest.py new file mode 100644 index 000000000..20edc6aab --- /dev/null +++ b/tests/pytorch/debug/conftest.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--feature_dirs", nargs="+", action="store", default="", help="List of feature directories" + ) + parser.addoption( + "--configs_dir", + action="store", + default="", + type=str, + help="Path to the directory with configs.", + ) + + +@pytest.fixture +def feature_dirs(request): + return request.config.getoption("--feature_dirs") + + +@pytest.fixture +def configs_dir(request): + return request.config.getoption("--configs_dir") diff --git a/tests/pytorch/debug/run_distributed.py b/tests/pytorch/debug/run_distributed.py new file mode 100644 index 000000000..640fdf9c5 --- /dev/null +++ b/tests/pytorch/debug/run_distributed.py @@ -0,0 +1,647 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
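The distributed tests added in run_distributed.py (whose body follows) all share one debug-session lifecycle: initialize nvdlfw_inspect with a YAML config plus the TE feature directories, run named TransformerEngine modules under fp8_autocast, advance the iteration counter with debug_api.step(), and tear down with debug_api.end_debug(). The dummy-feature config referenced above exists only to keep that session alive, since nvinspect disables itself when no feature is active. A minimal single-process sketch of the lifecycle, with placeholder paths and sizes:

import torch
import nvdlfw_inspect.api as debug_api
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

# Placeholder locations -- the real tests pass these via --feature_dirs and temporary config files.
debug_api.initialize(
    config_file="tests/pytorch/debug/test_configs/dummy_feature.yaml",
    feature_dirs=["transformer_engine/debug/features"],
    log_dir="./log",
    default_logging_enabled=True,
)

recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max")
model = te.Linear(128, 64, name="linear")  # config sections match layers by this name / layer type
x = torch.randn(16 * 128, 128).cuda()

with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
    y = model(x)
y.sum().backward()

debug_api.step()       # advances the counter used by start_step / end_step / freq
debug_api.end_debug()  # the tests call this in their teardown path
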
+ +import tempfile +import functools +import os +import itertools +import random +import argparse +import re + +import torch +import torch.distributed as dist +import transformer_engine +import transformer_engine_torch as tex +import nvdlfw_inspect.api as debug_api +from transformer_engine.debug import set_weight_tensor_tp_group_reduce + + +from test_numerics import ( + _emulate_linear, + _init_debug, + disable_fp8_gemms_create_config, + DISABLE_FP8_LAYER_CONFIG, + _cmp, + IN_SIZE, + OUT_SIZE, + _init_model, + SEED, + SEQ_LEN, + BATCH_SIZE, + FP8_RECIPE, + fake_quant_fp8_create_config, + _get_current_scale, + _prepare_per_tensor_scaling_config, + AMAX_HISTORY_LEN, + set_scaling_factors, + set_current_scaling_factors, +) + +WORLD_RANK, WORLD_SIZE = None, None +NCCL_WORLD = None +FEATURE_DIRS = None +all_boolean = [True, False] +TEST_NR = 0 + + +def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None, tp_rank=None): + if tp_size is None: + tp_size = WORLD_SIZE + tp_rank = WORLD_RANK + torch.manual_seed(weight_seed) + weight = torch.randn((OUT_SIZE, IN_SIZE)).cuda() + torch.manual_seed(data_seed) + in_split_size = IN_SIZE // tp_size + out_split_size = OUT_SIZE // tp_size + x = torch.randn((SEQ_LEN * BATCH_SIZE, IN_SIZE), requires_grad=True).cuda() + if parallel_mode == "row": + x = x[:, tp_rank * in_split_size : (tp_rank + 1) * in_split_size] + x.retain_grad() + + with torch.no_grad(): + if parallel_mode == "column": + weight = weight[tp_rank * out_split_size : (tp_rank + 1) * out_split_size, :] + else: + weight = weight[:, tp_rank * in_split_size : (tp_rank + 1) * in_split_size] + + return x, weight.contiguous() + + +def _init_model(weight, parallel_mode=None, tp_group=None, name="linear"): + model = transformer_engine.pytorch.Linear( + IN_SIZE, + OUT_SIZE, + name=name, + parallel_mode=parallel_mode, + tp_group=(tp_group or NCCL_WORLD if parallel_mode else None), + ) + with torch.no_grad(): + model.weight.copy_(weight) + return model + + +class AllGather(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor, dim, group=None): + if group is None: + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + else: + world_size = torch.distributed.get_world_size(group=group) + rank = torch.distributed.get_rank(group=group) + dist.barrier() + + # Create a list to gather tensors from all processes + y_list = [torch.zeros_like(tensor) for _ in range(world_size)] + torch.distributed.all_gather(y_list, tensor, group=group) + + # Save the world size and rank for backward computation + ctx.world_size = world_size + ctx.rank = rank + ctx.dim = dim + + # Concatenate the gathered tensors along the feature dimension + y_full = torch.cat(y_list, dim=dim) + + return y_full + + @staticmethod + def backward(ctx, grad_output): + # Split the gradient output and return the portion corresponding to this rank + grad_input = torch.chunk(grad_output, ctx.world_size, dim=ctx.dim)[ctx.rank] + return grad_input, None, None + + +def _run_forward_backward(x, model, parallel_mode=None, group=None): + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + + y.requires_grad_(True) + y.retain_grad() + if parallel_mode == "column": + y = AllGather.apply(y, -1, group) + y.requires_grad_(True) + y.retain_grad() + l = y.sum() + l.backward() + elif parallel_mode == "row": + l = y.sum() + l.backward() + debug_api.step() + return y + + +def _emulate_linear_distributed(*args, parallel_mode=None, **kwargs): + assert 
parallel_mode in ["column", "row"] + + def split(gradient): + split_size = OUT_SIZE // WORLD_SIZE + gradient = gradient[:, WORLD_RANK * split_size : (WORLD_RANK + 1) * split_size] + return gradient + + activation_sync = None + gradient_sync = None + if parallel_mode == "column": + activation_sync = lambda x: AllGather.apply(x, -1) + gradient_sync = split + else: + activation_sync = ( + lambda activation: dist.all_reduce(activation, op=dist.ReduceOp.SUM) or activation + ) + + output = _emulate_linear( + *args, activation_sync=activation_sync, gradient_sync=gradient_sync, **kwargs + ) + + if parallel_mode == "column": + dist.all_reduce(output["dgrad"], op=dist.ReduceOp.SUM) + + return output + + +def check_debug_log(msg): + with open(f"log/debug_logs/debug_log_globalrank-{WORLD_RANK}.log", "r") as f: + for line in f.readlines(): + if msg in line: + return True + return False + + +def run_debug_test(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank = dist.get_rank() + temp_file_name = None + temp_logdir_name = None + + if rank == 0: + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file: + temp_file_name = temp_file.name + temp_dir_obj = tempfile.TemporaryDirectory() + temp_logdir_name = temp_dir_obj.name + + # Store the TemporaryDirectory object to prevent it from being deleted + wrapper.temp_dir_obj = temp_dir_obj + + temp_file_name_list = [temp_file_name] + temp_logdir_name_list = [temp_logdir_name] + + # Broadcast the temporary file and directory names to all processes + dist.broadcast_object_list(temp_file_name_list, src=0) + dist.broadcast_object_list(temp_logdir_name_list, src=0) + + temp_file_name = temp_file_name_list[0] + temp_logdir_name = temp_logdir_name_list[0] + + dist.barrier() + + config_file = open(temp_file_name, mode="r+", buffering=1) + + try: + kwargs["config_file"] = config_file + kwargs["log_dir"] = temp_logdir_name + + if rank == 0: + global TEST_NR + print(f"Running test {TEST_NR} {func.__name__} with args = {args}.") + TEST_NR += 1 + + func(*args, **kwargs) + finally: + if rank == 0 and temp_file_name is not None: + os.unlink(temp_file_name) + + debug_api.end_debug() + + if rank == 0 and hasattr(wrapper, "temp_dir_obj"): + wrapper.temp_dir_obj.cleanup() + + return wrapper + + +CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed: + layers: + layer_types: [linear] + enabled: + True + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation, gradient, weight, output, wgrad, dgrad] + stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range] + start_step : 0 + end_step: 1 + LogFp8TensorStats: + enabled: True + tensors: [activation, gradient, weight] + stats: [underflows%] + start_step : 0 + end_step: 1 +""" + + +def _prepare_config_test_log_distributed(config_file): + if WORLD_RANK != 0: + return + config_file.write(CONFIG_LOG_TEST_DISTRIBUTED) + config_file.flush() + + +def _compute_dynamic_range(tensor): + tensor_abs = tensor.abs() + tensor_abs = tensor_abs[tensor_abs != 0] + if tensor_abs.any(): + amin = tensor_abs.min().float() + else: + amin = torch.tensor(1, device=tensor.device).to(torch.float) + amax = tensor_abs.max().float() + if not amax.all(): + amax = torch.tensor(1, device=tensor.device).to(torch.float) + dynamic_range = torch.log2(amax) - torch.log2(amin) + return dynamic_range + + +@run_debug_test +def test_log_distributed(parallel_mode, gather_weight, **kwargs): + _prepare_config_test_log_distributed(kwargs["config_file"]) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], 
FEATURE_DIRS) + set_weight_tensor_tp_group_reduce(gather_weight) + if WORLD_SIZE % 2 != 0: + return # skip + TP_SIZE = WORLD_SIZE // 2 + DP_SIZE = 2 + TP_RANK = WORLD_RANK % TP_SIZE + DP_RANK = (WORLD_RANK - TP_RANK) // TP_SIZE + + debug_api.set_tensor_reduction_group(NCCL_WORLD) + + x, weight = _get_tensors( + parallel_mode, + weight_seed=TP_RANK * 1234, + data_seed=DP_RANK * 1234, + tp_size=TP_SIZE, + tp_rank=TP_RANK, + ) + + tp_group_ranks = [i for i in range(DP_RANK * TP_SIZE, (DP_RANK + 1) * TP_SIZE)] + tp_group = dist.new_group(ranks=tp_group_ranks) + + dp_group_ranks = [i for i in range(TP_RANK, WORLD_SIZE, TP_SIZE)] + dp_group = dist.new_group(ranks=dp_group_ranks) + + model = _init_model(weight, parallel_mode=parallel_mode, tp_group=tp_group) + output = _run_forward_backward(x, model, parallel_mode=parallel_mode, group=tp_group) + + gathered_activation = AllGather.apply(x.contiguous(), 0) + gathered_weight = AllGather.apply(weight.contiguous(), 0, tp_group) + gathered_gradient = AllGather.apply(output.grad.contiguous(), 0, dp_group) + if parallel_mode == "row": + gathered_gradient = AllGather.apply(gathered_gradient, 0, tp_group) + + log_file = kwargs["log_dir"] + "/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log" + + dist.barrier() + if WORLD_RANK != 0: + return # stats are gathered on node 0 + with open(log_file) as f: + content = f.read() + + def get_stat(tensor, stat): + regex = r".*_{tensor}_{stat}\s+.*iteration=(\d+)\s+.*value=([-+]?\d*\.?\d+)".format( + tensor=tensor, stat=stat + ) + for line in content.splitlines(): + match = re.search(regex, line) + if match: + value = float(match.group(2)) + return value + + rf = lambda x: round(float(x), 4) + stats = [] + tensors = { + "activation": gathered_activation, + "weight": gathered_weight if gather_weight else weight, + "gradient": gathered_gradient, + } + stats = { + "min": torch.min, + "max": torch.max, + "mean": torch.mean, + "std": torch.std, + "l1_norm": lambda x: torch.norm(x, p=1), + "l2_norm": lambda x: torch.norm(x, p=2), + "cur_amax": lambda x: x.abs().max(), + "dynamic_range": _compute_dynamic_range, + } + for stat_key in stats.keys(): + for tensor_key in tensors.keys(): + torch.testing.assert_close( + get_stat(tensor_key, stat_key), + rf(stats[stat_key](tensors[tensor_key])), + atol=0.0001, + rtol=0.0001, + ) + set_weight_tensor_tp_group_reduce(True) # reset + + +@run_debug_test +def test_log_expert_parallel(**kwargs): + """ + This test tests the scenario, when one of the node of data parallel does not invoke the debug layer. + It naturally occurs in the expert parallelism, when one expert doesn't get input on one node, + but gets it on other nodes. If there were all_gather inside forward(), this would result in deadlock. 
+ """ + _prepare_config_test_log_distributed(kwargs["config_file"]) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + debug_api.set_tensor_reduction_group(NCCL_WORLD) + x, weight = _get_tensors( + "row", weight_seed=WORLD_RANK * 1234, data_seed=WORLD_RANK * 1234, tp_size=1, tp_rank=0 + ) # data parallel + model = _init_model(weight, parallel_mode=None, name="linear1") + model1 = _init_model(weight, parallel_mode=None, name="linear2") + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y1 = model(x) + y2 = model1(x) + y = y1 + y2 + y.sum().backward() + debug_api.step() + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + if WORLD_RANK != 0: + y = y + model1(x) + + y.sum().backward() + + +@run_debug_test +def test_disable_fp8_gemms(fprop_fp8, dgrad_fp8, wgrad_fp8, parallel_mode, **kwargs): + disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, kwargs["config_file"]) + fp8_kwargs = { + "fprop_fp8": fprop_fp8, + "dgrad_fp8": dgrad_fp8, + "wgrad_fp8": wgrad_fp8, + } + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + x, weight = _get_tensors(parallel_mode) + model = _init_model(weight, parallel_mode=parallel_mode) + y = _run_forward_backward(x, model, parallel_mode=parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + + x.grad.zero_() + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs) + _cmp(ground_truth, output) + + +@run_debug_test +def test_disable_fp8_layer(parallel_mode, **kwargs): + if WORLD_RANK == 0: + kwargs["config_file"].write(DISABLE_FP8_LAYER_CONFIG) + kwargs["config_file"].flush() + dist.barrier() + + x, weight = _get_tensors(parallel_mode) + + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode) + x.grad.zero_() + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + model = _init_model(weight, parallel_mode) + y = _run_forward_backward(x, model, parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + _cmp(ground_truth, output) + + +@run_debug_test +def test_per_tensor_scaling( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + parallel_mode, + **kwargs, +): + input_kwargs = { + "fprop_inp": fprop_inp, + "fprop_weight": fprop_weight, + "dgrad_weight": dgrad_weight, + "dgrad_grad": dgrad_grad, + "wgrad_input": wgrad_input, + "wgrad_grad": wgrad_grad, + } + fp8_kwargs = { + "fprop_fp8": True, + "dgrad_fp8": True, + "wgrad_fp8": True, + } + """ + Runs a test to validate per-tensor (current) scaling in FP8 computations. + The function performs warm-up iterations to populate the amax buffer of the model and compute scaling factors based on delayed scaling. + Subsequently, weights and inputs are switched to ensure their current scaling factors differ from those based on delayed scaling; + similarly, the loss is multiplied by a large factor to alter the gradient's magnitude, + creating a discrepancy between the original (delayed) and per-tensor (current) scaling factors. 
+ Finally, a linear pass is emulated, and the results are compared.” + """ + _prepare_per_tensor_scaling_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + warmup_input, warmup_weight = _get_tensors(parallel_mode=parallel_mode) + model = _init_model(warmup_weight, parallel_mode=parallel_mode) + + # Warmup run to setup amax and scaling factors. + for _ in range(AMAX_HISTORY_LEN): + _run_forward_backward(warmup_input, model, parallel_mode=parallel_mode) + + x, weight = _get_tensors( + parallel_mode=parallel_mode, weight_seed=WORLD_RANK * 2137, data_seed=WORLD_RANK * 2137 + ) + model.weight.data = weight.data + x.retain_grad() + + # delayed scaling factor + # need to be collected before forward pass with test data, + # because this forward pass changes scaling factors + set_scaling_factors(model, input_kwargs, fp8_kwargs) + + LOSS_MULTIPLIER = 100 + + with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x) + model.zero_grad() + if parallel_mode == "column": + y = AllGather.apply(y, -1) + y.retain_grad() + + ( + LOSS_MULTIPLIER * y.sum() + ).backward() # Loss multiplication to change gradient's order of magintude + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + # per tensor - current - scaling factors + # need to be collected after forward pass with test data, + # because gradient(y.grad) cannot be accessed before forward, + # but it needs to be collected. + + set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs) + ground_truth = _emulate_linear_distributed( + x, weight, parallel_mode=parallel_mode, loss_multiplier=LOSS_MULTIPLIER, **fp8_kwargs + ) + + _cmp(ground_truth, output) + + +@run_debug_test +def test_fake_quant_fp8( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + parallel_mode, + **kwargs, +): + + fp8_kwargs = { + "fprop_input_fake_quant": fprop_inp, + "fprop_weight_fake_quant": fprop_weight, + "dgrad_gradient_fake_quant": dgrad_grad, + "dgrad_weight_fake_quant": dgrad_weight, + "wgrad_gradient_fake_quant": wgrad_grad, + "wgrad_input_fake_quant": wgrad_input, + "fprop_fp8": not (fprop_inp or fprop_weight), + "dgrad_fp8": not (dgrad_weight or dgrad_grad), + "wgrad_fp8": not (wgrad_grad or wgrad_input), + } + if WORLD_RANK == 0: + fake_quant_fp8_create_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + dist.barrier() + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS) + + x, weight = _get_tensors(parallel_mode) + model = _init_model(weight, parallel_mode) + y = _run_forward_backward(x, model, parallel_mode) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + fp8_kwargs["fprop_input_scale"] = ( + _get_current_scale(x, fprop_inp) if not fp8_kwargs["fprop_fp8"] else None + ) + fp8_kwargs["fprop_weight_scale"] = ( + _get_current_scale(weight, fprop_weight) if not fp8_kwargs["fprop_fp8"] else None + ) + fp8_kwargs["dgrad_gradient_scale"] = ( + _get_current_scale(y.grad, dgrad_grad) if not fp8_kwargs["dgrad_fp8"] else None + ) + fp8_kwargs["dgrad_weight_scale"] = ( + _get_current_scale(weight, dgrad_weight) if not fp8_kwargs["dgrad_fp8"] else None + ) + fp8_kwargs["wgrad_gradient_scale"] = ( + _get_current_scale(y.grad, wgrad_grad) if not 
fp8_kwargs["wgrad_fp8"] else None + ) + fp8_kwargs["wgrad_input_scale"] = ( + _get_current_scale(x, wgrad_input) if not fp8_kwargs["wgrad_fp8"] else None + ) + ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs) + _cmp(ground_truth, output) + + +def _init_distributed(): + global WORLD_RANK, WORLD_SIZE, NCCL_WORLD, FP8 + + WORLD_RANK = int(os.getenv("RANK", "0")) + WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) + LOCAL_RANK = int(os.getenv("LOCAL_RANK", "0")) + LOCAL_SIZE = int(os.getenv("LOCAL_WORLD_SIZE", "1")) + + assert WORLD_SIZE == LOCAL_SIZE # this test supports only 1 node + assert LOCAL_SIZE <= torch.cuda.device_count() + dist_init_kwargs = { + "backend": "nccl", + "rank": WORLD_RANK, + "world_size": WORLD_SIZE, + } + dist_init_kwargs["init_method"] = "env://" + dist_init_kwargs["device_id"] = torch.device(f"cuda:{LOCAL_RANK}") + assert dist.is_nccl_available() + torch.cuda.set_device(LOCAL_RANK) + dist.init_process_group(**dist_init_kwargs) + + NCCL_WORLD = dist.new_group(backend="nccl") + + WORLD_SIZE = dist.get_world_size() + + +def _run_test_with_combinations( + test_function, values_list, num_repeat, extra_args, sample_size=None +): + combinations = itertools.product(values_list, repeat=num_repeat) + total_combinations = itertools.product(combinations, extra_args) + + if sample_size is not None: + total_combinations = random.sample(list(total_combinations), sample_size) + + for comb, arg in total_combinations: + test_function(*comb, arg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--feature_dirs", type=str) + args = parser.parse_args() + FEATURE_DIRS = args.feature_dirs + random.seed(SEED) + _init_distributed() + + test_log_expert_parallel() + for parallel_mode in ["column", "row"]: + for gather_weight in [True, False]: + test_log_distributed(parallel_mode, gather_weight) + + for parallel_mode in ["row", "column"]: + test_disable_fp8_layer(parallel_mode) + + # test_disable_fp8_gemms + _run_test_with_combinations( + test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"] + ) + + # test_fake_quant_fp8 + dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None] + _run_test_with_combinations( + test_fake_quant_fp8, + dtype_options, + num_repeat=6, + extra_args=["column", "row"], + sample_size=20, + ) + + _run_test_with_combinations( + test_per_tensor_scaling, + all_boolean, + num_repeat=6, + extra_args=["column"], + sample_size=20, + ) diff --git a/tests/pytorch/debug/test_api_features.py b/tests/pytorch/debug/test_api_features.py new file mode 100644 index 000000000..f9cd234ba --- /dev/null +++ b/tests/pytorch/debug/test_api_features.py @@ -0,0 +1,398 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
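test_api_features.py, which follows, exercises the query side of the API: a YAML section selects layers, a feature block under the transformer_engine namespace configures behaviour, and debug_api.transformer_engine.* answers per-layer, per-GEMM questions at a given iteration. A self-contained sketch of that round trip, using an inline config equivalent to disable_fp8_gemms.yaml (the feature_dirs path is an assumption; the tests receive it through the --feature_dirs pytest option):

import tempfile
import nvdlfw_inspect.api as debug_api

CONFIG = """disable_dgrad_wgrad:
  enabled: True
  layers:
    layer_types: [qkv, fc2]
  transformer_engine:
    DisableFP8GEMM:
      enabled: True
      gemms: [dgrad, wgrad]
"""

feature_dirs = ["transformer_engine/debug/features"]  # assumed location, as in the QA scripts

with tempfile.NamedTemporaryFile("w", suffix=".yaml") as cfg:
    cfg.write(CONFIG)
    cfg.flush()
    debug_api.initialize(cfg.name, feature_dirs=feature_dirs)
    try:
        # fprop is untouched; dgrad/wgrad fall back to high precision for matching layers.
        assert debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.attn.qkv", gemm="fprop", iteration=0)
        assert not debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.attn.qkv", gemm="dgrad", iteration=0)
        # Layers that match no config section keep the default behaviour (FP8 enabled).
        assert debug_api.transformer_engine.fp8_gemm_enabled(
            "decoder.1.mlp.fc1", gemm="dgrad", iteration=0)
    finally:
        debug_api.end_debug()
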
+ +import torch +from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer + +import nvdlfw_inspect.api as debug_api + +try: + import transformer_engine + import transformer_engine_torch as tex +except (ImportError, ModuleNotFoundError): + print("Could not find TransformerEngine package.") + exit(1) + + +def test_transformer_engine_no_config(feature_dirs): + debug_api.initialize("", feature_dirs=feature_dirs) + try: + + tensor = torch.rand(24, 2046).cuda() + + # FP8 enabled - true by the default + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + + # modify_tensor_enabled - False by default + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0 + ) + + # inspect_tensor_enabled - False by default + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.attn.qkv", tensor_name="activation", iteration=0 + ) + + # inspect_tensor_postquantize - False by default + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_disable_fp8_gemm(configs_dir, feature_dirs): + try: + debug_api.initialize(configs_dir + "disable_fp8_gemms.yaml", feature_dirs=feature_dirs) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + + # caching + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_disable_fp8_layer(configs_dir, feature_dirs): + try: + debug_api.initialize(configs_dir + "disable_fp8_layer.yaml", feature_dirs=feature_dirs) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="fprop", iteration=0 + ) + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", iteration=0 + ) + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="fprop", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="wgrad", iteration=0 + ) + assert not debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.attn.qkv", gemm="dgrad", iteration=0 + ) + + finally: + debug_api.end_debug() + + +def test_per_tensor_scaling(configs_dir, feature_dirs): + try: + + debug_api.initialize(configs_dir + "per_tensor_scaling.yaml", feature_dirs=feature_dirs) + + tensor = torch.rand(24, 2046).cuda() + + # check modify_tensor_enabled + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0 + ) + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0 + ) + assert 
debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0 + ) + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0 + ) + + # check modify_tensor + + default_quantizer1 = Float8Quantizer( + scale=torch.tensor([1]).cuda(), + amax=torch.tensor([0]).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + ) + default_quantizer2 = Float8Quantizer( + scale=torch.tensor([1]).cuda(), + amax=torch.tensor([0]).cuda(), + fp8_dtype=tex.DType.kFloat8E5M2, + ) + + output1 = debug_api.transformer_engine.modify_tensor( + layer_name="decoder.1.mlp.fc1", + gemm="fprop", + tensor_name="activation", + default_quantizer=default_quantizer1, + iteration=0, + tensor=tensor, + ) + assert type(output1) == Float8Tensor + assert output1._fp8_dtype == tex.DType.kFloat8E4M3 + + output2 = debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="dgrad", + tensor=tensor, + tensor_name="gradient", + default_quantizer=default_quantizer2, + iteration=0, + ) + assert type(output2) == Float8Tensor + assert output2._fp8_dtype == tex.DType.kFloat8E5M2 + + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", + gemm="wgrad", + tensor_name="gradient", + iteration=0, + ) + + assert not debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc4", + gemm="fprop", + tensor_name="activation", + iteration=0, + ) + finally: + debug_api.end_debug() + + +def test_fake_quant(configs_dir, feature_dirs): + try: + debug_api.initialize( + configs_dir + "fake_quantization_config.yaml", feature_dirs=feature_dirs + ) + + tensor = torch.rand(24, 2046).cuda() + + # modify_tensor_enabled + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0 + ) + + assert debug_api.transformer_engine.modify_tensor_enabled( + "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0 + ) + + # modify_tensor + debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="fprop", + tensor=tensor, + tensor_name="activation", + iteration=0, + default_quantizer=None, + ) + + debug_api.transformer_engine.modify_tensor( + "decoder.1.mlp.fc1", + gemm="dgrad", + tensor=tensor, + tensor_name="gradient", + iteration=0, + default_quantizer=None, + ) + + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.fc2", gemm="wgrad", iteration=0 + ) + # caching + assert debug_api.transformer_engine.fp8_gemm_enabled( + "decoder.1.fc2", gemm="wgrad", iteration=0 + ) + finally: + debug_api.end_debug() + + +def test_statistics_collection(configs_dir, feature_dirs): + try: + debug_api.initialize( + config_file=configs_dir + "stats_collection_test_config.yaml", + feature_dirs=feature_dirs, + default_logging_enabled=False, + ) + + tensor = torch.randn((100, 100, 5)).cuda() + tensor_fp8 = Float8Tensor( + data=tensor.to(torch.uint8).cuda(), + fp8_scale_inv=torch.full([1], 1.0).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + shape=tensor.shape, + dtype=torch.float32, + ) + + def log(): + from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS + + return STATS_BUFFERS.log_stats() + 
+ def assert_empty(): + stats = log() + assert len(stats) == 0 + + # TE tensor stats -- + debug_api.transformer_engine.inspect_tensor( + "decoder.1.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=200, + tp_group=None, + ) + stats = log() + assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max() + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.mlp.fc1", tensor_name="activation", iteration=201 + ) + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.2.mlp.fc1", tensor_name="activation", iteration=200 + ) + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.1.mlp.fc1", tensor_name="gradient", iteration=200 + ) + + expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5) + expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5) + + # TE FP8 tensor stats -- + assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200 + ) + debug_api.transformer_engine.inspect_tensor_postquantize( + "decoder.1.mlp.fc1", + tensor=tensor_fp8, + tensor_name="gradient", + iteration=200, + rowwise=True, + tp_group=None, + ) + stats = log() + torch.testing.assert_close( + stats[("decoder.1.mlp.fc1", "gradient", "underflows%", 200)], expected_underflows + ) + + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201 + ) + assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled( + "decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200 + ) + + # Second config in same yaml + tensor = torch.rand((100, 100, 5)) + debug_api.transformer_engine.inspect_tensor( + "decoder.6.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=200, + tp_group=None, + ) + stats = log() + stats_names = [x[3] for x in stats.keys()] + all(s in stats_names for s in ["cur_amax", "dynamic_range", "mean", "std", "l1_norm"]) + assert stats[("decoder.6.mlp.fc1", "activation", "mean", 200)] == tensor.mean() + + debug_api.transformer_engine.inspect_tensor( + "decoder.7.mlp.fc1", + tensor=tensor, + tensor_name="weight", + iteration=200, + tp_group=None, + ) + stats = log() + stats_names = [x[3] for x in stats.keys()] + all(s in stats_names for s in ["mean", "std", "l1_norm", "min", "max"]) + assert stats[("decoder.7.mlp.fc1", "weight", "max", 200)] == tensor.max() + + assert not debug_api.transformer_engine.inspect_tensor_enabled( + "decoder.7.mlp.fc1", tensor_name="weight", iteration=201 + ) + assert_empty() + + finally: + debug_api.end_debug() + + +def test_statistics_multi_run(configs_dir, feature_dirs): + try: + debug_api.initialize( + config_file=configs_dir + "stats_collection_test_config.yaml", + feature_dirs=feature_dirs, + default_logging_enabled=False, + ) + + def feed(tensor, tensor_fp8): + debug_api.transformer_engine.inspect_tensor( + "decoder.5.mlp.fc1", + tensor=tensor, + tensor_name="activation", + iteration=1, + tp_group=None, + ) + debug_api.transformer_engine.inspect_tensor_postquantize( + "decoder.5.mlp.fc1", + tensor=tensor_fp8, + tensor_name="activation", + iteration=1, + rowwise=True, + tp_group=None, + ) + + def log_stats(): + from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS + + return STATS_BUFFERS.log_stats() + + def fp8_tensor(t): + return Float8Tensor( + data=t.to(torch.uint8).cuda(), + 
fp8_scale_inv=torch.ones([1]).cuda(), + fp8_dtype=tex.DType.kFloat8E4M3, + shape=t.shape, + dtype=torch.float32, + ) + + shape = [1024, 1024] + tensors = [torch.randn(shape) for _ in range(2)] + tensors_fp8 = [fp8_tensor(tensors[i]) for i in range(2)] + + feed(tensors[0], tensors_fp8[0]) + feed(tensors[1], tensors_fp8[1]) + stats1 = log_stats() + + tensor2 = torch.cat((tensors[0], tensors[1])).cuda() + fp8tensor2 = fp8_tensor(tensor2) + feed(tensor2, fp8tensor2) + stats2 = log_stats() + + assert len(stats1.keys()) > 0 + for k in stats1.keys(): + torch.testing.assert_close(stats1[k], stats2[k]) + finally: + debug_api.end_debug() + + +if __name__ == "__main__": + pass diff --git a/tests/pytorch/debug/test_config.py b/tests/pytorch/debug/test_config.py new file mode 100644 index 000000000..71715a686 --- /dev/null +++ b/tests/pytorch/debug/test_config.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +import pathlib, os + +from nvdlfw_inspect.config_manager import ConfigManager + +import nvdlfw_inspect.api as debug_api + +try: + import transformer_engine + from transformer_engine.debug.features.api import TEConfigAPIMapper +except (ImportError, ModuleNotFoundError): + print("Could not find TransformerEngine debug module.") + exit(1) + + +def test_transformer_engine_config_parsing(feature_dirs): + debug_api.initialize( + config_file=pathlib.Path(__file__).resolve().parent + / "test_configs/tensor_manipulation_transformer_engine.yaml", + feature_dirs=feature_dirs, + log_dir="./log", + ) + + cfg_fc1 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc1")["transformer_engine"] + cfg_fc2 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc2")["transformer_engine"] + assert cfg_fc1 and cfg_fc2 + + gemm_parsing = True + tensor_parsing = True + + # Per tensor scaling set for dgrad, filter based on gemm + ret, _ = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="activation", + ) + assert not ret + + # per tensor scaling set for gradient, filter based on tensor name + ret, _ = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="activation", + ) + assert not ret + + ret, parsed_cfg_fc1 = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc1 == {"gemm": "dgrad", "tensor": "gradient"} + + # Test tensor struct + ret, parsed_cfg_fc1_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="activation", + ) + ret, parsed_cfg_fc1_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc1["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc1_act == { + "gemm": "fprop", + "tensor": "activation", + "quant_format": "FP8E4M3", + } + assert parsed_cfg_fc1_wei == { + "gemm": "fprop", + "tensor": "weight", + "quant_format": "FP8E4M3", + } + + # Test gemms struct + ret, parsed_cfg_fc2_grad = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + 
gemm="dgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc2_grad == {"gemm": "dgrad", "tensor": "gradient", "quant_format": "FP8E5M2"} + ret, parsed_cfg_fc2_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["FakeQuant"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="dgrad", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc2_wei == {"gemm": "dgrad", "tensor": "weight", "quant_format": "FP8E5M2"} + + # Test gemm + tensor struct + ret, parsed_cfg_fc2_fprop_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="activation", + ) + assert ret + assert parsed_cfg_fc2_fprop_act == {"gemm": "fprop", "tensor": "activation"} + + ret, parsed_cfg_fc2_fprop_wei = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="fprop", + tensor_name="weight", + ) + assert ret + assert parsed_cfg_fc2_fprop_wei == {"gemm": "fprop", "tensor": "weight"} + + ret, parsed_cfg_fc2_wgrad_act = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="activation", + ) + assert ret + assert parsed_cfg_fc2_wgrad_act == {"gemm": "wgrad", "tensor": "activation"} + + ret, parsed_cfg_fc2_wgrad_grad = TEConfigAPIMapper().parse_config_and_api( + cfg_fc2["PerTensorScaling"], + gemm_parsing=gemm_parsing, + tensor_parsing=tensor_parsing, + gemm="wgrad", + tensor_name="gradient", + ) + assert ret + assert parsed_cfg_fc2_wgrad_grad == {"gemm": "wgrad", "tensor": "gradient"} + + ConfigManager.reset() diff --git a/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml b/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml new file mode 100644 index 000000000..b832f26d8 --- /dev/null +++ b/tests/pytorch/debug/test_configs/disable_fp8_gemms.yaml @@ -0,0 +1,8 @@ +test_disable_fp8_gemm_1: + enabled: True + layers: + layer_types: [qkv, fc2] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [dgrad, wgrad] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml b/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml new file mode 100644 index 000000000..39bfc7a25 --- /dev/null +++ b/tests/pytorch/debug/test_configs/disable_fp8_layer.yaml @@ -0,0 +1,7 @@ +test_disable_fp8_layer: + enabled: True + layers: + layer_types: [qkv] + transformer_engine: + DisableFP8Layer: + enabled: True \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/dummy_feature.yaml b/tests/pytorch/debug/test_configs/dummy_feature.yaml new file mode 100644 index 000000000..540e3ac42 --- /dev/null +++ b/tests/pytorch/debug/test_configs/dummy_feature.yaml @@ -0,0 +1,9 @@ +deummy_feature_everywhere: + enabled: True + layers: + layer_name_regex_pattern: .* + transformer_engine: + TestDummyFeature: + enabled: True + tensors: [weight, activation, gradient, output, wgrad, dgrad] + gemms: [wgrad, dgrad, fprop] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/fake_quantization_config.yaml b/tests/pytorch/debug/test_configs/fake_quantization_config.yaml new file mode 100644 index 000000000..62feace6d --- /dev/null +++ b/tests/pytorch/debug/test_configs/fake_quantization_config.yaml @@ -0,0 +1,14 @@ +test_fake_quant_fp8: + enabled: True + layers: + layer_numbers: [1] + layer_types: [fc1, fc2] + 
transformer_engine: + FakeQuant: + enabled: True + gemms: [fprop, dgrad] + tensors_struct: + - tensor: activation + quant_format: FP8E4M3 + - tensor: gradient + quant_format: FP8E5M2 \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml b/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml new file mode 100644 index 000000000..c17f2f7d2 --- /dev/null +++ b/tests/pytorch/debug/test_configs/per_tensor_scaling.yaml @@ -0,0 +1,19 @@ +test_per_tensor_scaling: + enabled: True + layers: + layer_numbers: [1] + layer_types: [fc1, fc2] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [wgrad] + PerTensorScaling: + enabled: True + gemms_struct: + - gemm: fprop + tensors_struct: + - tensor: activation + - tensor: weight + - gemm: dgrad + tensors_struct: + - tensor: gradient \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml b/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml new file mode 100644 index 000000000..8f01b2d62 --- /dev/null +++ b/tests/pytorch/debug/test_configs/stats_collection_test_config.yaml @@ -0,0 +1,59 @@ +stat_collection_test_1: + enabled: True + layers: + layer_numbers: [1, 3] + LogTensorStats: + enabled: True + stats: [mean, std, l1_norm, l2_norm] + tensors: [activation] + freq: 1 + start_step: 100 + end_step: 500 + transformer_engine: + LogTensorStats: + enabled: True + stats: [cur_amax, dynamic_range] + tensors: [activation] + freq: 2 + start_step: 100 + end_step: 500 + LogFp8TensorStats: + enabled: True + stats: [underflows%] + tensors: [gradient] + freq: 5 + start_step: 100 + end_step: 500 + +stat_collection_test_2: + enabled: True + layers: + layer_numbers: [6, 7] + transformer_engine: + LogTensorStats: + enabled: True + tensors_struct: + - tensor: activation + stats: [cur_amax, dynamic_range, mean, std, l1_norm] + freq: 2 + start_step: 100 + end_step: 500 + - tensor: weight + stats: [mean, std, l1_norm, min, max] + freq: 5 + start_step: 100 + end_step: 500 + +stat_collection_test_4: + enabled: True + layers: + layer_numbers: [5] + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation] + stats: [cur_amax, dynamic_range, mean, std, l1_norm] + LogFp8TensorStats: + enabled: True + stats: [underflows%] + tensors: [activation] \ No newline at end of file diff --git a/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml b/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml new file mode 100644 index 000000000..e86486366 --- /dev/null +++ b/tests/pytorch/debug/test_configs/tensor_manipulation_transformer_engine.yaml @@ -0,0 +1,45 @@ +# This config is used when FP8 training is ON + +transformer_engine_fc1_manipulation: + enabled: True + layers: + layer_name_regex_pattern: .*(fc1) # Select layers if they end in fc1 + transformer_engine: # namespace + DisableFP8GEMM: # Disable FP8 GEMM. FProp run in high precision + enabled: True + gemms: [fprop] + PerTensorScaling: # Scale DGrad gradients using per tensor current scaling and run FP8 GEMM + enabled: True + gemms: [dgrad] + tensors: [gradient] + FakeQuant: # Disable FP8 GEMM for Wgrad. 
Fake quantize activations to Wgrad and run high precision GEMM + enabled: True + gemms: [fprop] + tensors_struct: + - tensor: activation + quant_format: FP8E4M3 + - tensor: weight + quant_format: FP8E4M3 + +transformer_engine_fc2_manipulation: + enabled: True + layers: + layer_name_regex_pattern: .*(fc2) # Select layers if they end in fc2 + transformer_engine: # namespace + PerTensorScaling: # Scale WGrad and Fprop inputs using per tensor current scaling and run FP8 GEMM + enabled: True + gemms_struct: + - gemm: fprop + tensors_struct: + - tensor: activation + - tensor: weight + - gemm: wgrad + tensors_struct: + - tensor: activation + - tensor: gradient + FakeQuant: # Disable FP8 GEMM for DGrad. Fake quantize weights and gradients to DGrad and run high precision GEMM + enabled: True + gemms_struct: + - gemm: dgrad + tensors: [weight, gradient] + quant_format: FP8E5M2 \ No newline at end of file diff --git a/tests/pytorch/debug/test_distributed.py b/tests/pytorch/debug/test_distributed.py new file mode 100644 index 000000000..7c072a054 --- /dev/null +++ b/tests/pytorch/debug/test_distributed.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import subprocess +from pathlib import Path + +import pytest +import torch + +""" + Distributed numerics tests + + These tests test the numerical corectness of the TransformerEngine layers. + Tests are parametrized by the layer and fp8 precision. + One test consists of running multiple configurations from file run_numerics.py + Such design is due to the fact the initialization of one test is long + - 2 processes need to start and load torch and TE. Multiple configurations + are run in one test - this reduces the initialization overhead. + +""" + + +if torch.cuda.device_count() < 2: + pytest.skip("Distributed training needs at least 2 GPUs.") + +TEST_ROOT = Path(__file__).parent.resolve() +NUM_PROCS: int = min(4, torch.cuda.device_count()) +LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"] + + +def test_debug_distributed(feature_dirs): + test_path = TEST_ROOT / "run_distributed.py" + test_cmd = LAUNCH_CMD + [str(test_path), f"--feature_dirs={feature_dirs[0]}"] + + result = subprocess.run(test_cmd, env=os.environ, capture_output=True, check=False) + if result.returncode != 0: + raise AssertionError(result.stderr.decode()) diff --git a/tests/pytorch/debug/test_numerics.py b/tests/pytorch/debug/test_numerics.py new file mode 100644 index 000000000..55c3ab9b7 --- /dev/null +++ b/tests/pytorch/debug/test_numerics.py @@ -0,0 +1,718 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
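test_numerics.py, which starts here, builds its ground truth from a few primitives used throughout the file: Float8Quantizer (cast a tensor to FP8 at a given scale), dequantize() (back to high precision, i.e. the fake-quantization path of _fake_cast), and _default_sf_compute (the scale factor derived from a tensor's amax, as in _get_current_scale). A short stand-alone sketch of that round trip, including the kind of underflow statistic that the LogFp8TensorStats configs report; the shape and dtype choices are arbitrary:

import torch
import transformer_engine_torch as tex
from transformer_engine.common.recipe import Format
from transformer_engine.pytorch.fp8 import _default_sf_compute
from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer

tensor = torch.randn(1024, 1024).cuda()

# Per-tensor "current" scale, computed from this tensor's amax and the E4M3 max representable value.
amax = tensor.abs().max().float()
one = torch.ones(1, device=tensor.device)
scale = _default_sf_compute(amax, one, Format.E4M3.value.max_fwd, 0)

quantizer = Float8Quantizer(
    scale=scale.clone(),
    amax=torch.zeros(1, device=tensor.device),
    fp8_dtype=tex.DType.kFloat8E4M3,
)
fp8_tensor = quantizer(tensor)       # cast to FP8 (returns a Float8Tensor)
dequant = fp8_tensor.dequantize()    # back to high precision -> the fake-quantized tensor

# Quantization error and the underflows% statistic (share of elements stored as 0 in FP8).
max_err = (tensor - dequant).abs().max().item()
underflow_pct = (fp8_tensor._data == 0).sum().item() * 100.0 / tensor.numel()
print(f"max abs error {max_err:.4e}, underflows {underflow_pct:.3f}%")

The numerics tests below compare a TransformerEngine Linear run under these debug features against an emulated linear (_emulate_linear) assembled from exactly these pieces.
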
+ +import functools +import itertools +import os +import random +import tempfile +from string import Template + +import pytest +import torch + +import nvdlfw_inspect.api as debug_api +import transformer_engine.debug +import transformer_engine.pytorch as tepytorch +import transformer_engine_torch as tex +from transformer_engine.common.recipe import DelayedScaling, Format +from transformer_engine.pytorch.fp8 import _default_sf_compute +from transformer_engine.pytorch.tensor.float8_tensor import ( + Float8Quantizer, + Float8CurrentScalingQuantizer, +) +from transformer_engine.pytorch.module.base import ( + _2X_ACC_DGRAD, + _2X_ACC_FPROP, + _2X_ACC_WGRAD, +) + +all_boolean = [True, False] +FP8_FORMAT = Format.HYBRID +AMAX_HISTORY_LEN = 16 +FP8_RECIPE = DelayedScaling( + fp8_format=FP8_FORMAT, amax_history_len=AMAX_HISTORY_LEN, amax_compute_algo="max" +) +SEED = 1234 +IN_SIZE = 128 +OUT_SIZE = 64 +BATCH_SIZE = 16 +SEQ_LEN = 128 +LOSS_FN = torch.nn.functional.cross_entropy + + +def _cast_to_fp8(tensor, scale, dtype): + tensor = tensor.contiguous() + if type(scale) == torch.Tensor: + amax = scale.abs().max().float() + quantizer = Float8Quantizer(scale, amax, dtype) + else: + quantizer = Float8CurrentScalingQuantizer(scale, device=tensor.device) + + return quantizer(tensor) + + +def _get_current_scale(tensor, fp8_dtype): + if fp8_dtype == tex.DType.kFloat8E4M3: + fp8_max = Format.E4M3.value.max_fwd + else: + fp8_max = Format.E5M2.value.max_fwd + + amax = tensor.abs().max().float() + one = torch.ones(1, device=tensor.device) + + return _default_sf_compute(amax, one, fp8_max, 0).detach() + + +def _fake_cast(tensor, fp8_dtype, scale): + scale = scale or _get_current_scale(tensor, fp8_dtype) + fp8_tensor = _cast_to_fp8(tensor, scale, fp8_dtype) + + return fp8_tensor.dequantize() + + +def _fp8_gemm_kernel(tensor1, scale1, dtype1, tensor2, scale2, dtype2, use_split_accumulator): + fp8_tensor1 = _cast_to_fp8(tensor1, scale1, dtype1) + fp8_tensor2 = _cast_to_fp8(tensor2, scale2, dtype2) + + out, *_ = tepytorch.cpp_extensions.general_gemm( + fp8_tensor1, + fp8_tensor2, + tepytorch.module.base.get_workspace(), + torch.float32, + use_split_accumulator=use_split_accumulator, + ) + out.requires_grad = True + return out.T + + +def _emulate_linear( + input: torch.Tensor, + weight: torch.Tensor, + fprop_fp8: bool = False, + fprop_input_fake_quant: tex.DType = None, + fprop_input_scale: torch.Tensor = None, + fprop_weight_fake_quant: tex.DType = None, + fprop_weight_scale: torch.Tensor = None, + dgrad_fp8: bool = False, + dgrad_gradient_fake_quant: tex.DType = None, + dgrad_gradient_scale: torch.Tensor = None, + dgrad_weight_fake_quant: tex.DType = None, + dgrad_weight_scale: torch.Tensor = None, + wgrad_fp8: bool = False, + wgrad_gradient_fake_quant: tex.DType = None, + wgrad_gradient_scale: torch.Tensor = None, + wgrad_input_fake_quant: tex.DType = None, + wgrad_input_scale: torch.Tensor = None, + loss_multiplier: float = 1.0, + activation_sync=None, + gradient_sync=None, +): + _scalar = lambda x: torch.Tensor([x]).cuda() if type(x) in [float, torch.Tensor] else x + if fprop_fp8: + activation = _fp8_gemm_kernel( + input, + _scalar(fprop_input_scale or 1.0), + tex.DType.kFloat8E4M3, + weight, + _scalar(fprop_weight_scale or 1.0), + tex.DType.kFloat8E4M3, + _2X_ACC_FPROP, + ) + activation = activation.clone().detach().contiguous().requires_grad_(True) + else: + fprop_input = ( + _fake_cast(input, fprop_input_fake_quant, _scalar(fprop_input_scale)) + if fprop_input_fake_quant is not None + else input + ) + 
fprop_weight = ( + _fake_cast(weight, fprop_weight_fake_quant, _scalar(fprop_weight_scale)) + if fprop_weight_fake_quant is not None + else weight + ) + + activation = (fprop_input @ fprop_weight.T).contiguous() + + if activation_sync: + activation = activation_sync(activation) + + activation.retain_grad() + + (loss_multiplier * activation.sum()).backward(retain_graph=True) + gradient = activation.grad.clone() + + if gradient_sync: + gradient = gradient_sync(gradient) + + if dgrad_fp8: + dgrad = _fp8_gemm_kernel( + weight.T, + _scalar(dgrad_weight_scale or 1.0), + tex.DType.kFloat8E4M3, + gradient, + _scalar(dgrad_gradient_scale or 1.0), + tex.DType.kFloat8E5M2, + _2X_ACC_DGRAD, + ).T + else: + dgrad_gradient = ( + _fake_cast(gradient, dgrad_gradient_fake_quant, _scalar(dgrad_gradient_scale)) + if dgrad_gradient_fake_quant is not None + else gradient + ) + + dgrad_weight = ( + _fake_cast(weight, dgrad_weight_fake_quant, _scalar(dgrad_weight_scale)) + if dgrad_weight_fake_quant is not None + else weight + ) + dgrad = dgrad_gradient @ dgrad_weight + + if wgrad_fp8: + wgrad = _fp8_gemm_kernel( + input.T, + _scalar(wgrad_input_scale or 1.0), + tex.DType.kFloat8E4M3, + gradient.T, + _scalar(wgrad_gradient_scale or 1.0), + tex.DType.kFloat8E5M2, + _2X_ACC_WGRAD, + ).T + else: + wgrad_gradient = ( + _fake_cast(gradient, wgrad_gradient_fake_quant, _scalar(wgrad_gradient_scale)) + if wgrad_gradient_fake_quant is not None + else gradient + ) + wgrad_input = ( + _fake_cast(input, wgrad_input_fake_quant, _scalar(wgrad_input_scale)) + if wgrad_input_fake_quant is not None + else input + ) + wgrad_input = wgrad_input.contiguous() + wgrad_gradient = wgrad_gradient.contiguous() + wgrad, *_ = tepytorch.cpp_extensions.general_gemm( + wgrad_input, + wgrad_gradient, + tepytorch.module.base.get_workspace(), + torch.float32, + layout="NT", + grad=True, + use_split_accumulator=_2X_ACC_WGRAD, + ) + + return {"activation": activation, "wgrad": wgrad, "dgrad": dgrad} + + +def _init_debug(config_name, log_dir, feature_dirs): + debug_api.initialize( + config_file=config_name, + feature_dirs=feature_dirs, + log_dir=log_dir, + default_logging_enabled=True, + ) + + +def create_config_file(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file: + with tempfile.TemporaryDirectory() as temp_dir: + try: + kwargs["config_file"] = temp_file + kwargs["log_dir"] = temp_dir + result = func(*args, **kwargs) + finally: + temp_file_name = temp_file.name + debug_api.end_debug() + os.unlink(temp_file_name) + return result + + return wrapper + + +def _cmp(ground_truth, output): + torch.testing.assert_close(ground_truth["activation"], output["activation"]) + torch.testing.assert_close(ground_truth["wgrad"], output["wgrad"]) + torch.testing.assert_close(ground_truth["dgrad"], output["dgrad"]) + + +def _init_model(weight): + model = transformer_engine.pytorch.Linear(IN_SIZE, OUT_SIZE, name="linear") + with torch.no_grad(): + model.weight.copy_(weight.contiguous()) + return model + + +def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None): + with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + y = model(x, is_first_microbatch=is_first_microbatch) + (y.sum() * loss_scale).backward() + debug_api.step() + return y + + +def _get_tensors(): + torch.manual_seed(SEED) + x = torch.randn((SEQ_LEN * BATCH_SIZE, IN_SIZE), requires_grad=True).cuda() + x.retain_grad() + weight = torch.randn((OUT_SIZE, IN_SIZE)).cuda() + return x, 
weight + + +DISABLE_FP8_CONFIG = Template( + """disable_fp8_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [$gemms] +""" +) + + +@pytest.mark.parametrize("fprop_fp8", all_boolean) +@pytest.mark.parametrize("dgrad_fp8", all_boolean) +@pytest.mark.parametrize("wgrad_fp8", all_boolean) +def test_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8): + run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8) + + +def disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, config_file): + gemms = "" + if not fprop_fp8: + gemms += "fprop," + if not dgrad_fp8: + gemms += "dgrad," + if not wgrad_fp8: + gemms += "wgrad," + if len(gemms) > 0: + gemms = gemms[:-1] # remove last ',' + config_file.write(DISABLE_FP8_CONFIG.safe_substitute(gemms=gemms)) + config_file.flush() + + +@create_config_file +def run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8, **kwargs): + disable_fp8_gemms_create_config(fprop_fp8, dgrad_fp8, wgrad_fp8, kwargs["config_file"]) + fp8_kwargs = { + "fprop_fp8": fprop_fp8, + "dgrad_fp8": dgrad_fp8, + "wgrad_fp8": wgrad_fp8, + } + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + x, weight = _get_tensors() + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + + x.grad.zero_() + ground_truth = _emulate_linear(x, weight, **fp8_kwargs) + _cmp(ground_truth, output) + + +def test_disable_fp8_layer(feature_dirs): + run_disable_fp8_layer(feature_dirs) + + +DISABLE_FP8_LAYER_CONFIG = """disable_fp8_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + DisableFP8Layer: + enabled: True +""" + + +@create_config_file +def run_disable_fp8_layer(feature_dirs, **kwargs): + kwargs["config_file"].write(DISABLE_FP8_LAYER_CONFIG) + kwargs["config_file"].flush() + + x, weight = _get_tensors() + + ground_truth = _emulate_linear(x, weight) + x.grad.zero_() + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + _cmp(ground_truth, output) + + +random.seed(1234) + +all_combinations = list(itertools.product(all_boolean, repeat=6)) +subset_combinations = random.sample(all_combinations, 20) + + +@pytest.mark.parametrize( + "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad", + subset_combinations, +) +def test_per_tensor_scaling( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad +): + if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]): + pytest.skip("Skipping test because all parameters are False") + run_per_tensor_scaling( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +PER_TENSOR_SCALING_CONFIG = Template( + """per_tensor_scaling_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + PerTensorScaling: + enabled: True + gemms_struct: +$gemms +""" +) + + +def _prepare_per_tensor_scaling_config( + fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad, config_file +): + gemms = "" + title = lambda x: f" - gemm: {x}\n tensors: [" + + def add_tensor(if_add, gemm_name): + nonlocal gemms + if if_add: + gemms += gemm_name + "," + + if 
fprop_inp or fprop_weight: + gemms += title("fprop") + add_tensor(fprop_inp, "activation") + add_tensor(fprop_weight, "weight") + gemms = gemms[:-1] + "]\n" + if dgrad_weight or dgrad_grad: + gemms += title("dgrad") + add_tensor(dgrad_weight, "weight") + add_tensor(dgrad_grad, "gradient") + gemms = gemms[:-1] + "]\n" + if wgrad_input or wgrad_grad: + gemms += title("wgrad") + add_tensor(wgrad_input, "activation") + add_tensor(wgrad_grad, "gradient") + gemms = gemms[:-1] + "]\n" + config_file.write(PER_TENSOR_SCALING_CONFIG.safe_substitute(gemms=gemms)) + config_file.flush() + + +def set_scaling_factors(model, input_kwargs, fp8_kwargs): + # Copy fp8 scaling factors into fp8_kwargs dict if respective flag in input_kwargs is set. + if not input_kwargs["fprop_inp"]: + fp8_kwargs["fprop_input_scale"] = model.fp8_meta["scaling_fwd"].scale[0].clone() + if not input_kwargs["fprop_weight"]: + fp8_kwargs["fprop_weight_scale"] = model.fp8_meta["scaling_fwd"].scale[1].clone() + if not input_kwargs["dgrad_grad"]: + fp8_kwargs["dgrad_gradient_scale"] = model.fp8_meta["scaling_bwd"].scale[0].clone() + if not input_kwargs["dgrad_weight"]: + fp8_kwargs["dgrad_weight_scale"] = model.fp8_meta["scaling_fwd"].scale[1].clone() + if not input_kwargs["wgrad_grad"]: + fp8_kwargs["wgrad_gradient_scale"] = model.fp8_meta["scaling_bwd"].scale[0].clone() + if not input_kwargs["wgrad_input"]: + fp8_kwargs["wgrad_input_scale"] = model.fp8_meta["scaling_fwd"].scale[0].clone() + + +def set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs): + # Compute per tensor scaling factor if respective flag in input_kwargs is set. + if input_kwargs["fprop_inp"]: + fp8_kwargs["fprop_input_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["fprop_weight"]: + fp8_kwargs["fprop_weight_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["dgrad_grad"]: + fp8_kwargs["dgrad_gradient_scale"] = tex.DType.kFloat8E5M2 + if input_kwargs["dgrad_weight"]: + fp8_kwargs["dgrad_weight_scale"] = tex.DType.kFloat8E4M3 + if input_kwargs["wgrad_grad"]: + fp8_kwargs["wgrad_gradient_scale"] = tex.DType.kFloat8E5M2 + if input_kwargs["wgrad_input"]: + fp8_kwargs["wgrad_input_scale"] = tex.DType.kFloat8E4M3 + + +@create_config_file +def run_per_tensor_scaling( + feature_dirs, + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + **kwargs, +): + input_kwargs = { + "fprop_inp": fprop_inp, + "fprop_weight": fprop_weight, + "dgrad_weight": dgrad_weight, + "dgrad_grad": dgrad_grad, + "wgrad_input": wgrad_input, + "wgrad_grad": wgrad_grad, + } + fp8_kwargs = { + "fprop_fp8": True, + "dgrad_fp8": True, + "wgrad_fp8": True, + } + """ + Runs a test to validate per-tensor (current) scaling in FP8 computations. + The function performs warm-up iterations to populate the amax buffer of the model and compute scaling factors based on delayed scaling. + Subsequently, weights and inputs are switched to ensure their current scaling factors differ from those based on delayed scaling; + similarly, the loss is multiplied by a large factor to alter the gradient's magnitude, + creating a discrepancy between the original (delayed) and per-tensor (current) scaling factors. 
+    Finally, a linear pass is emulated, and the results are compared.
+    """
+    _prepare_per_tensor_scaling_config(
+        fprop_inp,
+        fprop_weight,
+        dgrad_weight,
+        dgrad_grad,
+        wgrad_input,
+        wgrad_grad,
+        kwargs["config_file"],
+    )
+    _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs)
+
+    warmup_input, warmup_weight = _get_tensors()
+    model = _init_model(warmup_weight)
+
+    # Warmup run to setup amax and scaling factors.
+    for _ in range(AMAX_HISTORY_LEN):
+        _run_forward_backward(warmup_input, model)
+
+    x = torch.randn_like(warmup_input, requires_grad=True).cuda()
+    weight = torch.randn_like(warmup_weight, requires_grad=True).cuda()
+    model.weight.data = weight.data
+    x.retain_grad()
+
+    # Delayed scaling factors need to be collected
+    # before the forward pass with the test data,
+    # because that forward pass changes the scaling factors.
+    set_scaling_factors(model, input_kwargs, fp8_kwargs)
+
+    LOSS_MULTIPLIER = 100
+
+    with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+        y = model(x, is_first_microbatch=True)
+    model.zero_grad()
+    y.retain_grad()
+    (
+        LOSS_MULTIPLIER * y.sum()
+    ).backward()  # Loss multiplication to change gradient's order of magnitude
+
+    output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()}
+
+    # Per-tensor (current) scaling factors need to be
+    # collected after the forward pass with the test data,
+    # because the gradient (y.grad) cannot be accessed
+    # before that forward pass.
+    set_current_scaling_factors(x, weight, y, input_kwargs, fp8_kwargs)
+
+    ground_truth = _emulate_linear(x, weight, loss_multiplier=LOSS_MULTIPLIER, **fp8_kwargs)
+    _cmp(ground_truth, output)
+
+
+@pytest.mark.parametrize(
+    "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad",
+    subset_combinations,
+)
+def test_microbatching_per_tensor_scaling(
+    feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
+):
+    if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
+        pytest.skip("Skipping test because all parameters are False")
+
+    @create_config_file
+    def run_microbatching_test(
+        feature_dirs,
+        fprop_inp,
+        fprop_weight,
+        dgrad_weight,
+        dgrad_grad,
+        wgrad_input,
+        wgrad_grad,
+        **kwargs,
+    ):
+        # Prepare the configuration file
+        _prepare_per_tensor_scaling_config(
+            fprop_inp,
+            fprop_weight,
+            dgrad_weight,
+            dgrad_grad,
+            wgrad_input,
+            wgrad_grad,
+            kwargs["config_file"],
+        )
+
+        # Initialize debug
+        _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs)
+
+        # Get data
+        x_full, weight = _get_tensors()
+        microbatch_size = x_full.size(0) // 2
+        x_mb1 = x_full[:microbatch_size, ...].clone().detach().requires_grad_(True)
+        x_mb2 = x_full[microbatch_size:, ...].clone().detach().requires_grad_(True)
+
+        def init_and_warmup():
+            model = _init_model(weight)
+            _run_forward_backward(x_mb1, model, loss_scale=0.5)
+            _run_forward_backward(x_mb2, model, loss_scale=0.5)
+            return model
+
+        # Run without is_first_microbatch
+
+        model = init_and_warmup()  # running next 2 iters does not change amaxes and scaling factors
+        y_mb1 = _run_forward_backward(x_mb1, model, loss_scale=0.5)
+        y_mb2 = _run_forward_backward(x_mb2, model, loss_scale=0.5)
+
+        # Collect outputs
+        output1 = {
+            "activation": torch.cat([y_mb1.clone(), y_mb2.clone()], dim=0),
+            "wgrad": model.weight.grad.clone(),
+            "dgrad": torch.cat([x_mb1.grad.clone(), x_mb2.grad.clone()], dim=0),
+        }
+
+        # Run with is_first_microbatch
+        model = init_and_warmup()  #
running next 2 iters does not change amaxes and scaling factors + y_mb1 = _run_forward_backward(x_mb1, model, loss_scale=0.5, is_first_microbatch=True) + y_mb2 = _run_forward_backward(x_mb2, model, loss_scale=0.5, is_first_microbatch=False) + + # Collect outputs + output2 = { + "activation": torch.cat([y_mb1.clone(), y_mb2.clone()], dim=0), + "wgrad": model.weight.grad.clone(), + "dgrad": torch.cat([x_mb1.grad.clone(), x_mb2.grad.clone()], dim=0), + } + + # Compare outputs + torch.testing.assert_close(output1["activation"], output2["activation"], atol=1.0, rtol=0.5) + torch.testing.assert_close(output1["dgrad"], output2["dgrad"], atol=1.0, rtol=0.5) + torch.testing.assert_close(output1["wgrad"], output2["wgrad"], atol=1.0, rtol=0.5) + + # Run the test + run_microbatching_test( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +all_combinations = list( + itertools.product([tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None], repeat=6) +) +subset_combinations = random.sample(all_combinations, 10) + + +@pytest.mark.parametrize( + "fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad", + subset_combinations, +) +def test_fake_quant_fp8( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad +): + run_fake_quant_fp8( + feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad + ) + + +FAKE_QUANT_CONFIG = Template( + """fake_quant_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + FakeQuant: + enabled: True + gemms_struct: +$gemms +""" +) + + +def fake_quant_fp8_create_config( + fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad, config_file +): + format_to_str = {tex.DType.kFloat8E4M3: "FP8E4M3", tex.DType.kFloat8E5M2: "FP8E5M2"} + gemms = "" + + def _add_tensor(quant_format, tensor): + nonlocal gemms + if quant_format: + gemms += " " * 8 + "- tensor: " + tensor + "\n" + gemms += " " * 8 + " quant_format: " + format_to_str[quant_format] + "\n" + + title = lambda x: f" - gemm: {x}\n tensors_struct:\n" + if fprop_inp or fprop_weight: + gemms += title("fprop") + _add_tensor(fprop_inp, "activation") + _add_tensor(fprop_weight, "weight") + gemms = gemms[:-1] + "\n" + if dgrad_weight or dgrad_grad: + gemms += title("dgrad") + _add_tensor(dgrad_weight, "weight") + _add_tensor(dgrad_grad, "gradient") + gemms = gemms[:-1] + "\n" + if wgrad_input or wgrad_grad: + gemms += title("wgrad") + _add_tensor(wgrad_input, "activation") + _add_tensor(wgrad_grad, "gradient") + gemms = gemms[:-1] + "\n" + config = FAKE_QUANT_CONFIG.safe_substitute(gemms=gemms) + config_file.write(config) + config_file.flush() + + +@create_config_file +def run_fake_quant_fp8( + feature_dirs, + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + **kwargs, +): + fp8_kwargs = { + "fprop_input_fake_quant": fprop_inp, + "fprop_weight_fake_quant": fprop_weight, + "dgrad_gradient_fake_quant": dgrad_grad, + "dgrad_weight_fake_quant": dgrad_weight, + "wgrad_gradient_fake_quant": wgrad_grad, + "wgrad_input_fake_quant": wgrad_input, + "fprop_fp8": not (fprop_inp or fprop_weight), + "dgrad_fp8": not (dgrad_weight or dgrad_grad), + "wgrad_fp8": not (wgrad_grad or wgrad_input), + } + fake_quant_fp8_create_config( + fprop_inp, + fprop_weight, + dgrad_weight, + dgrad_grad, + wgrad_input, + wgrad_grad, + kwargs["config_file"], + ) + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + + x, weight = 
_get_tensors() + model = _init_model(weight) + y = _run_forward_backward(x, model) + + output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()} + ground_truth = _emulate_linear(x, weight, **fp8_kwargs) + _cmp(ground_truth, output) diff --git a/tests/pytorch/debug/test_sanity.py b/tests/pytorch/debug/test_sanity.py new file mode 100644 index 000000000..6b0883b14 --- /dev/null +++ b/tests/pytorch/debug/test_sanity.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import functools +import itertools +import os +import random +import tempfile +from string import Template + +import pytest +import torch + +import nvdlfw_inspect.api as debug_api +import transformer_engine.debug +import transformer_engine.pytorch as te +import transformer_engine_torch as tex +from transformer_engine.common.recipe import DelayedScaling, Format +from transformer_engine.pytorch.constants import TE_DType +from transformer_engine.pytorch.fp8 import _default_sf_compute +from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer + +from test_numerics import create_config_file + +B, S, H, D = 64, 64, 64, 64 + +model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"] + +configs = { + "": "", + "log": """log: + layers: + layer_types: [linear] + enabled: + True + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation, gradient, weight, output, wgrad, dgrad] + stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range] + start_step : 0 + end_step: 1 + LogFp8TensorStats: + enabled: True + tensors: [activation, gradient, weight] + stats: [underflows, overflows] + start_step : 0 + end_step: 1 +""", + "fake_quant": """ +fake_quant_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + FakeQuant: + enabled: True + gemms: [fprop, dgrad, wgrad] + quant_format: FP8E5M2 +""", +} + + +def _get_model(model_key): + if model_key == "linear": + return te.Linear(D, D) + if model_key == "layernorm_linear": + return te.LayerNormLinear(D, D) + if model_key == "layernorm_mlp": + return te.LayerNormMLP(D, D, D) + if model_key == "mha_attention": + return te.MultiheadAttention(D, H) + if model_key == "transformer_layer": + return te.TransformerLayer(D, D, H) + + +def _run_forward_backward(model, fp8): + for _ in range(3): + inp = torch.randn((S, B, H)).cuda() + with te.fp8_autocast(enabled=fp8): + out = model(inp) + out.sum().backward() + debug_api.step() + + +@create_config_file +def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir): + try: + if config != "": + config_file.write(config) + config_file.flush() + config_file_name = config_file.name if config != "" else "" + debug_api.initialize(feature_dirs=feature_dirs, config_file=config_file_name) + model = _get_model(model_key) + _run_forward_backward(model, fp8) + except Exception as error: + raise error + finally: + debug_api.end_debug() + + +@pytest.mark.parametrize("model_key", model_keys) +@pytest.mark.parametrize("fp8", [False, True]) +@pytest.mark.parametrize("config_key", configs.keys()) +def test_sanity_debug(model_key, fp8, config_key, feature_dirs): + _run_test(model_key, fp8, configs[config_key], feature_dirs) diff --git a/tests/pytorch/debug/utils.py b/tests/pytorch/debug/utils.py new file mode 100644 index 000000000..f03ee56b5 --- /dev/null +++ b/tests/pytorch/debug/utils.py @@ -0,0 +1,22 @@ +# Copyright (c) 
2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+import os
+
+LOG_FILE = os.path.join("nvdlfw_inspect_logs", "nvdlfw_inspect_globalrank-0.log")
+
+
+def reset_debug_log():
+    if os.path.isfile(LOG_FILE):
+        # delete all content
+        with open(LOG_FILE, "w") as f:
+            pass
+
+
+def check_debug_log(msg):
+    with open(LOG_FILE, "r") as f:
+        for line in f.readlines():
+            if msg in line:
+                return True
+    return False
diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
index ac72960c4..a505d0179 100644
--- a/tests/pytorch/distributed/run_numerics.py
+++ b/tests/pytorch/distributed/run_numerics.py
@@ -40,6 +40,18 @@
 LOSS_FN = nn.MSELoss()
 QUANTIZATION = None
 
+if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False):
+    # The numerics of all the layers should work the same
+    # when debug=True. They are fed with a dummy feature
+    # to prevent debug from being switched off, which can
+    # happen if no feature is active.
+    import nvdlfw_inspect.api as debug_api
+
+    debug_api.initialize(
+        os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
+        feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
+    )
+
 
 # Disable TF32
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index 3b56796cc..6d9a4412e 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -120,6 +120,20 @@ def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq
 mask_types = ["causal", "no_mask"]
 
+NVTE_TEST_NVINSPECT_ENABLED = os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False)
+
+if NVTE_TEST_NVINSPECT_ENABLED:
+    # The numerics of all the layers should work the same
+    # when debug=True. They are fed with a dummy feature
+    # to prevent debug from being switched off, which can
+    # happen if no feature is active.
+ import nvdlfw_inspect.api as debug_api + + debug_api.initialize( + os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"], + feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"], + ) + fp8_recipes = [ recipe.MXFP8BlockScaling(), recipe.DelayedScaling(), @@ -621,6 +635,8 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_m pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -741,6 +757,8 @@ def test_gpt_full_activation_recompute( use_cast_transpose_triton = bool( int(os.environ.get('NVTE_USE_CAST_TRANSPOSE_TRITON', '0')) ) if fp8 and recipe.float8_current_scaling() and use_cast_transpose_triton: pytest.skip("Float8 Current Scaling unsupported for full recompute.") + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -1957,6 +1975,8 @@ def test_grouped_linear_accuracy( pytest.skip(reason_for_no_fp8) if fp8 and recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -2155,6 +2175,8 @@ def test_padding_grouped_linear_accuracy( pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) @@ -2276,6 +2298,8 @@ def test_gpt_cuda_graph(dtype, bs, model): if use_fa: pytest.skip(f"ROCm flash attention does not support cuda graph with {dtype}") + if NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("Cuda Graphs are not supported in debug mode.") config = model_configs[model] sigma = 0.023 @@ -2373,6 +2397,8 @@ def test_gpt_fp8_parameters(dtype, bs, model, recipe): pytest.skip(reason_for_no_fp8) if recipe.mxfp8() and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) + if NVTE_TEST_NVINSPECT_ENABLED: + pytest.skip("FP8 parameters are not supported in debug mode.") if recipe.float8_block_scaling() and not fp8_block_scaling_available: pytest.skip(reason_for_no_fp8_block_scaling) diff --git a/transformer_engine/debug/features/api.py b/transformer_engine/debug/features/api.py index 887043c42..13ab6040d 100644 --- a/transformer_engine/debug/features/api.py +++ b/transformer_engine/debug/features/api.py @@ -12,7 +12,7 @@ import torch from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS -from transformer_engine.pytorch.tensor import all_tensor_types +from transformer_engine.pytorch.tensor import get_all_tensor_types from transformer_engine.debug.pytorch.debug_state import TEDebugState from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor @@ -424,7 +424,7 @@ def output_assertions_hook(self, api_name, ret, **kwargs): if api_name in ["inspect_tensor", "inspect_tensor_postquantize"]: assert ret is None if api_name == "modify_tensor": - 
assert type(ret) in all_tensor_types + assert type(ret) in get_all_tensor_types() if ( type(ret) == torch.Tensor # pylint: disable=unidiomatic-typecheck and "dtype" in kwargs @@ -438,4 +438,4 @@ def step(self): def end_debug(self): """This function is called by the nvidia-dlframework-inspect after every debug_api.end_debug()""" - TEDebugState.reset() + TEDebugState._reset() diff --git a/transformer_engine/debug/features/fake_quant.py b/transformer_engine/debug/features/fake_quant.py index bab4b4dcf..4a5b6c34a 100644 --- a/transformer_engine/debug/features/fake_quant.py +++ b/transformer_engine/debug/features/fake_quant.py @@ -49,7 +49,7 @@ def fake_quantize(tensor: torch.Tensor, fp8_format: tex.DType, out=None): fp8_dtype = tex.DType.kFloat8E5M2 amax = tensor.abs().max().float() one = torch.ones(1, device=tensor.device) - scale = _default_sf_compute(amax, one, fp8_max) + scale = _default_sf_compute(amax, one, fp8_max, 0) quantizer = Float8Quantizer(scale, amax, fp8_dtype) else: diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index 4ca2a8ed3..e5c84a9bd 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -120,7 +120,6 @@ def inspect_tensor_postquantize( if not rowwise: return # tensor was already seen rowwise in the other gemm - tensor = tensor._data options = ( config.get("start_step", None), config.get("end_step", None), diff --git a/transformer_engine/debug/features/per_tensor_scaling.py b/transformer_engine/debug/features/per_tensor_scaling.py index eabb6304a..d648b517d 100644 --- a/transformer_engine/debug/features/per_tensor_scaling.py +++ b/transformer_engine/debug/features/per_tensor_scaling.py @@ -15,6 +15,7 @@ from transformer_engine.pytorch.tensor import Quantizer from transformer_engine.pytorch.tensor.float8_tensor import ( Float8Tensor, + Float8Quantizer, Float8CurrentScalingQuantizer, ) from transformer_engine.debug.features.api import TEConfigAPIMapper @@ -39,7 +40,7 @@ def per_tensor_cast( }, "[NVTORCH INSPECT ERROR] Only 2 FP8 types: E4M3 and E5M2 are supported in TE." tensor = tensor.contiguous() - quantizer = Float8CurrentScalingQuantizer(fp8_dtype) + quantizer = Float8CurrentScalingQuantizer(fp8_dtype, device=tensor.device) if out is not None: quantizer.update_quantized(tensor, out) @@ -118,7 +119,7 @@ def modify_tensor( if key not in ["gemm", "tensor"]: raise ValueError(f'[NVTORCH INSPECT ERROR] Unexpected key in config: "{key}".') - assert isinstance(default_quantizer, Float8CurrentScalingQuantizer), ( + assert isinstance(default_quantizer, Float8Quantizer), ( f"[NVTORCH INSPECT ERROR] Feature={self.__class__.__name__}, API=process_tensor: " "Per-tensor current scaling can be used only within `DelayedScaling` recipe autocast." 
f" {layer_name}" diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index 84a740161..d111e4890 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -96,7 +96,10 @@ def _get(buffers, stat_name): "max": (torch.max, lambda buffers: max(_get(buffers, "max"))), "sum": (torch.sum, lambda buffers: sum(_get(buffers, "sum"))), "mean": (torch.mean, lambda buffers: sum(_get(buffers, "sum")) / sum(_get(buffers, "numel"))), - "numel": (lambda x: x.numel(), lambda buffers: sum(_get(buffers, "numel"))), + "numel": ( + lambda x: x.numel() if hasattr(x, "numel") else x.get_data_tensors()[0].numel(), + lambda buffers: sum(_get(buffers, "numel")), + ), "l1_norm": (lambda x: torch.norm(x, p=1), lambda buffers: sum(_get(buffers, "l1_norm"))), "l2_norm_square": ( lambda x: torch.sum(x**2), @@ -137,7 +140,7 @@ def _get(buffers, stat_name): - min(_get(buffers, "dynamic_range_bottom")), ), "underflows%": ( - lambda x: (x == 0).sum() / x.numel() * 100, + lambda x: (x.get_data_tensors()[0] == 0).sum() / x.get_data_tensors()[0].numel() * 100, lambda buffers: 100 * sum(_get(buffers, "underflows_num")) / sum(_get(buffers, "numel")), ), } diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py index 4a7a156a0..b725d3ab3 100644 --- a/transformer_engine/debug/pytorch/debug_quantization.py +++ b/transformer_engine/debug/pytorch/debug_quantization.py @@ -18,6 +18,7 @@ from transformer_engine.pytorch.tensor.quantized_tensor import ( QuantizedTensor, Quantizer, + QuantizedTensorBase, prepare_for_saving, restore_from_saved, ) @@ -299,8 +300,9 @@ def quantize( iteration=self.iteration, dtype=dtype, ) - if columnwise_gemm_tensor.dtype != dtype: - raise ValueError("Dtype does not match the output of the modify_tensor call") + if dtype is not None: + if columnwise_gemm_tensor.dtype != dtype: + raise ValueError("Dtype does not match the output of the modify_tensor call") if self.rowwise_tensor_plan == API_CALL_MODIFY: rowwise_gemm_tensor = debug_api.transformer_engine.modify_tensor( layer_name=self.layer_name, @@ -311,8 +313,9 @@ def quantize( iteration=self.iteration, dtype=dtype, ) - if rowwise_gemm_tensor.dtype != dtype: - raise ValueError("Dtype does not match the output of the modify_tensor call") + if dtype is not None: + if rowwise_gemm_tensor.dtype != dtype: + raise ValueError("Dtype does not match the output of the modify_tensor call") # 3. If some tensors still are not defined we use high precision tensor. if self.rowwise_tensor_plan == HIGH_PRECISION: @@ -332,6 +335,7 @@ def quantize( quantizer=self, layer_name=self.layer_name, tensor_name=self.tensor_name, + original_tensor=tensor, ) def process_gemm_output(self, tensor: torch.Tensor): @@ -456,7 +460,7 @@ def any_feature_enabled(self) -> bool: return False -class DebugQuantizedTensor: +class DebugQuantizedTensor(QuantizedTensorBase): """ Class containing quantized tensors after debug. Depending on configuration it can contain one or two different objects. 
These objects can be accessed by the method @@ -470,6 +474,7 @@ def __init__( quantizer, layer_name=None, tensor_name=None, + original_tensor=None, ): self.rowwise_gemm_tensor = rowwise_gemm_tensor @@ -477,6 +482,7 @@ def __init__( self.quantizer = quantizer self._layer_name = layer_name self._tensor_name = tensor_name + self._original_tensor = original_tensor def prepare_for_saving(self): """ " Prepare for saving method override""" @@ -524,5 +530,5 @@ def size(self): """Size of the tensor.""" return self.rowwise_gemm_tensor.size() - def update_usage(self, rowwise_usage: bool, columnwise_usage: bool): + def update_usage(self, rowwise_usage: bool = None, columnwise_usage: bool = None): """Update usage of the tensor.""" diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py index ea601397a..1d788148d 100644 --- a/transformer_engine/pytorch/distributed.py +++ b/transformer_engine/pytorch/distributed.py @@ -1243,12 +1243,18 @@ def gather_along_first_dim( final_quantizer = ( None if not needs_quantized_gemm(inp, rowwise=True) else quantizer.parent_quantizer ) + # Temporary fix for TP communication of Float8BlockwiseQTensorBase + if isinstance(rowwise, Float8BlockwiseQTensorBase): + rowwise = inp._original_tensor rowwise_total = gather_along_first_dim(rowwise, process_group, False, final_quantizer)[0] out_obj.rowwise_gemm_tensor = rowwise_total if rowwise is not columnwise: final_quantizer_columnwise = ( None if not needs_quantized_gemm(inp, rowwise=False) else quantizer.parent_quantizer ) + # Temporary fix for TP communication of Float8BlockwiseQTensorBase + if isinstance(columnwise, Float8BlockwiseQTensorBase): + columnwise = inp._original_tensor columnwise_total, _ = gather_along_first_dim( columnwise, process_group, False, final_quantizer_columnwise ) diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index a8b110690..d999efa3c 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -1072,7 +1072,12 @@ def grad_output_preprocess( if ( isinstance( grad_output_.get_tensor(True), - (QuantizedTensor, Float8TensorBase, MXFP8TensorBase), + ( + QuantizedTensor, + Float8TensorBase, + MXFP8TensorBase, + Float8BlockwiseQTensorBase, + ), ) and ctx.use_bias ): diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py index a31823641..53f399d3d 100644 --- a/transformer_engine/pytorch/module/layernorm_linear.py +++ b/transformer_engine/pytorch/module/layernorm_linear.py @@ -205,6 +205,7 @@ def forward( # or if a gather of ln_out must be in high precision. with_quantized_norm = ( fp8 + and not debug and not return_layernorm_output and not return_layernorm_output_gathered and not force_hp_blockwise_ln_out_gather From 74525d1291a12c6c10b463b059395e6de533a829 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Mon, 19 May 2025 14:25:36 -0700 Subject: [PATCH 02/26] Fix README render for uploading package to PyPI (#1798) * Fix README render on PyPI Signed-off-by: Kirthi Shankar Sivamani * Update README.rst Signed-off-by: Kirthi Shankar Sivamani * Use anonymous hyperlink for duplicate. Fix indent. 
Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- README.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 49e19bd7e..09f204f68 100644 --- a/README.rst +++ b/README.rst @@ -450,7 +450,7 @@ Installation ============ System Requirements -^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^ * **Hardware:** Blackwell, Hopper, Grace Hopper/Blackwell, Ada, Ampere @@ -468,10 +468,10 @@ System Requirements * **Notes:** FP8 features require Compute Capability 8.9+ (Ada/Hopper/Blackwell) Installation Methods -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ Docker (Recommended) -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ The quickest way to get started with Transformer Engine is by using Docker images on `NVIDIA GPU Cloud (NGC) Catalog `_. @@ -496,7 +496,7 @@ Where 25.04 (corresponding to April 2025 release) is the container version. * NGC PyTorch 23.08+ containers include FlashAttention-2 pip Installation -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^ **Prerequisites for pip installation:** @@ -534,7 +534,7 @@ Source Installation `See the installation guide `_ Environment Variables -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^ These environment variables can be set before installation to customize the build process: * **CUDA_PATH**: Path to CUDA installation @@ -545,7 +545,7 @@ These environment variables can be set before installation to customize the buil * **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job Compiling with FlashAttention -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Transformer Engine supports both FlashAttention-2 and FlashAttention-3 in PyTorch for improved performance. FlashAttention-3 was added in release v1.11 and is prioritized over FlashAttention-2 when both are present in the environment. You can verify which FlashAttention version is being used by setting these environment variables: @@ -557,8 +557,9 @@ You can verify which FlashAttention version is being used by setting these envir It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug `_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue. .. 
troubleshooting-begin-marker-do-not-remove + Troubleshooting -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^ **Common Issues and Solutions:** @@ -692,7 +693,7 @@ Papers Videos ====== -* `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 `_ +* `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 `__ * `Blackwell Numerics for AI | GTC 2025 `_ * `Building LLMs: Accelerating Pretraining of Foundational Models With FP8 Precision | GTC 2025 `_ * `From FP8 LLM Training to Inference: Language AI at Scale | GTC 2025 `_ From cea11527603aad6277a6d7fa4fa4b4de500fb433 Mon Sep 17 00:00:00 2001 From: Evgeny Tsykunov Date: Mon, 19 May 2025 23:25:57 +0200 Subject: [PATCH 03/26] Enhance recipe compatibility (#1724) * Check tensor-recipe compatibility Signed-off-by: Evgeny Tsykunov * Tensor class in recipe, checking for *Base Signed-off-by: Evgeny Tsykunov * Extend recipe __repr__ with recipe_type Signed-off-by: Evgeny Tsykunov * Warn about recipe change Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Enable dynamic recipe change: clear fp8 workspace Signed-off-by: Evgeny Tsykunov * TE 1.x checkpoint compatibility Signed-off-by: Evgeny Tsykunov * Disable warning for recipe wrappers Signed-off-by: Evgeny Tsykunov * Test recipe change Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use QuantizedTensorBase Signed-off-by: Evgeny Tsykunov * Fix circular import Signed-off-by: Evgeny Tsykunov * Revert previous circular import fix Signed-off-by: Evgeny Tsykunov * Fix pytorch imports in common Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Let quantizer know about the recipe Signed-off-by: Evgeny Tsykunov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix imports Signed-off-by: Evgeny Tsykunov --------- Signed-off-by: Evgeny Tsykunov Signed-off-by: Kirthi Shankar Sivamani Co-authored-by: Przemyslaw Tredak Co-authored-by: Kirthi Shankar Sivamani --- tests/pytorch/test_recipe.py | 105 +++++++++++++++++- .../common/gemm/cublaslt_gemm.cu | 3 +- transformer_engine/common/recipe/__init__.py | 9 +- .../debug/pytorch/debug_quantization.py | 6 +- transformer_engine/pytorch/module/base.py | 63 ++++++++++- .../pytorch/tensor/float8_blockwise_tensor.py | 8 +- .../pytorch/tensor/float8_tensor.py | 13 ++- .../pytorch/tensor/mxfp8_tensor.py | 8 +- .../pytorch/tensor/quantized_tensor.py | 5 + 9 files changed, 210 insertions(+), 10 deletions(-) diff --git a/tests/pytorch/test_recipe.py b/tests/pytorch/test_recipe.py index 02ff9367a..8d379be7c 100644 --- a/tests/pytorch/test_recipe.py +++ b/tests/pytorch/test_recipe.py @@ -8,22 +8,32 @@ import pytest import torch +import warnings import transformer_engine.common.recipe import transformer_engine.pytorch as te +from transformer_engine.pytorch.tensor.float8_blockwise_tensor import Float8BlockQuantizer +from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Quantizer import transformer_engine_torch as tex from transformer_engine.pytorch.fp8 import ( FP8GlobalStateManager, _amax_and_scale_update, - get_default_fp8_recipe, + fp8_model_init, ) from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer import transformer_engine.pytorch.ops as te_ops from transformer_engine.pytorch.utils import is_fp8_fnuz +from 
transformer_engine.pytorch import Linear +from transformer_engine.pytorch.distributed import fp8_autocast +from transformer_engine.common.recipe import DelayedScaling, Float8BlockScaling, MXFP8BlockScaling import transformer_engine_torch as tex # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() +mxfp8_available, reason_for_no_mxfp8 = FP8GlobalStateManager.is_mxfp8_available() +fp8_block_scaling_available, reason_for_no_fp8_block_scaling = ( + FP8GlobalStateManager.is_fp8_block_scaling_available() +) # FP8 per tensor delayed scaling @@ -370,3 +380,96 @@ def setup_fp8_meta(): ) torch.testing.assert_close(fp8_meta[forward_key].scale, expected_scale) + + @pytest.mark.parametrize( + "model_init_recipe", + [ + pytest.param( + MXFP8BlockScaling(), + marks=pytest.mark.skipif(not mxfp8_available, reason=reason_for_no_mxfp8), + ), + pytest.param( + Float8BlockScaling(), + marks=pytest.mark.skipif( + not fp8_block_scaling_available, reason=reason_for_no_fp8_block_scaling + ), + ), + ], + ) + def test_check_for_weight_tensor_and_recipe_correspondence(self, model_init_recipe): + with fp8_model_init(enabled=True, recipe=model_init_recipe): + linear = Linear(32, 32).cuda() + + x = torch.randn(32, 32, device="cuda") + with fp8_autocast(enabled=True, fp8_recipe=DelayedScaling()): + with pytest.raises(RuntimeError) as excinfo: + _ = linear(x) + assert "Recipe mismatch for " in str(excinfo.value) + + @pytest.mark.parametrize( + "target_recipe_class, expected_quantizer_type, available_flag, reason", + [ + pytest.param( + MXFP8BlockScaling, + MXFP8Quantizer, + mxfp8_available, + reason_for_no_mxfp8, + id="DelayedScaling->MXFP8BlockScaling", + ), + pytest.param( + Float8BlockScaling, + Float8BlockQuantizer, + fp8_block_scaling_available, + reason_for_no_fp8_block_scaling, + id="DelayedScaling->Float8BlockScaling", + ), + ], + ) + def test_dynamic_recipe_update( + self, target_recipe_class, expected_quantizer_type, available_flag, reason + ): + if not available_flag: + pytest.skip(reason) + + in_features = 32 + out_features = 32 + batch_size = 32 + linear = Linear(in_features, out_features).cuda() + initial_recipe = DelayedScaling() + + # Run initial iterations with DelayedScaling + for _ in range(3): + x = torch.randn(batch_size, in_features, device="cuda") + with fp8_autocast(enabled=True, fp8_recipe=initial_recipe): + y = linear(x) + loss = y.mean() + loss.backward() + + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, Float8Quantizer) + + # Change recipe + target_recipe = target_recipe_class() + + # Run subsequent iterations with the target recipe + for i in range(3): + x = torch.randn(batch_size, in_features, device="cuda") + if i == 0: + # Expect a warning on the first iteration with the new recipe + with pytest.warns(UserWarning, match="Recipe type changed"): + with fp8_autocast(enabled=True, fp8_recipe=target_recipe): + y = linear(x) + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, expected_quantizer_type) + else: + # No warning expected on subsequent iterations + with warnings.catch_warnings(): + warnings.simplefilter("error") # Raise error if unexpected warning occurs + with fp8_autocast(enabled=True, fp8_recipe=target_recipe): + y = linear(x) + loss = y.mean() + loss.backward() + + # Final check + for quantizer in linear.quantizers["scaling_fwd"]: + assert isinstance(quantizer, expected_quantizer_type) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu 
b/transformer_engine/common/gemm/cublaslt_gemm.cu index 36cbcd330..07b256972 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -94,7 +94,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla A.scaling_mode == B.scaling_mode || (A.scaling_mode == NVTE_BLOCK_SCALING_1D && B.scaling_mode == NVTE_BLOCK_SCALING_2D) || (A.scaling_mode == NVTE_BLOCK_SCALING_2D && B.scaling_mode == NVTE_BLOCK_SCALING_1D), - "Inputs A and B to GEMM need to have compatible scaling modes!"); + "Inputs A and B to GEMM need to have compatible scaling modes, but got A.scaling_mode = " + + to_string(A.scaling_mode) + ", B.scaling_mode = " + to_string(B.scaling_mode)); NVTE_CHECK(A.has_data() || A.has_columnwise_data(), "Input A does not hold any data!"); NVTE_CHECK(B.has_data() || B.has_columnwise_data(), "Input B does not hold any data!"); GemmParam ret; diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 9426d1621..1cf974987 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -196,6 +196,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"margin={self.margin}, " f"format={str(self.fp8_format).split('.')[1]}, " f"amax_history_len={self.amax_history_len}, " @@ -261,6 +262,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"format={str(self.fp8_format).split('.')[1]}, " f"fp8_quant_fwd_inp={self.fp8_quant_fwd_inp}, " f"fp8_quant_fwd_weight={self.fp8_quant_fwd_weight}, " @@ -307,7 +309,11 @@ def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." 
def __repr__(self) -> str: - return f"margin={self.margin}, format={str(self.fp8_format).split('.')[1]}," + return ( + f"recipe_type={self.__class__.__name__}, " + f"margin={self.margin}, " + f"format={str(self.fp8_format).split('.')[1]}" + ) @dataclass() @@ -391,6 +397,7 @@ def __post_init__(self) -> None: def __repr__(self) -> str: return ( + f"recipe_type={self.__class__.__name__}, " f"format={str(self.fp8_format).split('.')[1]}, " f"fp8_quant_fwd_inp={self.fp8_quant_fwd_inp}, " f"fp8_quant_fwd_weight={self.fp8_quant_fwd_weight}, " diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py index b725d3ab3..4d61757e1 100644 --- a/transformer_engine/debug/pytorch/debug_quantization.py +++ b/transformer_engine/debug/pytorch/debug_quantization.py @@ -14,7 +14,7 @@ import transformer_engine_torch as tex - +from transformer_engine.common.recipe import Recipe from transformer_engine.pytorch.tensor.quantized_tensor import ( QuantizedTensor, Quantizer, @@ -459,6 +459,10 @@ def any_feature_enabled(self) -> bool: return True return False + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + """Probably not needed for debug quantizer""" + return None + class DebugQuantizedTensor(QuantizedTensorBase): """ diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index d999efa3c..1672bc6bd 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -51,7 +51,7 @@ from ..utils import is_non_tn_fp8_gemm_supported from ..tensor._internal.float8_blockwise_tensor_base import Float8BlockwiseQTensorBase -from ...common.recipe import Recipe +from ...common.recipe import DelayedScaling, Recipe from ...debug.pytorch.debug_state import TEDebugState from ...debug.pytorch.debug_quantization import DebugQuantizer, DebugQuantizedTensor @@ -826,6 +826,14 @@ def set_extra_state(self, state: Optional[torch.Tensor]) -> None: if state is None: return + # TE 1.x checkpoint compatibility: add DelayedScaling recipe if missing + if "recipe" not in state: + # TE 1.x only supported delayed scaling, which was the default recipe + state["recipe"] = DelayedScaling() + # TE 1.x also saved scale_inv, which is not needed with Recipe object + state.pop("scale_inv_fwd", None) + state.pop("scale_inv_bwd", None) + # Load extra items self.fp8_meta.update(state["extra_fp8_variables"]) self.fp8_meta["recipe"] = state["recipe"] @@ -899,6 +907,8 @@ def _get_fp8_params(self) -> Union[List[torch.Tensor], None]: # assume FP8 execution. def init_fp8_metadata(self, num_gemms: int = 1) -> None: """Initialize fp8 related metadata and tensors during fprop.""" + _original_recipe = self.fp8_meta.get("recipe", None) + self.fp8_parameters = FP8GlobalStateManager.with_fp8_parameters() self.fp8 = FP8GlobalStateManager.is_fp8_enabled() self.fp8_calibration = FP8GlobalStateManager.is_fp8_calibration() @@ -937,6 +947,19 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None: self.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe() + _current_recipe = self.fp8_meta["recipe"] + if _original_recipe is not None and not ( + issubclass(_current_recipe.__class__, _original_recipe.__class__) + or issubclass(_original_recipe.__class__, _current_recipe.__class__) + ): + warnings.warn( + f"Recipe type changed from {_original_recipe.__class__.__name__} " + f"to {_current_recipe.__class__.__name__}. " + "This may affect model behavior." 
+ ) + # Clear cached workspaces as they were created with the old recipe/quantizer type + self._fp8_workspaces.clear() + @contextmanager def prepare_forward( self, @@ -961,6 +984,7 @@ def prepare_forward( self.set_activation_dtype(inp) self.init_fp8_metadata(num_gemms=num_gemms) + self._check_weight_tensor_recipe_correspondence() if self.fp8 and self.sequence_parallel and self.fp8_meta["recipe"].delayed(): assert self.fp8_meta["recipe"].reduce_amax, ( @@ -1385,6 +1409,43 @@ def _validate_name(self): ) self.name = f"Layer_{TEDebugState.get_layer_count()}" + def _check_weight_tensor_recipe_correspondence(self) -> None: + """ + Verify that the weight tensor types match their corresponding recipe type. + This is invoked in the forward(). + + This establishes a 1:1 correspondence between recipe types and tensor types: + - DelayedScaling → Float8Tensor + - Float8CurrentScaling → Float8Tensor + - MXFP8BlockScaling → MXFP8Tensor + - Float8BlockScaling → Float8BlockTensor + + Example case to check: recipe is DelayedScaling (DelayedScaling is set in fp8_autocast()), + but the weight tensor is MXFP8Tensor (MXFP8BlockScaling is set in fp8_model_init()). + """ + if not self.fp8 and not self.fp8_calibration: + return + if not hasattr(self, "weight_names") or not self.weight_names: + return + + recipe = self.fp8_meta["recipe"] + weight_tensors = [getattr(self, name) for name in self.weight_names] + for i, tensor in enumerate(weight_tensors): + if isinstance(tensor, QuantizedTensorBase): + quantizer = tensor._get_quantizer() + if quantizer is None: + continue + compatible_recipe_class = quantizer._get_compatible_recipe() + if compatible_recipe_class is None: + continue + if not isinstance(recipe, compatible_recipe_class): + raise RuntimeError( + f"Recipe mismatch for '{self.weight_names[i]}': tensor supports recipe" + f" {compatible_recipe_class.__name__}, but got {recipe.__class__.__name__}." + " Please check the recipes assigned during fp8_model_init() and" + " fp8_autocast() calls." + ) + def _turn_off_unsupported_features_in_debug(self): if ( getattr(self, "ub_bulk_wgrad", False) diff --git a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py index ce4137c66..4ab04da83 100644 --- a/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py +++ b/transformer_engine/pytorch/tensor/float8_blockwise_tensor.py @@ -4,13 +4,14 @@ """Tensor class with FP8 data quantized with NxN tiles""" from __future__ import annotations -from typing import Optional, Tuple, Iterable +from typing import Optional, Tuple, Iterable, Union import math import torch import transformer_engine_torch as tex - from transformer_engine_torch import DType as TE_DType + +from transformer_engine.common.recipe import Float8BlockScaling, Recipe from ._internal.float8_blockwise_tensor_base import Float8BlockwiseQTensorBase from .quantized_tensor import QuantizedTensor, Quantizer, _IdentityFunc from ..utils import devices_match, round_up_to_nearest_multiple @@ -229,6 +230,9 @@ def calibrate(self, tensor: torch.Tensor) -> None: # where state from an estimator influences distribution parameters. pass + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + return Float8BlockScaling + class Float8BlockwiseQTensor(Float8BlockwiseQTensorBase, QuantizedTensor): """Tensor class with FP8 data quantized via NxN blocks or 1xN blocks. 
diff --git a/transformer_engine/pytorch/tensor/float8_tensor.py b/transformer_engine/pytorch/tensor/float8_tensor.py
index b55ac577c..fa8e29283 100644
--- a/transformer_engine/pytorch/tensor/float8_tensor.py
+++ b/transformer_engine/pytorch/tensor/float8_tensor.py
@@ -6,15 +6,16 @@
 """Tensor class with FP8 data"""
 
 from __future__ import annotations
 import os
-from typing import Optional, Tuple, Iterable
+from typing import Optional, Tuple, Iterable, Union
 import warnings
 from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
 import torch
 import transformer_engine_torch as tex
-
 from transformer_engine_torch import DType as TE_DType
+
+from transformer_engine.common.recipe import DelayedScaling, Float8CurrentScaling, Recipe
 from ..utils import canonicalize_process_group, devices_match
 from ._internal.float8_tensor_base import Float8TensorBase, _FromFloat8Func
 from .quantized_tensor import QuantizedTensor, Quantizer, _IdentityFunc
@@ -177,6 +182,9 @@ def create_tensor_from_data(
             quantizer=self,
         )
 
+    def _get_compatible_recipe(self) -> Union[type[Recipe], None]:
+        return DelayedScaling
+
 
 class Float8CurrentScalingQuantizer(Quantizer):
     """Builder class for FP8 tensors with per-tensor current scaling
@@ -339,6 +347,9 @@ def _canonicalized_amax_reduction_group(self) -> dist_group_type:
         """Get process group for amax reduction"""
         return canonicalize_process_group(self.amax_reduction_group)
 
+    def _get_compatible_recipe(self) -> Union[type[Recipe], None]:
+        return Float8CurrentScaling
+
 
 class Float8Tensor(Float8TensorBase, QuantizedTensor):
     """Experimental tensor class with FP8 data
diff --git a/transformer_engine/pytorch/tensor/mxfp8_tensor.py b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
index 920b7d6b0..8f3c73eb9 100644
--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -7,16 +7,17 @@
 from collections.abc import Iterable
 import math
 import os
-from typing import Optional, Tuple
 from torch.utils.cpp_extension import IS_HIP_EXTENSION
+from typing import Optional, Tuple, Union
 
 import torch
 if IS_HIP_EXTENSION:
     from ..triton_kernels.cast import te_quantize_triton
 
 import transformer_engine_torch as tex
-
 from transformer_engine_torch import DType as TE_DType
+
+from transformer_engine.common.recipe import MXFP8BlockScaling, Recipe
 from ..constants import MXFP8_BLOCK_SCALING_SIZE
 from ..utils import devices_match, round_up_to_nearest_multiple
@@ -145,6 +146,9 @@ def calibrate(self, tensor: torch.Tensor) -> None:
         # TODO(ksivamani): No calibration needed for mxfp8?
pass + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + return MXFP8BlockScaling + class MXFP8Tensor(MXFP8TensorBase, QuantizedTensor): """Experimental tensor class with FP8 data diff --git a/transformer_engine/pytorch/tensor/quantized_tensor.py b/transformer_engine/pytorch/tensor/quantized_tensor.py index e521d4279..9b0adcc22 100644 --- a/transformer_engine/pytorch/tensor/quantized_tensor.py +++ b/transformer_engine/pytorch/tensor/quantized_tensor.py @@ -17,6 +17,7 @@ from torch.utils._pytree import tree_map import transformer_engine_torch as tex +from transformer_engine.common.recipe import Recipe class QuantizedTensorBase: @@ -242,6 +243,10 @@ def copy(self) -> Quantizer: """Create shallow copy""" return copy.copy(self) + @abc.abstractmethod + def _get_compatible_recipe(self) -> Union[type[Recipe], None]: + """Returns recipe class that is compatible with this quantizer""" + class _QuantizeFunc(torch.autograd.Function): """Cast to FP8 from other dtype""" From 610c3937d0af085c296528652404d85b68416ef5 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 20 May 2025 10:47:47 -0600 Subject: [PATCH 04/26] Use an empty torch tensor to indicate no fp8 information in extra_state (#1799) * Use an empty torch tensor to indicate no fp8 information in extra_state Signed-off-by: Peter St. John * Add huggingface from_pretrained / save_pretrained tests Adds integration tests to ensure models containing TransformerLayer objects can be saved and loaded using the from_pretrained and save_pretrained methods. Signed-off-by: Peter St. John --------- Signed-off-by: Peter St. John Co-authored-by: Kirthi Shankar Sivamani --- qa/L0_pytorch_unittest/test.sh | 1 + setup.py | 2 +- tests/pytorch/test_hf_integration.py | 40 +++++++++++++++++++++++ transformer_engine/pytorch/module/base.py | 12 +++---- 4 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 tests/pytorch/test_hf_integration.py diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh index 79f3c8fb9..ea5236502 100644 --- a/qa/L0_pytorch_unittest/test.sh +++ b/qa/L0_pytorch_unittest/test.sh @@ -44,6 +44,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entro NVTE_FLASH_ATTN=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || test_fail "test_fused_attn.py" python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/fused_attn/test_kv_cache.py || test_fail "test_kv_cache.py" +python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_hf_integration.xml $TE_PATH/tests/pytorch/test_hf_integration.py || test_fail "test_hf_integration.py" if [ "$RET" -ne 0 ]; then echo "Error in the following test cases:$FAILED_CASES" diff --git a/setup.py b/setup.py index 41893644c..0012844a8 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ) # Blackwell is not supported as of Triton 3.2.0, need custom internal build # install_reqs.append("triton") - test_reqs.extend(["numpy", "torchvision"]) + test_reqs.extend(["numpy", "torchvision", "transformers"]) if "jax" in frameworks: if rocm_build(): from build_tools.jax import jax_install_requires diff --git a/tests/pytorch/test_hf_integration.py b/tests/pytorch/test_hf_integration.py new file mode 
100644 index 000000000..0b2468510 --- /dev/null +++ b/tests/pytorch/test_hf_integration.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import pytest +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel + +from transformer_engine.pytorch.transformer import TransformerLayer +from transformer_engine.pytorch.utils import is_bf16_compatible + + +class SimpleTEModel(PreTrainedModel): + config_class = PretrainedConfig + + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.my_layer = TransformerLayer( + hidden_size=320, + num_attention_heads=16, + ffn_hidden_size=1024, + layer_number=None, + ) + + def forward(self, hidden_states, attention_mask): + return self.my_layer(hidden_states, attention_mask) + + +def test_save_hf_model(tmp_path): + model = SimpleTEModel(PretrainedConfig()) + model.save_pretrained(tmp_path / "simple_te_model") + + +@pytest.mark.xfail(reason="This test is failing until huggingface/transformers#38155 is merged.") +def test_save_and_load_hf_model(tmp_path): + model = SimpleTEModel(PretrainedConfig()) + model.save_pretrained(tmp_path / "simple_te_model") + del model + model = SimpleTEModel.from_pretrained(tmp_path / "simple_te_model") + assert model is not None diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 1672bc6bd..e86ccd172 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -746,7 +746,7 @@ def reset(key): reset("scaling_fwd") reset("scaling_bwd") - def get_extra_state(self) -> Optional[torch.Tensor]: + def get_extra_state(self) -> torch.Tensor: """Save before checkpointing.""" # This implementation is working around a few issues: @@ -781,7 +781,7 @@ def to_cpu(src: torch.Tensor) -> torch.Tensor: state = None fp8_checkpoint = self.fp8_meta["fp8_checkpoint"] or self.fp8 or self.fp8_calibration if not fp8_checkpoint: - return None + return torch.empty(0, dtype=torch.uint8) # Copy tensors to CPU and store state = {} @@ -807,13 +807,13 @@ def to_cpu(src: torch.Tensor) -> torch.Tensor: state_serialized = torch.frombuffer(state_serialized, dtype=torch.uint8) return state_serialized - def set_extra_state(self, state: Optional[torch.Tensor]) -> None: + def set_extra_state(self, state: torch.Tensor) -> None: """Load previous state.""" - if state is None: - return - # Load state if isinstance(state, torch.Tensor): + # No FP8 is indicated by an empty tensor we don't need to unpickle. 
+ if state.numel() == 0: + return # Default format: byte tensor with pickled data state = pickle.loads(state.detach().cpu().numpy().tobytes()) elif isinstance(state, io.BytesIO): From c5ea9eb7d6500067683dcc4822da389d3ce9408a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Tue, 20 May 2025 22:42:29 +0200 Subject: [PATCH 05/26] =?UTF-8?q?[Pytorch]=20NVIDIA-DL-Framework-Inspect?= =?UTF-8?q?=20support=20=E2=80=93=20part=204=20=E2=80=93=20documentation?= =?UTF-8?q?=20(#1611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs drop Signed-off-by: Pawel Gadzinski * a Signed-off-by: Pawel Gadzinski * fix Signed-off-by: Pawel Gadzinski * Update docs/debug/1_getting_started.rst Co-authored-by: Przemyslaw Tredak Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> * Update docs/debug/1_getting_started.rst Co-authored-by: Przemyslaw Tredak Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> * fixes Signed-off-by: Pawel Gadzinski * fix imgs Signed-off-by: Pawel Gadzinski --------- Signed-off-by: Pawel Gadzinski Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> Co-authored-by: Przemyslaw Tredak --- docs/debug.rst | 14 + docs/debug/1_getting_started.rst | 241 ++++++++++++++++++ docs/debug/2_config_file_structure.rst | 241 ++++++++++++++++++ docs/debug/3_api_debug_setup.rst | 87 +++++++ docs/debug/3_api_features.rst | 14 + docs/debug/3_api_te_calls.rst | 45 ++++ docs/debug/4_distributed.rst | 91 +++++++ docs/debug/api.rst | 13 + docs/debug/img/api_calls1.svg | 1 + docs/debug/img/api_calls2.svg | 1 + docs/debug/img/fake_quant.svg | 1 + docs/debug/img/introduction.svg | 1 + docs/debug/img/names.svg | 1 + docs/debug/img/pipeline_logging.svg | 1 + docs/debug/img/reduction1.svg | 1 + docs/debug/img/reduction2.svg | 1 + docs/debug/img/reduction3.svg | 1 + docs/debug/img/scaling_factors.svg | 1 + docs/debug/img/tensorboard.png | Bin 0 -> 123093 bytes docs/index.rst | 1 + qa/L0_pytorch_lint/test.sh | 2 +- .../debug/features/per_tensor_scaling.py | 1 - 22 files changed, 758 insertions(+), 2 deletions(-) create mode 100644 docs/debug.rst create mode 100644 docs/debug/1_getting_started.rst create mode 100644 docs/debug/2_config_file_structure.rst create mode 100644 docs/debug/3_api_debug_setup.rst create mode 100644 docs/debug/3_api_features.rst create mode 100644 docs/debug/3_api_te_calls.rst create mode 100644 docs/debug/4_distributed.rst create mode 100644 docs/debug/api.rst create mode 100644 docs/debug/img/api_calls1.svg create mode 100644 docs/debug/img/api_calls2.svg create mode 100644 docs/debug/img/fake_quant.svg create mode 100644 docs/debug/img/introduction.svg create mode 100644 docs/debug/img/names.svg create mode 100644 docs/debug/img/pipeline_logging.svg create mode 100644 docs/debug/img/reduction1.svg create mode 100644 docs/debug/img/reduction2.svg create mode 100644 docs/debug/img/reduction3.svg create mode 100644 docs/debug/img/scaling_factors.svg create mode 100644 docs/debug/img/tensorboard.png diff --git a/docs/debug.rst b/docs/debug.rst new file mode 100644 index 000000000..d33568ea3 --- /dev/null +++ b/docs/debug.rst @@ -0,0 +1,14 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. +Precision debug tools +============================================== + +.. 
toctree:: + :caption: Precision debug tools + + debug/1_getting_started.rst + debug/2_config_file_structure.rst + debug/api + debug/4_distributed.rst \ No newline at end of file diff --git a/docs/debug/1_getting_started.rst b/docs/debug/1_getting_started.rst new file mode 100644 index 000000000..bc2b95057 --- /dev/null +++ b/docs/debug/1_getting_started.rst @@ -0,0 +1,241 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Getting started +============== + +.. note:: + + Precision debug tools with `Nvidia-DL-Framework-Inspect `_ for Transformer Engine are currently supported only for PyTorch. + +Transformer Engine provides a set of precision debug tools which allow you to easily: + +- log the statistics for each of the tensors in every matrix multiply (GEMM) operation, +- run selected GEMMs in higher precision, +- run current scaling - with one scaling factor per tensor - for particular GEMMs, +- test new precisions and integrate them with FP8 training, +- ... and many more. + +There are 4 things one needs to do to use Transformer Engine debug features: + +1. Create a configuration YAML file to configure the desired features. +2. Import, and initialize the `Nvidia-DL-Framework-Inspect `_ tool, which is installed as the dependency of the Transformer Engine. +3. One can pass ``name="..."`` when creating TE layers to easier identify layer names. If this is not provided, names will be inferred automatically. +4. Invoke ``debug_api.step()`` at the end of one forward-backward pass. + +To start debugging, one needs to create a configuration YAML file. This file lists the features to be used in particular layers. There are 2 kinds of features: + +- provided by the Transformer Engine - for example, DisableFP8GEMM or LogTensorStats - they are listed in the :doc:`debug features API <3_api_features>` section +- defined by the user. For details on how to create a custom feature - please read the :doc:`calls to Nvidia-DL-Framework-Inspect <3_api_te_calls>` section. + +.. figure:: ./img/introduction.svg + :align: center + + Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 3 TE Linear Layers. + ``config.yaml`` contains the specification of the features used for each Linear layer. Some feature classes are provided by TE, + one - ``UserProvidedPrecision`` - is a custom feature implemented by the user. Nvidia-DL-Framework-Inspect inserts features into the layers according to the config. + +Example training script +---------------------- + +Let's look at a simple example of training a Transformer layer using Transformer Engine with FP8 precision. This example demonstrates how to set up the layer, define an optimizer, and perform a few training iterations using synthetic data. + +.. 
code-block:: python + + # train.py + + from transformer_engine.pytorch import TransformerLayer + import torch + import torch.nn as nn + import torch.optim as optim + import transformer_engine.pytorch as te + + hidden_size = 512 + num_attention_heads = 8 + + transformer_layer = TransformerLayer( + hidden_size=hidden_size, + ffn_hidden_size=hidden_size, + num_attention_heads=num_attention_heads + ).cuda() + + dummy_input = torch.randn(10, 32, hidden_size).cuda() + criterion = nn.MSELoss() + optimizer = optim.Adam(transformer_layer.parameters(), lr=1e-4) + dummy_target = torch.randn(10, 32, hidden_size).cuda() + + for epoch in range(5): + transformer_layer.train() + optimizer.zero_grad() + with te.fp8_autocast(enabled=True): + output = transformer_layer(dummy_input) + loss = criterion(output, dummy_target) + loss.backward() + optimizer.step() + +We will demonstrate two debug features on the code above: + +1. Disabling FP8 precision for specific GEMM operations, such as the FC1 and FC2 forward propagation GEMM. +2. Logging statistics for other GEMM operations, such as gradient statistics for data gradient GEMM within the LayerNormLinear sub-layer of the TransformerLayer. + +Config file +---------- + +We need to prepare the configuration YAML file, as below + +.. code-block:: yaml + + # config.yaml + + fc1_fprop_to_fp8: + enabled: True + layers: + layer_types: [fc1, fc2] # contains fc1 or fc2 in name + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [fprop] + + log_tensor_stats: + enabled: True + layers: + layer_types: [layernorm_linear] # contains layernorm_linear in name + transformer_engine: + LogTensorStats: + enabled: True + stats: [max, min, mean, std, l1_norm] + tensors: [activation] + freq: 1 + start_step: 2 + end_step: 5 + +Further explanation on how to create config files is in the :doc:`next part of the documentation <2_config_file_structure>`. + +Adjusting Python file +-------------------- + +.. code-block:: python + + # (...) + + import nvdlfw_inspect.api as debug_api + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log", + default_logging_enabled=True) + + # initialization of the TransformerLayer with the name + transformer_layer = TransformerLayer( + name="transformer_layer", + # ...) + + # (...) + for epoch in range(5): + # forward and backward pass + # ... + debug_api.step() + +In the modified code above, the following changes were made: + +1. Added an import for ``nvdlfw_inspect.api``. +2. Initialized the Nvidia-DL-Framework-Inspect by calling ``debug_api.initialize()`` with appropriate configuration, specifying the path to the config file, feature directories, and log directory. +3. Added ``debug_api.step()`` after each of the forward-backward pass. + +Inspecting the logs +------------------ + +Let's look at the files with the logs. Two files will be created: + +1. debug logs. +2. statistics logs. + +Let's look inside them! + +In the main log file, you can find detailed information about the transformer layer's GEMMs behavior. You can see that ``fc1`` and ``fc2`` fprop GEMMs are run in high precision, as intended. + +.. code-block:: text + + # log/nvdlfw_inspect_logs/nvdlfw_inspect_globalrank-0.log + + INFO - Default logging to file enabled at ./log + INFO - Reading config from ./config.yaml. + INFO - Loaded configs for dict_keys(['fc1_fprop_to_fp8', 'log_tensor_stats']). 
+ INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: activation, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: weight, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: activation, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: weight, gemm fprop - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.self_attention.proj: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: activation, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: weight, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc1: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: activation, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: activation, gemm wgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: weight, gemm fprop - High precision + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: weight, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: gradient, gemm dgrad - FP8 quantization + INFO - transformer_layer.layernorm_mlp.fc2: Tensor: gradient, gemm wgrad - FP8 quantization + INFO - transformer_layer.self_attention.layernorm_qkv: Feature=LogTensorStats, API=look_at_tensor_before_process: activation + .... + +The second log file (``nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``) contains statistics for tensors we requested in ``config.yaml``. + +.. 
code-block:: text + + # log/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log + + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000002 value=4.3188 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000002 value=-4.3386 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000002 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000002 value=0.9998 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000002 value=130799.6953 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000003 value=4.3184 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000003 value=-4.3381 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000003 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000003 value=0.9997 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000003 value=130788.1016 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_max iteration=000004 value=4.3181 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_min iteration=000004 value=-4.3377 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_mean iteration=000004 value=0.0000 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_std iteration=000004 value=0.9996 + INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000004 value=130776.7969 + +Logging using TensorBoard +------------------------ + +Precision debug tools support logging using `TensorBoard `_. To enable it, one needs to pass the argument ``tb_writer`` to the ``debug_api.initialize()``. Let's modify ``train.py`` file. + +.. code-block:: python + + # (...) + + from torch.utils.tensorboard import SummaryWriter + tb_writer = SummaryWriter('./tensorboard_dir/run1') + + # add tb_writer to the Debug API initialization + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log", + tb_writer=tb_writer) + + # (...) + +Let's run training and open TensorBoard by ``tensorboard --logdir=./tensorboard_dir/run1``: + +.. figure:: ./img/tensorboard.png + :align: center + + Fig 2: TensorBoard with plotted stats. \ No newline at end of file diff --git a/docs/debug/2_config_file_structure.rst b/docs/debug/2_config_file_structure.rst new file mode 100644 index 000000000..f1069b0c8 --- /dev/null +++ b/docs/debug/2_config_file_structure.rst @@ -0,0 +1,241 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Config File Structure +==================== + +To enable debug features, create a configuration YAML file to specify the desired behavior, such as determining which GEMMs (General Matrix Multiply operations) should run in higher precision rather than FP8 and defining which statistics to log. +Below, we outline how to structure the configuration YAML file. + +General Format +------------- + +A config file can have one or more sections, each containing settings for specific layers and features: + +.. code-block:: yaml + + section_name_1: + enabled: ... + layers: + # Specify layers here... + transformer_engine: + Feature1Name: + enabled: ... + # Feature details... + Feature2Name: + enabled: ... + # Feature details... 
+ + section_name_2: + enabled: ... + layers: + # Specify layers here... + Feature1Name: # If feature has no namespace, then it is in the default namespace. + enabled: ... + # Feature details... + + section_name_3: + enabled: ... + layers: + # Specify layers here... + transformer_engine: + Feature1Name: + enabled: ... + # Feature details... + Feature2Name: + enabled: ... + # Feature details... + +Sections may have any name and must contain: + +1. An ``enabled`` field that specifies whether the features in that section will be active. +2. A ``layers`` field specifying which layers the section applies to. Each layer can belong to only one section. +3. Additional fields describing features for those layers. + +Layer Specification +------------------ + +Debug layers can be identified by a ``name`` parameter: + +.. code-block:: python + + linear = transformer_engine.debug.pytorch.Linear(in_features, out_features, name="linear1") + +This name is used in the config file to identify the layer. To specify the ``layers`` field, you can use one of the following methods: + +1. ``layer_name_regex_pattern``: Use a regular expression to match layer names. This expression must adhere to the Python ``re`` module syntax. +2. ``layer_types``: Provide a list of strings, where a layer will be selected if any string matches part of its name. + +Examples: + +.. code-block:: yaml + + # Example 1: Using regular expression to select layers + my_section: + enabled: ... + layers: + layer_name_regex_pattern: 'self_attn.*' + transformer_engine: + (...) + + # Example 2: Using layer type to select layers + another_section: + enabled: ... + layers: + layer_types: ['fc1', 'layernorm_linear'] + transformer_engine: + (...) + +Names in Transformer Layers +-------------------------- + +There are three ways to assign a name to a layer in the Transformer Engine: + +- Initialize the layer with the ``name=...`` argument. +- Use ``debug_api.infer_and_assign_layer_names(model)``, which assigns names based on class names. +- Rely on the default names assigned during module initialization, such as ``Layer_n``, where ``n`` represents the layer number. + +The ``TransformerLayer`` in Transformer Engine is a composition of multiple sub-layers. We can modify some of these layers using precision debug tools, particularly those that contain exactly one linear layer. To see the names of all such layers, we can inspect log files. For instance, a ``TransformerLayer`` named ``transformer_layer`` might consist of: + +- ``transformer_layer.self_attn.layernorm_linear_qkv`` / ``transformer_layer.self_attn.linear_qkv`` / ``transformer_layer.self_attn.layernorm_linear_q`` / ``transformer_layer.self_attn.linear_q`` / ``transformer_layer.self_attn.linear_kv``, +- ``transformer_layer.self_attn.proj``, +- ``transformer_layer.inter_attn.*`` for ``layer_type="decoder"``, +- ``transformer_layer.layernorm_mlp.fc1``, +- ``transformer_layer.layernorm_mlp.fc2``, + +depending on the configuration. Some layers, like ``LayerNormLinear``, are fusions of two layers: ``LayerNorm`` and ``Linear``. When referring to such layers in precision debug tools, only the ``Linear`` part is affected. + +Below is an example ``TransformerLayer`` with four linear layers that can be influenced by the precision debug tools. + +.. figure:: ./img/names.svg + :align: center + :width: 80% + + Fig 1: Names of layers in an example configuration of TransformerLayer. The most nested blocks represent the most basic layers, each containing one linear layer. 
Layers that do not contain linear layers, such as ``DotProductAttention``, are omitted. + +**Configuration File Example** + +.. code-block:: yaml + + # Disables wgrad in all 4 GEMMs + section1: + enabled: True + layers: + layer_types: [transformer_layer] + transformer_engine: + DisableFP8GEMM: + enabled: True + gemms: [wgrad] + + # Disables all GEMMs in layernorm_mlp layer + section2: + enabled: True + layers: + layer_types: [layernorm_mlp] + transformer_engine: + DisableFP8Layer: + enabled: True + + # Logs wgrad stats in fc1 + section3: + enabled: True + layers: + layer_types: [fc1] + transformer_engine: + LogTensorStats: + enabled: True + stats: [min] + tensors: [wgrad] + freq: 1 + start_step: 0 + end_step: 50 + + +Structured Configuration for GEMMs and Tensors +--------------------------------------------- + +Sometimes a feature is parameterized by a list of tensors or by a list of GEMMs. +There are multiple ways of describing this parameterization. + +We can pass lists, as below. + +.. code-block:: yaml + + Feature: + enabled: ... + gemms: [gemm1, gemm2] + tensors: [tensor1, tensor2] + ... + +We can use struct for tensors. + +.. code-block:: yaml + + Feature: + gemms: [gemm1, gemm2] + tensors_struct: + - tensor: tensor1 + feature_param1: value + - tensor: tensor2 + feature_param1: value + gemm_feature_param1: value + +Similarly, we can use struct for GEMMs. + +.. code-block:: yaml + + Feature: + enabled: ... + tensors: [tensor1, tensor2] + gemms_struct: + - gemm: gemm1 + feature_param1: value + - gemm: gemm2 + feature_param1: value + gemm_feature_param1: value + +We can use both structs for tensors and GEMMs. The tensors_struct should be nested inside gemms_struct. + +.. code-block:: yaml + + Feature: + enabled: ... + gemms_struct: + - gemm: gemm1 + tensors: [tensor1, tensor2] + tensor_feature_param1: value + gemm_feature_param1: value + - gemm: gemm2 + tensors_struct: + - tensor: tensor1 + tensor_feature_param1: value + - tensor: tensor2 + tensor_feature_param2: value + gemm_feature_param1: value + +Enabling or Disabling Sections and Features +------------------------------------------ + +Debug features can be enabled or disabled with the ``enabled`` keyword: + +.. code-block:: yaml + + section1: + enabled: True + layers: + layer_types: [self_attention] + transformer_engine: + LogTensorStats: + enabled: False # Disables the LogTensorStats feature + stats: [max, min, mean, std, l1_norm] + + section2: + enabled: False # Disables entire section2 + transformer_engine: + LogFp8TensorStats: + enabled: True # Does not enable the LogFp8TensorStats feature, because section2 is disabled + stats: [underflows, overflows] + +By organizing your ``config.yaml`` properly, you can easily manage debugging features, ensuring a more streamlined and customizable debugging experience. diff --git a/docs/debug/3_api_debug_setup.rst b/docs/debug/3_api_debug_setup.rst new file mode 100644 index 000000000..bda8f096d --- /dev/null +++ b/docs/debug/3_api_debug_setup.rst @@ -0,0 +1,87 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Setup +===== + +Precision debug tools for the Transformer Engine use `Nvidia-DL-Framework-Inspect `_ package from NVIDIA. +Please refer to the Nvidia-DL-Framework-Inspect `documentation `_ for more details. +Below, we outline the steps for debug initialization. + +initialize() +----------- + +Must be called once on every rank in the global context to initialize Nvidia-DL-Framework-Inspect. 
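+
+For instance, in a multi-GPU job each rank calls ``initialize()`` once with the same arguments. Below is a minimal sketch (assuming a ``torchrun`` launch; the config and feature-directory paths are placeholders):
+
+.. code-block:: python
+
+    # launched with: torchrun --nproc_per_node=8 train.py
+    import torch.distributed as dist
+    import nvdlfw_inspect.api as debug_api
+
+    # torchrun sets the rendezvous environment variables for init_process_group
+    dist.init_process_group(backend="nccl")
+
+    # every rank reads the same YAML config and registers the same feature directories
+    debug_api.initialize(
+        config_file="./config.yaml",  # placeholder path
+        feature_dirs=["/path/to/transformer_engine/debug/features"],  # placeholder path
+        log_dir="./log",
+        default_logging_enabled=True,
+    )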
+ +**Parameters** + +- **config_file** (*str*, default=""): Path to the configuration YAML file containing features to enable and layer names. If one wants to run without the configuration file, pass ``""``. +- **feature_dirs** (*List[str] | str*): List of directories containing features to load and register. One needs to pass ``[/path/to/transformerengine/transformer_engine/debug/features]`` to use TE features. +- **logger** (*Union[BaseLogger, None]*, default=None): Logger for logging tensor statistics. Should adhere to ``BaseLogger`` from the `Nvidia-DL-Framework-Inspect `_ package. +- **log_dir** (*str*, default= "."): Directory path to hold ``debug_logs`` and ``debug_statistics_logs``. +- **tb_writer** (*TensorBoardWriter*, default=None): TensorBoard writer for logging. +- **default_logging_enabled** (*bool*, default=False): Enable default logging to the file. + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + + debug_api.initialize( + config_file="./config.yaml", + feature_dirs=["/path/to/transformer_engine/debug/features"], + log_dir="./log_dir") + +set_tensor_reduction_group() +-------------------------- + +Needed only for logging tensor stats. In multi-GPU training, activation and gradient tensors are distributed across multiple nodes. This method lets you specify the group for the reduction of stats; see the `reduction group section <./4_distributed.rst#reduction-groups>`_ for more details. + +If the tensor reduction group is not specified, then statistics are reduced across all nodes in the run. + +**Parameters** + +- **group** (torch.distributed.ProcessGroup): The process group across which tensors will be reduced to get stats. + + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + + # initialization + # (...) + + pipeline_parallel_group = initialize_pipeline_parallel_group() + + debug_api.set_tensor_reduction_group(pipeline_parallel_group) + + # training + # (...) + # activation/gradient tensor statistics are reduced along pipeline_parallel_group + +set_weight_tensor_tp_group_reduce() +--------------------------------- + +By default, weight tensor statistics are reduced within the tensor parallel group. This function allows you to disable that behavior; for more details, see `reduction group section <./4_distributed.rst#reduction-groups>`_. + +This method is not provided by the ``debug_api``, but by the ``transformer_engine.debug``. + +**Parameters** + +- **enabled** (*bool*, default=True): A boolean flag to enable or disable the reduction of weight tensor statistics within the tensor parallel group. + + +.. code-block:: python + + import nvdlfw_inspect.api as debug_api + from transformer_engine.debug import set_weight_tensor_tp_group_reduce + + # initialization + # (...) + + set_weight_tensor_tp_group_reduce(False) + + # training + # (...) + # weight tensor statistics are not reduced diff --git a/docs/debug/3_api_features.rst b/docs/debug/3_api_features.rst new file mode 100644 index 000000000..b31c437b2 --- /dev/null +++ b/docs/debug/3_api_features.rst @@ -0,0 +1,14 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Debug features +========== + +.. autoapiclass:: transformer_engine.debug.features.log_tensor_stats.LogTensorStats +.. autoapiclass:: transformer_engine.debug.features.log_fp8_tensor_stats.LogFp8TensorStats +.. autoapiclass:: transformer_engine.debug.features.disable_fp8_gemm.DisableFP8GEMM +.. 
autoapiclass:: transformer_engine.debug.features.disable_fp8_layer.DisableFP8Layer +.. autoapiclass:: transformer_engine.debug.features.per_tensor_scaling.PerTensorScaling +.. autoapiclass:: transformer_engine.debug.features.fake_quant.FakeQuant diff --git a/docs/debug/3_api_te_calls.rst b/docs/debug/3_api_te_calls.rst new file mode 100644 index 000000000..eb66c8ff2 --- /dev/null +++ b/docs/debug/3_api_te_calls.rst @@ -0,0 +1,45 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Calls to Nvidia-DL-Framework-Inspect +==================================== +Let's look deeper into how Nvidia-DL-Framework-Inspect with Transformer Engine work together. TransformerEngine layers have some hook calls inside each of the GEMMs. Users can define feature classes or use feature classes provided with TE. File ``config.yaml`` describes which hooks need to be used for which layers. Nvidia-DL-Framework-Inspect combines 3 things: TE training, feature classes and ``config.yaml`` and takes care of inserting hooks in the correct places. This process is illustrated in the image below. + +.. figure:: ./img/api_calls1.svg + :align: center + + Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 1 Linear Layer. For tensors mentioned in ``config.yaml``, behavior of ``modify_tensor_enabled()`` and ``modify_tensor()`` calls are substituted with definitions from the feature class. Other calls return default values - in fact they do nothing. + +In this page, all calls from TransformerEngine to the Nvidia-DL-Framework-Inspect for each GEMM are listed. The order of these calls is illustrated in the image below. + +.. figure:: ./img/api_calls2.svg + :align: center + + Fig 2: The calls to Nvidia-DL-Framework-Inspect done for Transformer Engine. There are 2 types of calls: GEMM calls and routing calls. + + +There are 2 categories of API calls, each is used for different purposes: + +- GEMM calls - invoked during every GEMM, used to process or quantize tensors and collect information about them, +- Routing calls - invoked at the beginning of every forward pass - they indicate whether a feature is going to use `modify_tensor()`, etc. + +If all routing calls for the layer return `False`, then the layer is invoked in an optimized version with Transformer Engine fusions. +If any of the routing calls return `True`, layers are run without the fusions. This is necessary because otherwise some tensors cannot be accessed +if fusions happen. An important remark is that if no feature is used for the layer, then it should perform as fast as the layer without initializing `debug_api`. + + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor_enabled + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.fp8_gemm_enabled + +.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_enabled + +.. 
autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize_enabled diff --git a/docs/debug/4_distributed.rst b/docs/debug/4_distributed.rst new file mode 100644 index 000000000..6f69f2712 --- /dev/null +++ b/docs/debug/4_distributed.rst @@ -0,0 +1,91 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +Distributed training +=================== + +Nvidia-Pytorch-Inspect with Transformer Engine supports multi-GPU training. This guide describes how to run it and how the supported features work in the distributed setting. + +To use precision debug tools in multi-GPU training, one needs to: + +1. Run ``debug_api.initialize(...)`` and provide the same configuration YAML file on every node. +2. If one wants to log stats, one may want to invoke ``debug_api.set_tensor_reduction_group`` with a proper reduction group. + +Behavior of the features +----------------------- + +In a distributed setting, **DisableFP8GEMM** and **DisableFP8Layer** function similarly to the single-GPU case, with no notable differences. + +**PerTensorScaling** and **FakeQuant** calculate FP8 scaling factors independently on each node, meaning the number of GPUs may affect results. This differs from the delayed scaling FP8 recipe behavior, in which scaling factors are synchronized. + +.. figure:: ./img/scaling_factors.svg + :align: center + + Fig 1: For **PerTensorScaling** and **FakeQuant** tensor scaling factors are computed separately for each of the tensor shards. This is not the case for delayed scaling FP8 scaling factors, which are synchronized. + +Logging-related features are more complex and will be discussed further in the next sections. + +Reduction groups +-------------- + +In setups with tensor, data, or pipeline parallelism, some tensors are distributed across multiple GPUs, requiring a reduction operation to compute statistics for these tensors. + +The weight tensor is always split among the tensor parallel group, and debug tools automatically reduce statistics within this group by default. To disable this automatic reduction, use: + +.. code-block:: python + + transformer_engine.debug.set_weight_tensor_tp_group_reduce(False) + +In cases of data parallelism, Transformer Engine modules lack the process group needed for reduction. To manually specify the group, use: + +.. code-block:: python + + debug_api.set_tensor_reduction_group(group) + +This command ensures statistics are reduced across the defined group. Activation statistics are logged after the forward pass (immediately after exiting autocast), while gradient (dgrad and wgrad) statistics are logged following the backward pass. + +Below, we illustrate configurations for a 4-node setup with tensor parallelism size 2 and data parallelism size 2, showcasing different reduction configurations. + +.. figure:: ./img/reduction1.svg + :align: center + + Fig 2: There is a single tensor reduction group composed of all nodes. As a result, each node logs the same statistics for the tensors, as they are fully reduced across all nodes. + +.. figure:: ./img/reduction2.svg + :align: center + + Fig 3: Every node is set with a tensor reduction group consisting of itself. Every node prints the same statistics for weights (which are still synchronized within TP groups), but the statistics of activations and gradients are not synchronized. + +.. 
figure:: ./img/reduction3.svg + :align: center + + Fig 4: Weight synchronization is disabled by ``set_weight_tensor_tp_group_reduce(False)``, so every node logs stats for its shard of the weight. + + +Microbatching +----------- + +Let's dive into how statistics collection works with microbatching. By microbatching, we mean invoking multiple ``forward()`` calls for each ``debug_api.step()``. The behavior is as follows: + +- For weight tensors, the stats remain the same for each microbatch because the weight does not change. +- For other tensors, the stats are accumulated. + +Logging to files and TensorBoard +------------------------------ + +In a single-node setup with ``default_logging_enabled=True``, all logs are saved by default to ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``. In multi-GPU training, each node writes its reduced statistics to its unique file, named ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-i.log`` for rank i. Because these logs contain reduced statistics, the logged values are identical for all nodes within a reduction group. + +If certain nodes are given a TensorBoard writer, only those nodes will log to TensorBoard. This is useful in scenarios involving pipeline, data, and tensor parallelism, such as with two transformer layers and settings TP_SIZE = 2, DP_SIZE = 2, and PP_SIZE = 2. To log all stats to TensorBoard, you should pass a TensorBoard writer to one process in each pipeline parallel group. + +.. figure:: ./img/pipeline_logging.svg + :align: center + + Fig 5: Example with pipeline parallelism, where a ``tb_writer`` is assigned to one node within each pipeline parallel group, setting these as tensor reduction groups. + +Alternatively, setting the tensor reduction group to None will yield unreduced statistics for wgrad and dgrad tensors on each node, allowing for post-processing. For weight statistics without reduction in the TP parallel group, use: + +.. code-block:: python + + transformer_engine.debug.set_weight_tensor_tp_group_reduce(False) \ No newline at end of file diff --git a/docs/debug/api.rst b/docs/debug/api.rst new file mode 100644 index 000000000..ac593d353 --- /dev/null +++ b/docs/debug/api.rst @@ -0,0 +1,13 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. +API +============ + +.. 
toctree:: + :caption: Precision debug tools API + + 3_api_debug_setup.rst + 3_api_features.rst + 3_api_te_calls.rst \ No newline at end of file diff --git a/docs/debug/img/api_calls1.svg b/docs/debug/img/api_calls1.svg new file mode 100644 index 000000000..098f384b2 --- /dev/null +++ b/docs/debug/img/api_calls1.svg @@ -0,0 +1 @@ +te.LinearLinear1Nvidia-DLFramework-Inspectconfig.yamlSection1:enabled: Truelayer_names: [Linear1]UserProvidedPrecision:enabled: Truegemms_struct:-gemm: frop-tensors: [activation, output]-gemm: dgrad-tensors: [weight]FeatureclassesUserProvidedPrecisionFPROPWGRADDGRADmodify_tensor_enabledDefaultmodify_tensormodify_tensor_enabledmodify_tensor \ No newline at end of file diff --git a/docs/debug/img/api_calls2.svg b/docs/debug/img/api_calls2.svg new file mode 100644 index 000000000..5df72fc2e --- /dev/null +++ b/docs/debug/img/api_calls2.svg @@ -0,0 +1 @@ +Tensor Ainspect_tensorfp8 castmodify_tensorinspect_tensor_postquantizeGEMMinspect_tensormodify_tensorinspect_tensor_enabledinspect_tensor_postquantize_enabledfp8_gemm_enabledmodify_tensor_enabledTensor Binspect_tensorfp8 castmodify_tensorinspect_tensor_postquantizeRouting callsGEMM calls \ No newline at end of file diff --git a/docs/debug/img/fake_quant.svg b/docs/debug/img/fake_quant.svg new file mode 100644 index 000000000..3ba6973d5 --- /dev/null +++ b/docs/debug/img/fake_quant.svg @@ -0,0 +1 @@ +FP8 GEMMBF16weightBF16inputFP8inputFP8weightBF16activationBF16 GEMMBF16weightBF16inputBF16activationBF16 Inputfake quantizedto FP8FP8inputBF16 Inputfake quantizedto FP8 \ No newline at end of file diff --git a/docs/debug/img/introduction.svg b/docs/debug/img/introduction.svg new file mode 100644 index 000000000..0eae8e820 --- /dev/null +++ b/docs/debug/img/introduction.svg @@ -0,0 +1 @@ +te.LinearLinear1Nvidia-DLFramework-InspectDisableFp8LayerLogTensorStatsconfig.yamlte.LinearLinear2DisableFp8LayerLogTensorStatsSection1:enabled: Truelayer_names: [Linear1, Linear2]DisableFp8Layer:enabled: TrueSection2:enabled: Truelayer_names: [Linear2]LogTensorStats:enabled: TrueotherparamsSection3:enabled: Truelayer_names: [Linear3]UserProvidedPrecision:enabled: Truete.LinearLinear3FeatureclassesDisableFp8LayerUserProvidedPrecisionUserProvidedPrecisionProvidedby the Transformer EngineUser candefinecustomfeatureclasses \ No newline at end of file diff --git a/docs/debug/img/names.svg b/docs/debug/img/names.svg new file mode 100644 index 000000000..3990939e7 --- /dev/null +++ b/docs/debug/img/names.svg @@ -0,0 +1 @@ +Transformer Layer with name transformer_layertransformer_layer.self_attntransformer_layer.self_attn.projtransformer_layer.self_attn.layernorm_linear_qkvtransformer_layer.layernorm_mlptransformer_layer.layernorm_mlp.fc1transformer_layer.layernorm_mlp.fc21 Linear1 Linear1 Linear1 Linear \ No newline at end of file diff --git a/docs/debug/img/pipeline_logging.svg b/docs/debug/img/pipeline_logging.svg new file mode 100644 index 000000000..b87254315 --- /dev/null +++ b/docs/debug/img/pipeline_logging.svg @@ -0,0 +1 @@ +Node 1Node 2Node 3Node 4Node 5Node 6Node 7Node 8TensorBoard logstb_writertb_writertensor reduction group 1=pipeline parallel group 1tensor reduction group 2=pipeline parallel group 2 \ No newline at end of file diff --git a/docs/debug/img/reduction1.svg b/docs/debug/img/reduction1.svg new file mode 100644 index 000000000..184799d53 --- /dev/null +++ b/docs/debug/img/reduction1.svg @@ -0,0 +1 @@ +Node 1Node 2Node 3Node 4TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 
2StatsStatsStatsStats \ No newline at end of file diff --git a/docs/debug/img/reduction2.svg b/docs/debug/img/reduction2.svg new file mode 100644 index 000000000..36f94611e --- /dev/null +++ b/docs/debug/img/reduction2.svg @@ -0,0 +1 @@ +TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 2StatsStatsStatsStatsTensor reduction groupTensor reduction groupTensor reduction groupNode 1Node 2Node 3Node 4 \ No newline at end of file diff --git a/docs/debug/img/reduction3.svg b/docs/debug/img/reduction3.svg new file mode 100644 index 000000000..601fb8502 --- /dev/null +++ b/docs/debug/img/reduction3.svg @@ -0,0 +1 @@ +TP group 1activation/gradient tensorsweight tensorsTensor reduction groupTP group 2StatsStatsStatsStatsNode 1Node 2Node 3Node 4 \ No newline at end of file diff --git a/docs/debug/img/scaling_factors.svg b/docs/debug/img/scaling_factors.svg new file mode 100644 index 000000000..b70b51e66 --- /dev/null +++ b/docs/debug/img/scaling_factors.svg @@ -0,0 +1 @@ +One Scaling FactorScaling Factor No. 1Scaling Factor No. 2Node 1Node 2NodeOne Scaling FactorOne Scaling FactorNode 1Node 2PerTensorScalingandFakeQuantFP8 Delayed Scaling \ No newline at end of file diff --git a/docs/debug/img/tensorboard.png b/docs/debug/img/tensorboard.png new file mode 100644 index 0000000000000000000000000000000000000000..481dbd2eb9844f82e35d9a0446ebcf8df40ca6b7 GIT binary patch literal 123093 zcmeFZXIN9))-VbvqA0?arl5jukq(L!X#qt=K#GJUbWjLAARQs0Du@WEG^wGtkVLw) z03sm0_YwhV2^~U72;A&b-m~v}-DlrF_s4ynkB5~k)|_LOF-IR`5w54B&ce*YOh-q@ zqVf2FJ{=tsoR03qlhdbYPbw=XS?K8K4eiy`^fc7e`1RaeZ0wz&baX-=Jl|W0hCI92 z60j}kcq23R^2+1BHy=X*YQuiF6cM0>}?y8L(`j8p|MFuVSItiw?br&HH- zIR9ycda>dIWqIw$(^J_SB4;4alA|xvpsC>ay7ked`}WzjY9`aNzBe&mO`SpkLec_U zI9uqvi|f+Mfh=z{Om{)YSf}-bTEg<$B;~*&;9=GC&3D4v+SNS>tx{jwEL zOSQ);$^BNlzht|?SHC{MXUW-oldI##Z!b6hzLKN6KInafsQ*|& zKG`(FkRSkaUlBa2?215-sY_zrML}>S?>FC4k=}yw_=xG-Z?V_E?$51*Z z+V=(8^&p+$->ppW^b`NSKk?*`j`x6S8XB~3pp_dG>g;ak;&EQ1U4}N)h`oWKhoSZp z1uGYr#4~G`=THeB*o!|v=#+dEXtyw^$1{E(n3J=+f{*g`fAmnG-T(1e@;d)Nx_CG$ zUpLg&<5zQWgYwHt+>*F;9l*@b&#&ZWZKI(7;L*RpX@4nSxAX9Lp&%*g?d>h$eMiE@ z%~tZZyu7^REh$MUDREj4arc+b9?yKlo!x)`=OF(c=K<8+%FX_ThrNq4{~zN%d+y@t zp?v-N9~1r0&p+1*^|AlYOwR8A+7@ktl7F;F-j=u}`9EXRK$ZS@s-S1@19dWbU=O3A zhc*Y`uC%n$Kl=YaTK+TRpCAqY19@Biw(Orl|J3xqK|$_NH#HX+ZB7rse~#u~z<+N1 z7od{lA6x$uD*lP*e>|lj8o;b1`9FsSU>;R=T%@B@rPFwD&%lR%Ws1?~^3X`@W=&g* zrfb4|S8?;VisEbL*@iRGHHyvi&1h9uZ%K*fa@FG5=JHtM!@ByI$<-Ab@shrEM4EI> z>F?fvJBrJ9Y`d$9o-qI)8RO#sd>qn4;px)rT?~}0K{;nte$tQt}WfoLQN-Do>d(rteN2W-fvIon_mA zxwuw@{Uy}D8_GV=&>$OZE%Vnq`4?7I|IeWRf52%r^SAa~cXGxxJs}#sz5WQT`|l^K zY^)kI{T87<%6bof6(Pdmu{g(QrYNfZ-`O*riTRbK%VsbYw$8bq>eghA!MW+P4%!__ zBw$Lhq7$>(*^HumiW8tZxLSXiR{+p-cDBRm=d2d1M>B$4HG0n7n42JYxZu!=SaE1k znsWfW9IhHCcoBK%@m^UrYJaold;Bt^Cb~7 z7;plPtE-wL(~8A2PLd8dpE2!AfEyEHspL{ajVhw&_mE3u!*$jpQ4C=@LL9Bw^yJB- zzo7;gBV2xVOI+7W0JXYc`t)gkwUxU#L6cOvkcJ_UCg7n2d5lxY3A5#frn9^ok-=p}W}Z zbIv>aOGgSApCHbupIVhKJOljkL*5^bDjzkd{FdqdJ1Mg_2~^QmlA3N&>|?a%wC__; z|7DotuU`^O66PK(XQRON^LGqX5E38K^9kjf{wK`Z+9xRK>ulT>ZU~ENpCj1` zcrIsLQ5jm)+>Vp-<$19TFGJdSz^|ECe0+8(-f*-lfmagq&5^`oR<0_2Lx|sTAwb1J#yF6>p=LaYT z2WBe`-g(s3Clvka=_73|M|FjK4svz9|BLwv7I)H69A>MpbY#Et(ea^7tWM(jK>ALr}}BA|D@}b78NKSTLV#rZSe> z;h<(k*`4I4t4QNBtaP!M@&xRBz1#Dnjm$Nyt5^38#2TMr0foX;!$0CriNa6w$<|0X zZ|2BtP-HqKBo21&K06uR0{JUPRB;AY4 z%U8;iuy~I+R^&`{tDJT<7)O-{O9r&R$q0{l@u^_%b98EF?&POBvMM~>)3o_2grdU5 zT3x`Jc7dxBPn*z#GbAsrB!8RLXQ{T}%C>3$ZkdVzwQ?_jf5e@qCSy16HfAehz-%>| zPY@j~_iJIOcQ>eCX7>fDd@f#Cr#m(w|66+J@SFaNt z>wTMs{Ez*8I@Z0o@{t?Ks7qPMMN?8(bk$oki8eo>!3u)I0hgfBXGr*4;F?1JknAam 
zCbDCT5|OeP0Dym83$IlF4B6}GkX4rT$e8iYwRG{o#J((xN{UA6ycKw0>$2&{cVpyJ zokROdN(N(W06%_$+Ub?wcOO;j=Qh51kH z-Gn}mr|H3ngW_;U6~8`3cn#Wyw`AHM zweygITAy1~0%U$T>elUd0v+3fLBO^XD%t|4lC1L}Iu4%%x3G+nOjn)-{oo0BIP-0EcC?Ck$oPO* zj8R7>$UKgQH`P_P36Q17LH-HOKJVk~^ACPrk|W{fikMZG2{n{s)vooM7R9MBP^;>_ zm(*5k!U@3%?*i+%E2dbgi+ha3^ta$|8Ggkj=3Y^X0pfk^Dl_*}_9r1cdJW1D(dBhy z#TM}m_Dgxh!38djk+a8BBevD2hWsfQYETr#DkBe>JL#LQEt%wZX2fv`4Y;M!4N~^2 zQjyI8e6B+gJ$sd%`vs0vLXR5hd48?SIn2|oR}m?wBqi}56E5sEmB!MoUol%p?9{sY zg5mJl*W?;0ix%)+VrtKDM8$qdvMhYJiOIiq0Fq?AA+nt1QFlUYpFnUpRBn<_-ENa# zo1f---(L}%o&}a3nZz3YI`*FtFLXrm4`To!Dqp2SY=}bsukQO=%p#3C6p4cUe zz3L=ngplh<)8<}@lrT{`c00m(KlIRVG&f0c36+<^SnLfs)?bDqxhX67Oe1o3s@%dz zh9?Ms&=~JAQF!W_kZHvIf~Rz({jMD5abg|w?y&!Mg(Lz|p)C$noodZ$KePp|ZQ!-= zW$$L`c?KZF!b{X_Z5LvI0;~Zd=!Feg+onefILfR|L7CVEt~bqc-6)Hw?_k>?X)XWY?vlcaTTtf8EIo=wI{hc{ zlo%wz$l(N}y4xQ-QbQQ}E&Rx?zP*w(mhWHm{vgauMQKfWr!|LO70)2aa9 zZ^B;+7LMtR&d`rsyzKO8Jm(nn9#D3j?~^z4K6x{PD5xvScNL&ZHM5ZKW4grs;^Cd1 zMZ(cR@r)JUt@0w^u~AWxH{m*quKPh}w<+50D&z7ThD=o#t1;Lnru3`gn*no`3wT?0 zR6Z$jouHM=MXj~YfvprsA^2j;lM2Rww)SMB$#?!xV{ga2)`XqoNkLHAF8J496b3b;yF7`he<^u&tQ6zi_} zUUNO}Zsy6Y$^7ZJa67;PXU(W$npK}?GHELhQQG^c0Xh_0Dx1lWSXi%vCg;lAy4$Pj$-x(Qjqdc%1WN_}R_E?~e-cRrm|_rgKefnr^@$ zaC2LdG@r?5Pnm;sPj;G9Q^sjfziP$$;kKXJC0jI6cT^%q@F#cy;wvO6R175yJ);|W8COrCkkIZ zqK)(V%=(^uPRXJ|H8CcK$WlHQTg)3=7pH6f<7Qg%TgkcVXfqDZ$NHZii%+{)L2xAd zBTya6cZNsqphO(8v3v`nvMFidKf-!)HAK5W`<{K=DUAP|&ap=~P=k+n-(=DoD%ykE zI?L&Yx4dW1h;fLmD_GE-eT#6>v>w-~YGX>r+7-#H9q-2;G?qqi8#da2HVRBB>&N*^ zTJp<>EQgGiZzLHHQ=ymQb50hP>p2{IYzl8%oQ6S+27u1vBVru6JKG8t=-v)!Z7x^O2+0$){nck&3^zjS>RdB*muJC z*G_!yf{8Sy1GEu8pCB_RSpm16hfShBhcOcn_Mb$O<*fZ%5%I+$7xD`{LUvJvq2v|i z<6j4lR9nvsQV&7|=JPK+uK@pyX^0grV9shk6-D{sQf7b7K;Uh`+YFJ%!_>n9f!78H z)*pQN)b)n5rC^Tr#|{3mota?DoZvw2!MUF|yO>G~Rkr4>5Hf?aj1Wz@>8uqG^LIGQ_3kb1)}!#Ess8f4iYI1q{UA%juHeT<{Bi4ERdLgx4G{PDUIP5l>Q7N`yUxcu z5$d>iBB&yy9Z=XlE)P`aB?H5^%k6*cVZLv&F|0cxTP7Z7Dna5-7H-<}C|AD=DC$nU4^y+#Kb_86T-b$5bgZs0Rn)HqK4vyzR?`dNczXheYdB=v1NW; zK59!&$37O&Hivhi1n>1duqW3cdQVq%D9`lq{Sg@(wPHS^jI^IVF#_^+;+qR~25j$= z+BC12>XMW02tYKdyqpb56-GM^SRH0yw4R{1bgdt@o*a&OM%H~Dd1J_ty&%=;)7`b6xryfQ;8hmR4n$L1SDMtR``pK+j_-r(`)pXf_g$5r-|1e~ z1aURlc8tJx%DOU-M}@U4tR(jU_OWrWzQEN2E7*gPR>xllTJ2Qm1#E;)V692YaHk)K z?|8vE4?!Oc7BW88#XbO&CgZ}}gPXS{tU8AGFLd?g-VSlhO>S1N6`}>9 zuSD*)>JHdGM6G@NR0u7Px-Tu-wgySzd9O1Sf3-yj=xL6piDL8HEg87|hj+SQ zMBpV0NB`=U8zBC>WrIhYl+BPKk85Wd&Fe6lXrem;nHnSOfd6Z9yY8@fA8J&L>H z%?lg}h{r51=RY{GRS7BiIv~ea6hL&Y;creb#^>!~9a{D27YY-?C?QJX4JIi57t&TE zD$Sg*9DlJNy#@+(lU@2ijpd!kOI=ztEV{v4Ap`}(}q{W5zS0y^hQiY0QV z5oK{tqwGy*l}u<(-hsRoSivM>a9?mzQ%>Pq8cvJbRJYPN&S69*!_8W)DKwh{6gtP%5tmW64>Cvo&klZQtzN}S`3~KGts=nsFnJAO zgR{MydS~0QW3KPi$Q?VGDIGL{k7YL``

zDpO7}sJMV1AV^}vb6Fu(Re(IoTDw%bIJ||SrzdsI`f3P6VTWQ$% z9J*H+S}&Cl*M(viLTFtD@_VL@zH^_*e*Y|mEWb`o7+~@w8N^0~w=1;z_7DVHQc7AQ zeh;{j1y)GdK?YBvXZD5en$EX^-W;5p{Rm>B9CMs|(JV4Ca|f-4dbvJRWVH_ltabQ7 zv|R;=Ym0AvDcje~3Zg%kwBQpdi2?Qc;4v>3D_nbRj~(W~I-6?8w*9uw#baL4;%CPb zrFDmZqsa>BrOgj$f<{FqpksfIR3nK!s#5_!+~vCyhme>ca3tT!&0B;6{C;vT0Q2Uo?eG2J0KpfiC%y{LQEG zDk&|1ZoQJN3h-AWWejrg{poDYK%G4i-sI4;gYV-!0pJ<^3w~dfa+)MONW@ba#KY8AjrLk0IMQ)p0 zMf7!U3urHDKTr))$2-7{=%60038wgV24YIMj6YXmjgh3*h)M%!#S?Me601^nO|NJ( zJ9&U`OzkQubDRzsZSmI|w$m~(*+mH;^m@I>?k!);!=$5Ns}TJ?VghEE$rqGc``tE| z_}p-vucdbnd~d2(I=?P0P1qbRp3JeA^Tm@M4a%+81PJ*}N#-%BXoAP9Gge+2-W_m3 zY8M@sjmZk`Gc!aI`xIoNK1-Emp}Q@xvGS&*NzNk`&w!eK?WmHs3`u#<_fV5N+;C{e zGNP`2*no0@-GSL*;MXa*^f|`Zlh*g}I8n0LxXeez>>Dw06{Ir?F{XXzN^ADFk|*8= zR;#1@URT@Z=W3=oG$SA-_!&Nd1fd#(y?Z7D4hX2`l!e%)6;o@%c>1%uium=x+zbKG zy!2uA`IrQS0Nt0^bSy(1nq~JCmTd==ULl~ZrVAo5C)BQR>rOS1VtiD zxe8>H)=Lh$^S%j9;aTtXeNMpveZMpYG{1MX{8PQ1L(24^RIB)y&1SGDg>GWy z=u9Xd+5p>5s%Fx0qj@EJjkP3r=sX7FFxZ8=yvBiKFnq?n{XiZ}YPR*^>KbHwhoa7e z#Cu#Te%E4Bo*Nj~!)0*kRC#6J}vN>B(!IO z+OTv&ZRmB8YsprvaywDA3_*w)+wZuJgH zznW@3K-n?fgG=Q)&H=79nE?=&nQo^2xpy>2nS>umNRc|26LntZ^SbELXn_tOWHrI* zNScu1MH%pdQQ|=kYJIJqy`DmBjv0Rah0l8=;MwG$$$2{=&~qT-$)xRcJj|;$U<-C- z{U??l*PM%ND)HyFEw-SgV4Q;d5$mGV-9q!K+ndELgw(ZH39w6vIkMCsEX^^vEc<3&%@_XiTSijij9+^|HcoU9biXG#=@grFaiwa&Mm)X0qv!RcS0C?fiUV zHpi|YYjx-7DcHF!Fx9NhEqPo&-Wao0&LjE7H9&mei)$*)=1o2QVCa=pJt5BpJUf=~ zvSI{UXS$rEwo*<7#!ls4d%qg;)%SWEQ~Q0<&9RJdOOH>ZO#wt7V)$9cX2*a-A=IPd z9+9EVh@)3hU;7vxseJIbW6*9zu)kL)BK-@w+~i|R=&+Q4fA8iJd1aQz%;={li2}GY zjO1(A>>8FLmC94wD~^WYvZORC)1hsMk!r>B@8zgkkram*7H1)niXOzMU$)9zjLY%-1{R<$@MdFEk6%#vvL7!ld)A?%8vScc9~1i*nTKH5B*b1x@X<@3Oqn9 z9!ry%HfbfYAPnSEsR%DR!iN^9bgs@qmPF*VsG+I$?|P(Cdh$N@!0wH;jjXQS5yO|S z?kMa$vv~eeo;Pcx1jbv5;n$nhIInCp7xHLW%#hd7XLUq*xgQM&FF-CP8{R}&Fad2LXyhF8wf&iM% z+<CdA7;`+DW27AOYeM0VP+cAe3=Cg*0l(EzNeh^NnXTfgRDF8= zB5^%I^mnXXOBMX%C<3#FM4Hk9l3z0Axk?NlI9Tdz7e2SACA8%xiKN?1kyw*#R~5u{ zL+aqMX==2L-RXM?`U3ii-!zX3eGhz*J5urmivF2d82Y^C4iQj;~`>x0ql38G%$xz=}SZi~MS%$mMlj_bKG-5q*IyT}~+0SLc< ze9bMq{iKd!2cfQ=m>H0zqTe9|3fdkgj)vdpYC*rULi(Nq@t9jFiH3W&cQNhz;J?eR zLOh@>%b9(`sXxml^qC*f0*D`3Jc=%SsTLZB?`ERlKdgBx%jC^K;OT6wN{{&oq*Vxp zcpZ~_TN!2kMb`DHff+@E+hw-B^2!1ckHz2Wtq7QXRNjTM;7L2ZyHE!^e!ZdIZ!^+t zERf?!eF)OsdQi%`8h#Dms&C;uT;hayVM4JWo|uV?Mmjc{qqBEgqo-mg&hJtcn}Yjk zVSZhx0RRym_hm@w;Zbx918!_QwkFzb{%9Hb@n}NEsmJyr$_CM=q-9tT?$ohm#LGMg zE}v;1e*vQ9%byi3JNZHBXPoYMeC_$D>EZqLs88UR5sK+B6uG9ab$!aRGM$E%K?a{T*MButE+gFa<@ z0)nRVXi4tWE&7f&fTa;`z76xuHZA_#Xq$hMM1>PHrhA$8*wp%2RnlVyf5 z?Y4c6b{!>=(Iu*%8G1Cj0_qLWm2TOz(g7=|qus-}SHlX-Lmc^W>dVz?;(;ayOJf*9 zJ#!i{Y9WCC}ow7F1Cn1F#pXOZC*LrjVZA7^>H@SP`WAL!JGl2NT zZsBF!YIHbzX*2QpyEM^%8^9D}Ak5(MNLpGT4@2%Qr5i5RhCqyn#)=LJYxCCPbh)Val{xk)a{c zsHNn~a}n4fy=x7WLZcst;oE1g-d(>yZ^1sp_urO7t0Acl3=BOskjKw5QUpmAG?S%B z?1h86h$H3r~^1RmBEkwH>D&#yyv_WvIAucqvcXSha7 zfEmzz>YT0^gqqz8{Mdaj82>k5WyOKPIERyd6Bpx$1WpPqr=`89sA_s8 zOxef;7?hbC^C|eR7WAKD$X?Eqr}w>HXFcHi+hWQfX@-;Vf%IEver<#PNu1LPpH9MS zzCO*2{+lh*N)&l$rD*ngpI-g#Qau=HJ`qcl1lGs%FBWab3(n3$}|M%LYKd3I4 z))Vt0cIo%O!R%@N_m2VtU3fTdUi%O7{^E{yzJV@~yP7+{{Vk>*|E}5~$HM#r5g;f9&0z4f}Q_| ztp9Q&U7YFJ?(WT>_~=h=aiv>pP8)P3ebB$&HT&G=uZg6(obD2Cio8+c{+HwatD!fQ z@wu)F)uYa3>LuTp++Pp*Z>eg+{4L@w&qw$fvRB7khtA)TimDvfn}qmyuem9!kw3A| zudAAa)z>*_UF!IE0KG8A?;EQXVewbg{^p2+ z)+8D(;-;vlO9TF23_9D~++5)v>$z6Z`swe7b{1h{rEo?xf7SB;{V$m7PjhFE*MCTsp-dBO|j$CSog*1OnlvpMk#qZ0Pwb4l0{>d$y(N9kR~E#l@}F z?M=I*usB~{^oJpuIxB3bS;rB%Oq_1ZHJ!VY?f=d1*EoOt8%=UCpLU^X-Joufbjx)R z#B82GBu)V+D^^Ch_1f(y>ad^o$YPQtes>6BAQN=@k*ufF{G zoA+AP;yBk%Z#!+^)uV$&B!lw@&@UOTSNY^)dg9gA4|TM4I!mNWbu<4IIqlZOBKPHp 
zN)g;OoKGQ@4jVgGA@-(57ED~~NLwQ7hvSY|d9+_{qK#&eWbNql2ys&}F865V-A}|J zRDy|XGiQ@}IKExMf95r}6J@(w(dD_XNuAzJ^th%ddJv3q8uGpBzyC~*SX5f=Ibv1` zB0igmV&DH6qdPgEs{&heH!?D6^~_Fos?r+?oKLWrrfxK`u5~M}doW$`(39(XF851e zD)de@j1k@%b%ppS>hf|+B=1P3GI1a;npfKHy@^NPLz6P4vwuizGYwU%-NJXJQZKU_ zrM;&D4)v|$1K3lkQ>t**Zzeg)c%`G;Cvbc+Ir(%Rp?2p+Z%L84n zG2qtgk8zBtifRpev^%dX_2`OoiymB0h&&9mEk(u6b$jS|N7gMV)rtL-u?%66ePNsG z8BbYAcA30?R($ZKS5n&lrp|xzQQOJ4x=Neh1{5=O+w~}`m3ZS?`#5eYjoK%Z95G|F zo}c+?`neZ$(qMciTIJ}4V9JYNV4y5;z$amE${UHOZ91=Log{wSjJ{JCnEPc2Y$ASJ zoP2+72zNv#I#(~}mh#0(`;&$sQ*IsNi$^qW*xPL7tHIVEjY~y)nweV$F`VO2oJ#}1czY6XRMc<9Td&O1FTv0X|XVx>1KvO2#>)*tpS0FPtm zO$}mSW{2dZ`L>gbX{!{Jjk8xVFgFRvDr5EiT;RPvy^vtoTi?4mhzr< z-G(49=Q-mJ7D}ZmjJUpZQQmIY12XOhdK6iD0UhhyjJ{;KHq^#FhTHOfiHMZjJ=8dyW zb8WfA{^;2hyP!@Pz@NrQUfoNV&uCL28e2F7B~-&JN_fDGvO$BIsAAjIzSFik;ZpCZ z5G@_uZNKX1QI{k+;YE7s(BXAHbX~Q=U6tDJt-AQCJK?rf%cwk0-J_p(qRE`yMytqO z(9lV1UgK(a^~J}zj;e+h06hH_K?IXE)?H(WdC(*6XlFpzt2-BUi;djk@=J;1Tdf)f z-6!147^m?&KGW*Oq-*O~*Aka_O3C|{dj>7V&Nm66R$F<6Mg zBylJK(~W}U8D&6VV=T9k^>4)15LSSl)p^v{5QavP9>o_wRP&rvWf=Wr;){bLlho;T zU=~3)4uoD5sm^dhOrHfwf!=+gtx#i^;~I>jVDz(aav=xc%AP7csP| z-12FZu+`JVQRYCAQ8e+rQkhDLp2a-w+Xbx~49(2v9g>t5lN=>XbO|a|P8|6LpTXLV z?u=q;$V_6p*P(~@<=f9+F%rF}JRxT}+I3q+lwRdfL2UX6@Iqxbd)(2q-+JBZT?Z5K z;sG@V1?&!@w^%Q5`-rmcEuwNY_!V=e%1*D^?nV%KW7556%oS4QPK_Kn*w~+oR)mrZ zg&+IP$3e#Zmk#wdo_3tOB=zm+$o^`T@Z-b7?VjYsT@a;oIX77$P8ZTqZk8;(rHgUD zh5UnZ)L0a}J#?O=DUH?b^lLe)LD&CImlb1IxIYKB# z!%9*FqqcCQiZ`8ro9^-BOopMTBmMO6CV%t<`&>b?-vt zNOHkzRc!F516f9Ij*}6xj1Pe)YWsRfz72u&I&)5G@U1qkY*T0)$cYO|p7tf8xje?5 zO1qqrf|1Of_Guf`ryFarK%L^i=q`=}^+wc`*F+Z!A7GLJ=Znq_Q|0iPa&5f=n@GwH z#Flt-HUL%a2J@eCy;EBG+$3fK-WJj;R5i3??iQ{A0qNFPN_#5Kjnf1%pfi3S z-H;=dG*Feh7KYn4KF$cc>aZQgKc5v1ND^W~kF;}v_&~;nv@&LOM!3oNR&if-Io5Br z9MW<-ZCl=d0#=#Ii10jbSJVD{7~f~7s?#`pOkEGC2w?td;k%q8dv)hG^~)bdn-|7~ zzvNq%1urt6ETpX;#-#-QY{beW#D+TO!I(u0JPj!+v~ttzk7O>Yy45!k-UFI^t#rS) z__|v$?x_kPKkEW(uCIaHQKM76u%lng(9J+R~(NxAQ3)_UO8!Cu)+ z?d^PF)vI0B(cMkIE-s=8qJ{CesmG7TRI9@ac;7MugsfISSw*>ud>A)NDzV`BlFg~3 zYNXB_oMXes=^RmXDafAT+ILiOPigr^Rmsj4jCoYJfs^{Qw=73{48{CD)JYeue(#!D zNLCP(aXwSPe|s!VQ%baw&YnZ%g2qRN)Mzm|A;ET~zJD=&DtU5JIkOGbsEz8(qZW84jMK5^bmkSg6k%UGUxA4T@&e#7|JuS>% zv@)l6LDADFjtT-=-h~VtiL#-YBJ`8VT;xPlbV0|$HXuP8K>WV9U!YVORV-qapY4Uo zwVc+Hth1IC(MrY4mjEq+i8Uf^McZaXgWkJjgdW{K&C+%|-U~Rky@X-j$Kc3>-3A?k zEGNfsRZ`LcZ}2)Z`B6xE-+UAP8T2HIc7RFsRg3BIZF$QXCsXqjnC_GhzWkX+W((Bx zcLZFLuAEnhY7`sT6R)m&ff0KA8@S*rQ?>Wc>Y_@^>#jbw z`I#@S-jh1ykEs}Ua7H3_i;J|N+2-zDGT|VhQO>#wLBN|@Jw4NrpIvhJ$r89sC}udr z@r%?AV6qhN5XXKO2n?oRU0r2xjt&F+mDiT8NErYZ?UyNVr!6CIT&wQL18O9QUq>Ky zo5hm2u9X|#q0@o>{FOO=1|62-(KUi+g&u~6UEZnGn9m`}=>?d8bPj2nrJ}#Fy+5+z z{ZO*6*v>JX476ooBB|62U0SthiKF)7b+05ibr)=Zzc!>A;2sA;-Of#(aJ$mV_72<} zXDWU<;{9^6qd(x6@9d@vGJ8-ycF%+sYF-spyc26XsBOV$o+Y;yr|jMLklTn&r%@j2 z|JtifAmMHt=CYLa8%8ik3h?u_cL`%y3zfaC4o_^G<-h+IfMnPh>?%y_(qo7A>-}Fs zL_T~P=XhlxZxaXnhr#5<=@BUl#S07NyPkHz9oW!J47`A*KCv#CgJ)ac%35Rz6%R{X zs%~U>1=&}qgfsUf_(T(4hts+w6|`%+W2mk?XN<pUf_naXX7F z?M5>Mx7_3lwX?f~(!ul(7^27k>fruasCSc}S%S240fSjP+Yrf=I_N23Hh@3JSiIFH zHCS#qVh3V8O~K&D17Nz1R*c{3$plok_HssQFULi2M0|0i|J8+vqwdz53IVwz2bWbC zaU?llP6}t>5WZvZ=pnUS{0ci&TZg6k5#>XNPEK&SnOPXOp(VX&Bbq5?$6Jk+ZFzrb zJ)NP~ZMX@jP*IH$QB589c;_L;aVx4q@4P(F%f-XI&UwwN{0Ys-#Jlqk8I6^bkL>aH zJ7l7zUaHZlh-ERI-tm^O2>G1izo-z!9Mz#r`jqXjTP{Jxt7~`k{?ZLs<*!q%WZieD zj4StD$_N|38^Y^STg*6QPBS8JUT5AVwA8ozirmqn*OHkEG)M=HsWD$5+I*3c@fE8#{=nm!8KBfMg38>U>|Q;!VsPIaY>b zwC*L}mxcZ&@;TSi{Im`s<2-`N6E;w;+xVGP<*>KqDa<*5;$BH`X&Zx$&ClW90Iw`jU4ZSqArv&=!qP z8MiogdJmVAoPUbeJZh}()$d4`s6X8DiZP+LfvL^2l`E4(fnzNTVp-|g&LxlTydMg) z^O{=f%Mx$kCFkRr3q^PhBiJ%Pb 
From: Charlene Yang
Date: Wed, 21 May 2025 05:59:03 +0800
Subject: [PATCH 06/26] [PyTorch] Add docstring for CP load balancing (#1802)

add docstring for CP

Signed-off-by: Charlene Yang
---
 .../dot_product_attention/context_parallel.py | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
index b52d1003f..2b8d332f4 100644
--- a/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
+++ b/transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py
@@ -3487,7 +3487,64 @@ def attn_forward_func_with_cp(
     use_flash_attn_3=False,
 ) -> torch.Tensor:
     """
-    Attention implementation with context parallelism.
+    Attention implementation with context parallelism (CP). CP partitions tensors along the sequence
+    dimension, and by reducing the memory and computational pressure on each GPU, it enables long-context
+    LLMs in a distributed fashion. Transformer Engine's PyTorch CP implementation currently utilizes
+    the DualChunkSwap strategy to ensure load balancing across CP ranks. It is applied to all `attn_mask_type`s
+    and all `qkv_format`s, and it requires sequence lengths to be, or to be padded to be, divisible by
+    (cp_size * 2). It also requires tokens to be re-ordered before entering this function.
+
+    For qkv_format = {'bshd', 'sbhd'}, the token re-ordering is illustrated below, for an example
+    use case of s = 12, attn_mask_type = 'causal', and cp_size = 2. seq_pos indicates each token's position
+    in its corresponding sequence.
+ + GPU0 | GPU1 GPU0 | GPU1 + seq_pos | 0 1 2 3 4 5 | 6 7 8 9 10 11 seq_pos | 0 1 2 9 10 11 | 3 4 5 6 7 8 + ---------------------------|----------------- ---------------------------|------------------ + 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + G 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 G 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + P 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0 P 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + U 3 | 1, 1, 1, 1, 0, 0,| 0, 0, 0, 0, 0, 0 U 9 | 1, 1, 1, 1, 0, 0,| 1, 1, 1, 1, 1, 1, + 0 4 | 1, 1, 1, 1, 1, 0,| 0, 0, 0, 0, 0, 0 -> 0 10 | 1, 1, 1, 1, 1, 0,| 1, 1, 1, 1, 1, 1, + 5 | 1, 1, 1, 1, 1, 1,| 0, 0, 0, 0, 0, 0 11 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 1, + ---------------------------|----------------- ---------------------------|------------------ + 6 | 1, 1, 1, 1, 1, 1,| 1, 0, 0, 0, 0, 0 3 | 1, 1, 1, 0, 0, 0,| 1, 0, 0, 0, 0, 0, + G 7 | 1, 1, 1, 1, 1, 1,| 1, 1, 0, 0, 0, 0 G 4 | 1, 1, 1, 0, 0, 0,| 1, 1, 0, 0, 0, 0, + P 8 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 0, 0, 0, P 5 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 0, 0, 0, + U 9 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 0, 0, U 6 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + 1 10 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 0, 1 7 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 1, 0, + 11 | 1, 1, 1, 1, 1, 1,| 1, 1, 1, 1, 1, 1, 8 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 1, 1, + + For qkv_format = 'thd', multiple sequences may be packed into the batch, and they may be of different + lengths. DualChunkSwap divides each sequence into (cp_size * 2) chunks and distributes 2 chunks of + every sequence onto a CP rank. The token matrix transformation is shown as follows, for an example of + batch_size = 2, seq_ids = [0, 1], seq_lens = [8, 4], t = 12, attn_mask_type = 'padding_causal', and + cp_size = 2. + + GPU0 | GPU1 GPU0 | GPU1 + seq_id | 0 0 0 0 0 0 | 0 0 1 1 1 1 seq_id | 0 0 0 0 1 1 | 0 0 0 0 1 1 + seq_pos | 0 1 2 3 4 5 | 6 7 0 1 2 3 seq_pos | 0 1 6 7 0 3 | 2 3 4 5 1 2 + ---------------------------|----------------- ---------------------------|------------------ + 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 0 0 | 1, 0, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + G 0 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0 G 0 1 | 1, 1, 0, 0, 0, 0,| 0, 0, 0, 0, 0, 0, + P 0 2 | 1, 1, 1, 0, 0, 0,| 0, 0, 0, 0, 0, 0 P 0 6 | 1, 1, 1, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + U 0 3 | 1, 1, 1, 1, 0, 0,| 0, 0, 0, 0, 0, 0 U 0 7 | 1, 1, 1, 1, 0, 0,| 1, 1, 1, 1, 0, 0, + 0 0 4 | 1, 1, 1, 1, 1, 0,| 0, 0, 0, 0, 0, 0 -> 0 1 0 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 0, 0, + 0 5 | 1, 1, 1, 1, 1, 1,| 0, 0, 0, 0, 0, 0 1 3 | 0, 0, 0, 0, 2, 2,| 0, 0, 0, 0, 2, 2, + ---------------------------|----------------- ---------------------------|------------------ + 0 6 | 1, 1, 1, 1, 1, 1,| 1, 0, 0, 0, 0, 0 0 2 | 1, 1, 0, 0, 0, 0,| 1, 0, 0, 0, 0, 0, + G 0 7 | 1, 1, 1, 1, 1, 1,| 1, 1, 0, 0, 0, 0 G 0 3 | 1, 1, 0, 0, 0, 0,| 1, 1, 0, 0, 0, 0, + P 1 0 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 0, 0, 0 P 0 4 | 1, 1, 0, 0, 0, 0,| 1, 1, 1, 0, 0, 0, + U 1 1 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 0, 0 U 0 5 | 1, 1, 0, 0, 0, 0,| 1, 1, 1, 1, 0, 0, + 1 1 2 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 2, 0 1 1 1 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 2, 0, + 1 3 | 0, 0, 0, 0, 0, 0,| 0, 0, 2, 2, 2, 2 1 2 | 0, 0, 0, 0, 2, 0,| 0, 0, 0, 0, 2, 2, + + When all transformer layers in a model share the same CP configuration, i.e. cp_group, cp_global_ranks, + cp_comm_type and cp_stream, token re-ordering can take place in the dataloader, i.e. only once for + all the layers. An example of the re-ordering code is `get_batch_on_this_cp_rank + `_ + in Megatron-LM. 
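A minimal sketch of the re-ordering described above, for qkv_format = 'bshd' with the sequence on dimension 1 (the helper name reorder_for_cp is hypothetical and not part of this patch; it only illustrates the chunk selection that Megatron-LM's get_batch_on_this_cp_rank performs):

    import torch

    def reorder_for_cp(tokens: torch.Tensor, cp_size: int, cp_rank: int) -> torch.Tensor:
        # Split the sequence dimension into (cp_size * 2) equal chunks and keep
        # chunk `cp_rank` together with its mirror chunk `2 * cp_size - 1 - cp_rank`.
        # For s = 12 and cp_size = 2, rank 0 keeps seq_pos [0 1 2, 9 10 11] and
        # rank 1 keeps seq_pos [3 4 5, 6 7 8], matching the tables above.
        chunks = tokens.chunk(2 * cp_size, dim=1)
        return torch.cat([chunks[cp_rank], chunks[2 * cp_size - 1 - cp_rank]], dim=1)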
+ """ if cp_comm_type == "a2a+p2p": From 90458e773e7a05683040eebda482d7e429e4f8eb Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 20 May 2025 17:26:11 -0700 Subject: [PATCH 07/26] Add missing docs for C API (#1803) * Add missing docs for C API Signed-off-by: Kirthi Shankar Sivamani * Grammar, typos, copy-paste errors Signed-off-by: Kirthi Shankar Sivamani * remove contiguous word Signed-off-by: Kirthi Shankar Sivamani * Better wording Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- docs/api/c/cast_transpose_noop.rst | 9 + docs/api/c/comm_gemm_overlap.rst | 9 + docs/api/c/cudnn.rst | 9 + docs/api/c/index.rst | 4 + docs/api/c/multi_tensor.rst | 9 + .../transformer_engine/cast_transpose_noop.h | 16 +- .../include/transformer_engine/fused_attn.h | 143 ++++++++++++ .../include/transformer_engine/multi_tensor.h | 204 ++++++++++++++++++ 8 files changed, 394 insertions(+), 9 deletions(-) create mode 100644 docs/api/c/cast_transpose_noop.rst create mode 100644 docs/api/c/comm_gemm_overlap.rst create mode 100644 docs/api/c/cudnn.rst create mode 100644 docs/api/c/multi_tensor.rst diff --git a/docs/api/c/cast_transpose_noop.rst b/docs/api/c/cast_transpose_noop.rst new file mode 100644 index 000000000..ae80c5d2d --- /dev/null +++ b/docs/api/c/cast_transpose_noop.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +cast_transpose_noop.h +===================== + +.. doxygenfile:: cast_transpose_noop.h diff --git a/docs/api/c/comm_gemm_overlap.rst b/docs/api/c/comm_gemm_overlap.rst new file mode 100644 index 000000000..090551f60 --- /dev/null +++ b/docs/api/c/comm_gemm_overlap.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +comm_gemm_overlap.h +=================== + +.. doxygenfile:: comm_gemm_overlap.h diff --git a/docs/api/c/cudnn.rst b/docs/api/c/cudnn.rst new file mode 100644 index 000000000..5d93c4d6e --- /dev/null +++ b/docs/api/c/cudnn.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +cudnn.h +======= + +.. doxygenfile:: cudnn.h diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index 7bc864dcc..27ba553d6 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -14,10 +14,14 @@ directly from C/C++, without Python. transformer_engine.h activation.h + cast_transpose_noop.h cast.h + comm_gemm_overlap.h + cudnn.h fused_attn.h fused_rope.h gemm.h + multi_tensor.h normalization.h padding.h permutation.h diff --git a/docs/api/c/multi_tensor.rst b/docs/api/c/multi_tensor.rst new file mode 100644 index 000000000..8ba2d274c --- /dev/null +++ b/docs/api/c/multi_tensor.rst @@ -0,0 +1,9 @@ +.. + Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. + +multi_tensor.h +============== + +.. doxygenfile:: multi_tensor.h diff --git a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h index 678ffe919..649b5ced5 100644 --- a/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h +++ b/transformer_engine/common/include/transformer_engine/cast_transpose_noop.h @@ -17,23 +17,21 @@ extern "C" { #endif -/*! 
\brief Transposes the input, providing the option to immediately exit the kernel - * based on the value of the 'noop' tensor. +/*! \brief Transposes the input. * - * \param[in] input Input tensor. - * \param[in] noop Noop tensor. + * \param[in] input Input tensor to be cast. + * \param[in] noop If this single element tensor has non-zero value, kernel will exit immediately. * \param[in,out] output Output tensor. * \param[in] stream CUDA stream used for the operation. */ void nvte_transpose_with_noop(const NVTETensor input, const NVTETensor noop, NVTETensor output, cudaStream_t stream); -/*! \brief Casts and transposes the input, providing the option to immediately exit the kernel - * based on the value of the 'noop' tensor. +/*! \brief Casts and transposes the input. * - * \param[in] input Input tensor. - * \param[in] noop Noop tensor. - * \param[in,out] output Output tensor. + * \param[in] input Input tensor to be cast. + * \param[in] noop If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] output Output quantized tensor. * \param[in] stream CUDA stream used for the operation. */ void nvte_cast_transpose_with_noop(const NVTETensor input, const NVTETensor noop, NVTETensor output, diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index 3400eaaeb..f63ee636d 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -634,6 +634,8 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso #ifndef __HIP_PLATFORM_AMD__ /*! \brief Update the RNG state with the seed and calculated offset. + * + * \warning This API is **experimental** and subject to change. * * \param[in] rng_state_dst RNG state to store seed and offset. * \param[in] seed Seed for RNG state. @@ -666,6 +668,8 @@ void nvte_populate_rng_state_async(void *rng_state_dst, const void *const seed, #endif /*! \brief Get KV format for a given QKV layout. + * + * \warning This API is **experimental** and subject to change. * * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. * \param[in] workspace Workspace tensor. @@ -675,48 +679,187 @@ void nvte_populate_rng_state_async(void *rng_state_dst, const void *const seed, uint32_t nvte_get_runtime_num_segments(NVTETensor cu_seqlen, NVTETensor workspace, size_t len, cudaStream_t stream); +/*! \brief Set the seed and offset for RNG state. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] rng_state_ptr A size 2 array storing the RNG's seed and offset respectively. + * \param[in] captured Whether a CUDA graph is being captured. + * \param[in] seed_ptr Seed pointer. + * \param[in] seed_val Seed value. + * \param[in] offset_ptr Offset pointer. + * \param[in] offset_val Offset value. + * \param[in] offset_intragraph Intragraph offset in RNG states. For use with CUDA Graphs. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_extract_seed_and_offset(int64_t *rng_state_ptr, int captured, int64_t *seed_ptr, uint64_t seed_val, int64_t *offset_ptr, uint64_t offset_val, uint32_t offset_intragraph, cudaStream_t stream); +/*! \brief Copy keys and values into the KV cache. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] new_k Key tensor. + * \param[in] new_v Value tensor. + * \param[out] k_cache Key cache. 
+ * \param[out] v_cache Value cache. + * \param[in] page_table Page table for K cache, [batch_size, max_pages_per_seq]. + * \param[in] cu_new_lens Cumulative sequence lengths. + * \param[in] cu_cached_lens Cached cumulative sequence lengths. + * \param[in] qkv_format QKV format, e.g. sbhd. + * \param[in] b Batch size. + * \param[in] max_ctx_len Maximum context length. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] max_pages_per_seq Maximum number of pages per sequence. + * \param[in] is_non_paged Whether the cache is paged or not. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_copy_to_kv_cache(NVTETensor new_k, NVTETensor new_v, NVTETensor k_cache, NVTETensor v_cache, NVTETensor page_table, NVTETensor cu_new_lens, NVTETensor cu_cached_lens, NVTE_QKV_Format qkv_format, int b, int max_ctx_len, int max_seq_len, int max_pages_per_seq, int is_non_paged, cudaStream_t stream); +/*! \brief Extract the first half (half_idx=0) or second half (half_idx=1) of a THD tensor. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] half Output tensor. + * \param[in] half_idx Whether to read first or second half of input tensor. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_read_half_tensor(const NVTETensor &tensor, const NVTETensor &cu_seqlens, NVTETensor half, int half_idx, cudaStream_t stream); +/*! \brief Correct the second half of the softmax LSE (LogSumExp) for context parallelism. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] lse Output tensor. + * \param[in] lse_per_step Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] lse_packed Whether or not lse_per_step is packed. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_second_half_lse_correction(NVTETensor lse, const NVTETensor &lse_per_step, const NVTETensor &cu_seqlens, int lse_packed, cudaStream_t stream); +/*! \brief Read the second half of the softmax LSE (LogSumExp) for context parallelism. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] lse Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] half_lse Output tensor. + * \param[in] lse_packed Whether or the softmax LSE is in packed format. + * \param[in] second_half_lse_seqlen Sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_read_second_half_lse(const NVTETensor &lse, const NVTETensor &cu_seqlens, NVTETensor half_lse, int lse_packed, int second_half_lse_seqlen, cudaStream_t stream); +/*! \brief Correct the THD format output of context parallelism in forward pass. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] out Output tensor. + * \param[in] out_per_step THD format output of context parallelism in forward pass. + * \param[in] lse Softmax LSE. + * \param[in] lse_per_step Softmax LSE per step. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] only_second_half Whether or not to correct only second half. + * \param[in] lse_packed Whether or the softmax LSE is in packed format. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_cp_thd_out_correction(NVTETensor out, const NVTETensor &out_per_step, const NVTETensor &lse, const NVTETensor &lse_per_step, const NVTETensor &cu_seqlens, int only_second_half, int lse_packed, cudaStream_t stream); +/*! \brief Correct the THD format output of context parallelism in forward pass. + * + * \warning This API is **experimental** and subject to change. + * + * \param[out] grad Output tensor. + * \param[in] grad_per_step THD format gradient of context parallelism. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[in] first_half One of ("add", "copy", "none") correction op for first half. + * \param[in] second_half One of ("add", "copy", "none") correction op for second half. + Must be different from first_half. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_grad_correction(NVTETensor grad, const NVTETensor &grad_per_step, const NVTETensor &cu_seqlens, const char *first_half, const char *second_half, cudaStream_t stream); +/*! \brief Generate partitioned indices for inputs in THD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] output Output tensor. + * \param[in] total_tokens Total number of tokens. + * \param[in] world_size Total number of devices for context parallelism. + * \param[in] rank Device ID for current device. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_cp_thd_get_partitioned_indices(const NVTETensor &cu_seqlens, NVTETensor output, int total_tokens, int world_size, int rank, cudaStream_t stream); +/*! \brief Convert tensor from THD to BSHD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] new_tensor Output tensor. + * \param[in] b Batch size. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_convert_thd_to_bshd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor, int b, int max_seq_len, cudaStream_t stream); +/*! \brief Convert tensor from BSHD to THD format. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] tensor Input tensor. + * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. + * \param[out] new_tensor Output tensor. + * \param[in] b Batch size. + * \param[in] max_seq_len Maximum sequence length. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_convert_bshd_to_thd(NVTETensor tensor, NVTETensor cu_seqlens, NVTETensor new_tensor, int t, cudaStream_t stream); +/*! \brief Prepare QKV tensor for Flash Attention forward kernel. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] qkvi Input tensor. + * \param[out] qkv Output tensor. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_prepare_flash_attn_fwd(NVTETensor qkvi, NVTETensor qkv, cudaStream_t stream); +/*! \brief Prepare QKV tensor for Flash Attention backward kernel. + * + * \warning This API is **experimental** and subject to change. + * + * \param[in] q Input query tensor. + * \param[in] k Input key tensor. + * \param[in] v Input value tensor. + * \param[out] qkv Output tensor. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_prepare_flash_attn_bwd(NVTETensor q, NVTETensor k, NVTETensor v, NVTETensor qkv, cudaStream_t stream); diff --git a/transformer_engine/common/include/transformer_engine/multi_tensor.h b/transformer_engine/common/include/transformer_engine/multi_tensor.h index e78b31d77..c21fd2627 100644 --- a/transformer_engine/common/include/transformer_engine/multi_tensor.h +++ b/transformer_engine/common/include/transformer_engine/multi_tensor.h @@ -17,6 +17,25 @@ extern "C" { #endif +/*! \brief Computes L2 norm for a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] output Scratch space. Required size grows with number of inputs. + * \param[in] output_per_tensor Fixed size auxilliary scratch space. + * \param[out] ret L2 norm of all inputs. + * \param[out] ret_per_tensor L2 norm for each tensor. + * \param[in] per_tensor Whether to calculate per tensor or cumulative norm. + * \param[in] max_chunks_per_tensor Maximum number of chunks in any input tensor. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor output, NVTETensor output_per_tensor, NVTETensor ret, @@ -24,6 +43,28 @@ void nvte_multi_tensor_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETen int max_chunks_per_tensor, const int device_id, cudaStream_t stream); +/*! \brief Computes L2 norm for a list of tensors after unscaling. + * + * Unscaling is only done for computing the L2 norm. The tensors themselves are not updated. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] output Scratch space. Required size grows with number of inputs. + * \param[in] output_per_tensor Fixed size auxilliary scratch space. + * \param[out] ret L2 norm of all inputs. + * \param[out] ret_per_tensor L2 norm for each tensor. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] per_tensor Whether to calculate per tensor or cumulative norm. + * \param[in] max_chunks_per_tensor Maximum number of chunks in any input tensor. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_unscale_l2norm_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor output, @@ -32,6 +73,27 @@ void nvte_multi_tensor_unscale_l2norm_cuda(int chunk_size, NVTETensor noop_flag, int per_tensor, int max_chunks_per_tensor, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, const float beta1, const float beta2, @@ -39,12 +101,57 @@ void nvte_multi_tensor_adam_cuda(int chunk_size, NVTETensor noop_flag, NVTETenso const int bias_correction, const float weight_decay, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * where the master parameters only store the remainder bits. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_adam_param_remainder_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, const float beta1, const float beta2, const float epsilon, const int step, const int mode, const int bias_correction, const float weight_decay, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * when model parameters are in Float8 precision. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] fp8_dtype FP8 data type for model parameters. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_fp8_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, const float lr, @@ -53,28 +160,125 @@ void nvte_multi_tensor_adam_fp8_cuda(int chunk_size, NVTETensor noop_flag, const float weight_decay, const NVTEDType fp8_dtype, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * with CUDA graph support and LR scheduling. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_adam_capturable_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor lr, const float beta1, const float beta2, const float epsilon, NVTETensor step, const int mode, const int bias_correction, const float weight_decay, NVTETensor inv_scale, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for Adam optimizer + * with CUDA graph support, LR scheduling, and FP32 master weights. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] lr Learning rate. + * \param[in] beta1 Coefficient for first moment of gradient. + * \param[in] beta2 Coefficient for second moment of gradient. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] step Iteration counter. + * \param[in] mode Whether to use AdamW (L2 penalty applied to params). + * \param[in] bias_correction Whether to apply correction factor for moment estimates. + * \param[in] weight_decay L2 penalty for weight decay. + * \param[in] inv_scale Scalar for the unscaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_adam_capturable_master_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, NVTETensor lr, const float beta1, const float beta2, const float epsilon, NVTETensor step, const int mode, const int bias_correction, const float weight_decay, NVTETensor inv_scale, const int device_id, cudaStream_t stream); +/*! \brief Compute and apply gradient update to parameters for SGD optimizer. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] wd Weight decay (L2 penalty). + * \param[in] momentum Momentum factor. + * \param[in] dampening Dampening factor. + * \param[in] lr Learning rate. + * \param[in] nesterov Whether or not to enable nesterov momentum. + * \param[in] first_run Whether momentum buffers have been initialized. + * \param[in] wd_after_momentum Whether to applied weight decay after momentum update. + * \param[in] scale Scalar for the scaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. 
+ */ void nvte_multi_tensor_sgd_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float wd, float momentum, float dampening, float lr, int nesterov, int first_run, int wd_after_momentum, float scale, const int device_id, cudaStream_t stream); +/*! \brief Check overflow and scale a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] scale Scalar for the scaling operation. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_scale_cuda(int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float scale, const int device_id, cudaStream_t stream); +/*! \brief Check overflow and scale a list of tensors. + * + * \warning This API is **experimental** and subject to change. + * \warning Argument device_id is deprecated and will be removed in a future release. + * + * \param[in] chunk_size Number of tensor elements processed by a CUDA block. + * \param[in] noop_flag If this single element tensor has non-zero value, kernel will exit immediately. + * \param[in,out] tensor_lists 2D array of input tensors. + * \param[in] num_tensor_lists Size (dim0) of tensor_lists. + * \param[in] num_tensors_per_list Size (dim1) of tensor_lists. + * \param[in] max_fp8 Maximum representible value in underlying FP8 format. + * \param[in] force_pow_2_scales Ensure scaling factors are a power of 2. + * \param[in] epsilon Term added to the denominator for numerical stability. + * \param[in] device_id [DEPRECATED] CUDA device ID for this operation. + * \param[in] stream CUDA stream used for this operation. + */ void nvte_multi_tensor_compute_scale_and_scale_inv_cuda( int chunk_size, NVTETensor noop_flag, NVTETensor **tensor_lists, const size_t num_tensor_lists, const size_t num_tensors_per_list, float max_fp8, int force_pow_2_scales, float epsilon, From 3a5ca57fd68854f4f6145ef0278a1e56a1f63b0e Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 22 May 2025 12:20:14 -0700 Subject: [PATCH 08/26] Remove `comm_gemm_overlap` doc (#1815) Remove comm_gemm_overlap docs Signed-off-by: Kirthi Shankar Sivamani --- docs/api/c/comm_gemm_overlap.rst | 9 --------- docs/api/c/index.rst | 1 - 2 files changed, 10 deletions(-) delete mode 100644 docs/api/c/comm_gemm_overlap.rst diff --git a/docs/api/c/comm_gemm_overlap.rst b/docs/api/c/comm_gemm_overlap.rst deleted file mode 100644 index 090551f60..000000000 --- a/docs/api/c/comm_gemm_overlap.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. - Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - - See LICENSE for license information. - -comm_gemm_overlap.h -=================== - -.. 
doxygenfile:: comm_gemm_overlap.h diff --git a/docs/api/c/index.rst b/docs/api/c/index.rst index 27ba553d6..0499f52f0 100644 --- a/docs/api/c/index.rst +++ b/docs/api/c/index.rst @@ -16,7 +16,6 @@ directly from C/C++, without Python. activation.h cast_transpose_noop.h cast.h - comm_gemm_overlap.h cudnn.h fused_attn.h fused_rope.h From 9b80ea92914ccbf13e86d09d0fd2eaf37ab00549 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 22 May 2025 15:26:54 -0700 Subject: [PATCH 09/26] Add docs for missing FP8 recipes. (#1816) Document all recipes Signed-off-by: Kirthi Shankar Sivamani --- docs/api/common.rst | 4 ++ transformer_engine/common/recipe/__init__.py | 58 +++----------------- 2 files changed, 11 insertions(+), 51 deletions(-) diff --git a/docs/api/common.rst b/docs/api/common.rst index 95d4b50f3..541118985 100644 --- a/docs/api/common.rst +++ b/docs/api/common.rst @@ -11,3 +11,7 @@ Common API .. autoapiclass:: transformer_engine.common.recipe.DelayedScaling(margin=0, fp8_format=Format.HYBRID, amax_history_len=1024, amax_compute_algo="max", scaling_factor_compute_algo=None) .. autoapiclass:: transformer_engine.common.recipe.MXFP8BlockScaling(fp8_format=Format.E4M3) + +.. autoapiclass:: transformer_engine.common.recipe.Float8CurrentScaling(fp8_format=Format.HYBRID) + +.. autoapiclass:: transformer_engine.common.recipe.Float8BlockScaling(fp8_format=Format.E4M3) diff --git a/transformer_engine/common/recipe/__init__.py b/transformer_engine/common/recipe/__init__.py index 1cf974987..466c2e605 100644 --- a/transformer_engine/common/recipe/__init__.py +++ b/transformer_engine/common/recipe/__init__.py @@ -209,42 +209,12 @@ def __repr__(self) -> str: class Float8CurrentScaling(Recipe): """ Use the per-tensor current scaling factor strategy. + Parameters ---------- fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.HYBRID Controls the FP8 data format used during forward and backward pass. - fp8_quant_fwd_inp: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of input tensor x - fp8_quant_fwd_weight: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of weight tensor w - fp8_quant_bwd_grad: QParams, default QParams{power_2_scale=False, amax_epsilon=0.0} - used for quantization of gradient tensor dY - fp8_gemm_fprop: MMParams, default MMParams.use_split_accumulator=False - used for calculating output y in forward pass - fp8_gemm_dgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_gemm_wgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_dpa: bool, default = `False` - Whether to enable FP8 dot product attention (DPA). When the model is placed in an - `fp8_autocast(enabled=True)` region and `fp8_dpa` is set to `True`, DPA casts the - inputs from higher precision to FP8, performs attention in FP8, and casts tensors - back to higher precision as outputs. FP8 DPA currently is only supported in the - `FusedAttention` backend. - fp8_mha: bool, default = `False` - Whether to enable FP8 multi-head attention (MHA). When `True`, it removes the casting - operations mentioned above at the DPA boundaries. Currently only standard MHA modules - i.e. `LayerNormLinear/Linear + DPA + Linear`, are supported for this feature. When - `fp8_mha = False, fp8_dpa = True`, a typical MHA module works as - `LayerNormLinear (BF16 output) -> (cast to FP8 ) FP8 DPA (cast to BF16) -> Linear`. 
- When `fp8_mha = True, fp8_dpa = True`, it becomes - `LayerNormLinear (FP8 output) -> FP8 DPA -> Linear`. - - Notes - ----- - * `fp8_dpa` and `fp8_mha` are Beta features, and their API and functionality are - subject to change in future Transformer Engine releases. """ fp8_format: Format = Format.HYBRID @@ -259,6 +229,9 @@ class Float8CurrentScaling(Recipe): def __post_init__(self) -> None: assert self.fp8_format != Format.E5M2, "Pure E5M2 training is not supported." + assert ( + not self.fp8_dpa and not self.fp8_mha + ), "FP8 attention is not supported for Float8CurrentScaling." def __repr__(self) -> str: return ( @@ -335,32 +308,12 @@ class Float8BlockScaling(Recipe): NOTE: To relax the default constraint that scales be powers of 2, set env variable NVTE_FP8_BLOCK_SCALING_FP32_SCALES=1 to override it for the recipe defaults. - export NVTE_FP8_BLOCK_SCALING_FP32_SCALES=1 - Or initialize the Recipe with non-default QParams in code for increased control. Parameters ---------- fp8_format : {Format.E4M3, Format.HYBRID}, default = Format.E4M3 Controls the FP8 data format used during forward and backward pass. - fp8_quant_fwd_inp: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of input tensor x - fp8_quant_fwd_weight: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of weight tensor w - fp8_quant_bwd_grad: QParams, default QParams{power_2_scale=True, amax_epsilon=0.0} - used for quantization of gradient tensor dY - x_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for x. - w_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for w. - grad_block_scaling_dim: Choice to use 1x128 (1 dimensional) or 128x128 (2 dimensional) - qblock scaling for grad. - fp8_gemm_fprop: MMParams, default MMParams.use_split_accumulator=False - used for calculating output y in forward pass - fp8_gemm_dgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass - fp8_gemm_wgrad: MMParams, default MMParams.use_split_accumulator=True - use for calculating dgrad in backward pass """ use_f32_scales: bool = os.getenv("NVTE_FP8_BLOCK_SCALING_FP32_SCALES", "0") == "1" @@ -394,6 +347,9 @@ def __post_init__(self) -> None: assert self.fp8_gemm_fprop.use_split_accumulator, "Split accumulator required for fprop." assert self.fp8_gemm_dgrad.use_split_accumulator, "Split accumulator required for dgrad." assert self.fp8_gemm_wgrad.use_split_accumulator, "Split accumulator required for wgrad." + assert ( + not self.fp8_dpa and not self.fp8_mha + ), "FP8 attention is not supported for Float8BlockScaling." 
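As a usage sketch for the recipes documented above (illustrative only, not part of this patch; it assumes an FP8-capable GPU, and the layer and tensor sizes are arbitrary), a recipe instance is passed to fp8_autocast to select the scaling strategy:

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common.recipe import Float8CurrentScaling

    layer = te.Linear(1024, 1024)
    inp = torch.randn(32, 1024, device="cuda")

    # Per-tensor current scaling; Float8BlockScaling() or DelayedScaling() can be
    # substituted here to pick a different scaling recipe.
    with te.fp8_autocast(enabled=True, fp8_recipe=Float8CurrentScaling()):
        out = layer(inp)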
def __repr__(self) -> str: return ( From 7558c445aa891428d96a5b4c0a2e6ce57cd289f2 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 23 May 2025 12:55:08 -0700 Subject: [PATCH 10/26] Fix the failing test cases in the CI (#1806) * Modify the test cases Signed-off-by: Przemek Tredak * Make the tests reproducible on different machines Signed-off-by: Przemek Tredak * Fixed the cache of the gamma_in_weight_dtype setting Signed-off-by: Przemek Tredak * Reinstate the tests Signed-off-by: Przemek Tredak * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * More verbose code and comments Signed-off-by: Przemek Tredak --------- Signed-off-by: Przemek Tredak Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../operator/test_cast_mxfp8_gated_swiglu.cu | 2 +- tests/cpp/test_common.cu | 23 +++++++++------ tests/pytorch/distributed/run_numerics.py | 2 +- .../common/normalization/common.cpp | 29 ++++++++++--------- .../common/normalization/common.h | 7 +++-- .../common/normalization/layernorm/ln_api.cpp | 22 +++++++++----- .../normalization/rmsnorm/rmsnorm_api.cpp | 19 ++++++++---- 7 files changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu index f93c8c9e0..0e43c2c9d 100644 --- a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu @@ -382,7 +382,7 @@ std::vector> matrix_sizes = { {256, 256}, {993, 512}, {768, 1024}, - {65536, 128}, + {65504, 128}, {16384, 1632}, }; diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 32eb1d63a..e11b32689 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -731,13 +731,19 @@ std::pair getTolerances(const DType type) { template void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { -#ifdef __HIP_PLATFORM_AMD__ - // TODO: Introduce a parallel RNG library (Random123, PCG, rocRAND) - std::uniform_real_distribution<> dis(-2.0, 1.0); - for (int i = 0; i < size; i++) { - data[i] = static_cast(dis(*gen)); + // Check how many RNG calls are required to generate one uniform random value + int rng_calls_per_val = 0; + { + std::mt19937 gen1 = *gen, gen2 = *gen; + std::uniform_real_distribution<> dis(-2.0, 1.0); + const float _ = dis(gen1); + while (gen2 != gen1) { + auto _ = gen2(); + ++rng_calls_per_val; + } } -#else + + // Generate uniform random values in parallel #pragma omp parallel proc_bind(spread) { std::mt19937 gen_local = *gen; @@ -746,15 +752,14 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { const int chunk_size = (size + threads_num - 1) / threads_num; const int idx_min = chunk_size * thread_ID; const int idx_max = std::min(chunk_size * (thread_ID + 1), static_cast(size)); - gen_local.discard(idx_min); + gen_local.discard(idx_min * rng_calls_per_val); std::uniform_real_distribution<> dis(-2.0, 1.0); for (int i = idx_min; i < idx_max; ++i) { data[i] = static_cast(dis(gen_local)); } } -#endif - gen->discard(size); + gen->discard(size * rng_calls_per_val); } void fillUniform(Tensor *t) { diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index a505d0179..b7af78832 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -207,7 +207,7 @@ def _get_tolerances(dtype): if dtype == torch.bfloat16: return {"rtol": 1.6e-2, "atol": 
1e-5} if dtype == torch.float32: - return {"rtol": 1.3e-6, "atol": 4e-5} + return {"rtol": 1e-4, "atol": 1e-4} raise ValueError(f"Unsupported dtype ({dtype})") diff --git a/transformer_engine/common/normalization/common.cpp b/transformer_engine/common/normalization/common.cpp index 3be7d5004..029c89e2a 100644 --- a/transformer_engine/common/normalization/common.cpp +++ b/transformer_engine/common/normalization/common.cpp @@ -41,9 +41,6 @@ Compute always in FP32 namespace transformer_engine { namespace normalization { -#ifndef __HIP_PLATFORM_AMD__ -bool& use_zero_centered_gamma_in_weight_dtype(); - cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { return training ? cudnn_frontend::NormFwdPhase_t::TRAINING : cudnn_frontend::NormFwdPhase_t::INFERENCE; @@ -53,13 +50,17 @@ cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { TupleKeyType get_key(NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, DType ctype, uint64_t batch_size, uint64_t hidden_size, bool zero_centered_gamma, - bool is_tuned, NVTEScalingMode mode, bool training) { - // TODO: Add scaling_mode to general_key is needed - uint64_t general_key = static_cast(itype) | (static_cast(otype) << 3) | - (static_cast(ctype) << 6) | (static_cast(wtype) << 9) | - (uint32_t(NormType) << 12) | (uint32_t(NormStage)) << 14 | - (uint32_t(NormBackend) << 16) | (uint32_t(zero_centered_gamma) << 18) | - (uint32_t(mode) << 19) | (uint32_t(training) << 22); + bool is_tuned, NVTEScalingMode mode, bool training, + bool gamma_in_weight_dtype) { + static_assert(NVTE_INVALID_SCALING < 1024, + "This function assumes at most 10 bits used in the scaling mode."); + static_assert(kNVTENumTypes < 32, "This function assumes at most 5 bits used in the NVTEDType"); + uint64_t general_key = static_cast(itype) | (static_cast(otype) << 5) | + (static_cast(ctype) << 10) | + (static_cast(wtype) << 15) | (uint64_t(NormType) << 20) | + (uint64_t(NormStage)) << 22 | (uint64_t(NormBackend) << 24) | + (uint64_t(zero_centered_gamma) << 26) | (uint64_t(mode) << 27) | + (uint64_t(training) << 37) | (uint64_t(gamma_in_weight_dtype) << 38); return std::make_tuple(general_key, batch_size, hidden_size, is_tuned); } @@ -502,11 +503,12 @@ NormalizationPlanBase* NormalizationPlanRegistry::getNormalizationPlan( NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, const size_t batch_size, const size_t hidden_size, const size_t sm_count, const bool zero_centered_gamma, const bool is_aligned, - const NVTEScalingMode mode, const bool training) { + const NVTEScalingMode mode, const bool training, const bool gamma_in_weight_dtype) { const DType ctype = DType::kFloat32; bool is_tuned = is_aligned && (batch_size % 4 == 0); - auto key = get_key(NormBackend, NormType, NormStage, wtype, itype, otype, ctype, batch_size, - hidden_size, zero_centered_gamma, is_tuned, mode, training); + auto key = + get_key(NormBackend, NormType, NormStage, wtype, itype, otype, ctype, batch_size, hidden_size, + zero_centered_gamma, is_tuned, mode, training, gamma_in_weight_dtype); auto it = normalizationPlanMap.find(key); if (it != normalizationPlanMap.end()) { @@ -578,6 +580,7 @@ void nvte_enable_cudnn_norm_bwd(bool enable) { transformer_engine::normalization::_cudnn_norm_bwd_flag() = enable; } +// Only for testing, not thread-safe void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable) { 
NVTE_API_CALL(nvte_enable_zero_centered_gamma_in_weight_dtype); transformer_engine::normalization::_zero_centered_gamma_in_weight_dtype() = enable; diff --git a/transformer_engine/common/normalization/common.h b/transformer_engine/common/normalization/common.h index 241a3b77b..d1fe6868e 100644 --- a/transformer_engine/common/normalization/common.h +++ b/transformer_engine/common/normalization/common.h @@ -196,7 +196,7 @@ TupleKeyType get_key(NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, DType ctype, uint64_t batch_size, uint64_t hidden_size, bool zero_centered_gamma, bool is_tuned, NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, - bool training = true); + bool training = true, bool gamma_in_weight_dtype = false); template class TeNormalizationRegistry { @@ -350,7 +350,8 @@ class NormalizationPlanRegistry { NVTE_Norm_Backend NormBackend, NVTE_Norm_Type NormType, NVTE_Norm_Stage NormStage, DType wtype, DType itype, DType otype, const size_t batch_size, const size_t hidden_size, const size_t sm_count, const bool zero_centered_gamma, const bool is_aligned, - const NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, const bool training = true); + const NVTEScalingMode mode = NVTE_DELAYED_TENSOR_SCALING, const bool training = true, + const bool gamma_in_weight_dtype = false); private: NormalizationPlanRegistry() {} @@ -471,6 +472,8 @@ void rocm_norm_mxfp8_quantize(LaunchParams &launch_params) } #endif +bool& use_zero_centered_gamma_in_weight_dtype(); + } // namespace normalization } // namespace transformer_engine diff --git a/transformer_engine/common/normalization/layernorm/ln_api.cpp b/transformer_engine/common/normalization/layernorm/ln_api.cpp index f660ca5b7..e3cdfaf45 100644 --- a/transformer_engine/common/normalization/layernorm/ln_api.cpp +++ b/transformer_engine/common/normalization/layernorm/ln_api.cpp @@ -17,6 +17,7 @@ #include "../../common.h" #include "../common.h" +#include "transformer_engine/transformer_engine.h" namespace transformer_engine { @@ -67,12 +68,15 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size #ifndef __HIP_PLATFORM_AMD__ bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp_scaling(z->scaling_mode); + bool gamma_in_weight_dtype = false; if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#else { + norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, beta.data.dptr, mu->data.dptr, rsigma->data.dptr); @@ -88,7 +92,8 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size z->data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training); + multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); @@ -155,12 +160,14 @@ void layernorm_bwd(const Tensor& dz, const Tensor& x, const Tensor& mu, const Te NVTE_Norm_Backend norm_backend; bool is_aligned = true; -#ifndef __HIP_PLATFORM_AMD__ + bool gamma_in_weight_dtype = false; + #ifndef __HIP_PLATFORM_AMD__ if (use_cudnn_norm_bwd()) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#endif 
{ norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, mu.data.dptr, rsigma.data.dptr, @@ -173,7 +180,8 @@ void layernorm_bwd(const Tensor& dz, const Tensor& x, const Tensor& mu, const Te gamma.data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned); + multiprocessorCount, zero_centered_gamma, is_aligned, NVTE_DELAYED_TENSOR_SCALING, true, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); diff --git a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp index eabed2bd5..c783e1550 100644 --- a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp +++ b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp @@ -15,6 +15,7 @@ #include "../../common.h" #include "../common.h" #include "transformer_engine/normalization.h" +#include "transformer_engine/transformer_engine.h" #include "transformer_engine/transpose.h" namespace transformer_engine { @@ -57,12 +58,14 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens bool training = is_delayed_tensor_scaling(z->scaling_mode) || (z->columnwise_data).dptr != nullptr; -#ifndef __HIP_PLATFORM_AMD__ + bool gamma_in_weight_dtype = false; + #ifndef if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); } else -#endif +#endif { norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(z->data.dptr, x.data.dptr, gamma.data.dptr, rsigma->data.dptr); @@ -75,7 +78,8 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens z->data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training); + multiprocessorCount, zero_centered_gamma, is_aligned, z->scaling_mode, training, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); @@ -133,12 +137,14 @@ void rmsnorm_bwd(const Tensor &dz, const Tensor &x, const Tensor &rsigma, const NVTE_Norm_Backend norm_backend; bool is_aligned = true; + bool gamma_in_weight_dtype = false; #ifndef __HIP_PLATFORM_AMD__ if (use_cudnn_norm_bwd()) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; - } else -#endif + gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); + } else +#endif { norm_backend = NVTE_Norm_Backend::Te; is_aligned = is_ptr_aligned(x.data.dptr, gamma.data.dptr, rsigma.data.dptr, dx->data.dptr, @@ -151,7 +157,8 @@ void rmsnorm_bwd(const Tensor &dz, const Tensor &x, const Tensor &rsigma, const gamma.data.dtype, // otype x.data.shape[0], // batch_size x.data.shape[1], // hidden_size - multiprocessorCount, zero_centered_gamma, is_aligned); + multiprocessorCount, zero_centered_gamma, is_aligned, NVTE_DELAYED_TENSOR_SCALING, true, + gamma_in_weight_dtype); if (workspace->data.shape.empty()) { workspace->data.shape = plan->getWorkspaceShape(); From d82f67b32b5068a3d5b9038d6ce101059ec1b220 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 27 May 2025 21:30:09 -0700 Subject: [PATCH 11/26] Fix multi-framework runtime lib loading (#1825) * Fix single FW build with multi FW available Signed-off-by: Kirthi Shankar Sivamani * Some fixes Signed-off-by: Kirthi Shankar 
Sivamani * Fixes Signed-off-by: Kirthi Shankar Sivamani * sug Signed-off-by: Kirthi Shankar Sivamani --------- Signed-off-by: Kirthi Shankar Sivamani --- transformer_engine/common/__init__.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 871723a0e..49395fa23 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -113,9 +113,10 @@ def _get_shared_object_file(library: str) -> Path: # Case 1: Typical user workflow: Both locations are the same, return any result. if te_install_dir == site_packages_dir: - assert ( - so_path_in_install_dir is not None - ), f"Could not find shared object file for Transformer Engine {library} lib." + if so_path_in_install_dir is None: + raise FileNotFoundError( + f"Could not find shared object file for Transformer Engine {library} lib." + ) return so_path_in_install_dir # Case 2: ERR! Both locations are different but returned a valid result. @@ -123,13 +124,12 @@ def _get_shared_object_file(library: str) -> Path: # editable builds. In case developers are executing inside a TE directory via # an inplace build, and then move to a regular build, the local shared object # file will be incorrectly picked up without the following logic. - if so_path_in_install_dir is not None and so_path_in_default_dir is not None: - raise RuntimeError( - f"Found multiple shared object files: {so_path_in_install_dir} and" - f" {so_path_in_default_dir}. Remove local shared objects installed" - f" here {so_path_in_install_dir} or change the working directory to" - "execute from outside TE." - ) + assert so_path_in_install_dir is None or so_path_in_default_dir is None, ( + f"Found multiple shared object files: {so_path_in_install_dir} and" + f" {so_path_in_default_dir}. Remove local shared objects installed" + f" here {so_path_in_install_dir} or change the working directory to" + "execute from outside TE." + ) # Case 3: Typical dev workflow: Editable install if so_path_in_install_dir is not None: @@ -139,7 +139,9 @@ def _get_shared_object_file(library: str) -> Path: if so_path_in_default_dir is not None: return so_path_in_default_dir - raise RuntimeError(f"Could not find shared object file for Transformer Engine {library} lib.") + raise FileNotFoundError( + f"Could not find shared object file for Transformer Engine {library} lib." 
+ ) @functools.lru_cache(maxsize=None) @@ -207,6 +209,7 @@ def load_framework_extension(framework: str): @functools.lru_cache(maxsize=None) def _get_sys_extension(): system = platform.system() + if system == "Linux": extension = "so" elif system == "Darwin": From b1d2539a8ee6603b107aa444e0d2ed7844e26368 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Mon, 6 Oct 2025 12:23:39 -0500 Subject: [PATCH 12/26] Release v2.4_rocm --- transformer_engine/common/normalization/common.cpp | 1 + transformer_engine/common/normalization/layernorm/ln_api.cpp | 4 +++- .../common/normalization/rmsnorm/rmsnorm_api.cpp | 2 +- transformer_engine/pytorch/tensor/float8_tensor.py | 4 ---- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/normalization/common.cpp b/transformer_engine/common/normalization/common.cpp index 029c89e2a..442e11216 100644 --- a/transformer_engine/common/normalization/common.cpp +++ b/transformer_engine/common/normalization/common.cpp @@ -41,6 +41,7 @@ Compute always in FP32 namespace transformer_engine { namespace normalization { +#ifndef __HIP_PLATFORM_AMD__ cudnn_frontend::NormFwdPhase_t get_cudnn_forward_phase(const bool training) { return training ? cudnn_frontend::NormFwdPhase_t::TRAINING : cudnn_frontend::NormFwdPhase_t::INFERENCE; diff --git a/transformer_engine/common/normalization/layernorm/ln_api.cpp b/transformer_engine/common/normalization/layernorm/ln_api.cpp index e3cdfaf45..9b689ec88 100644 --- a/transformer_engine/common/normalization/layernorm/ln_api.cpp +++ b/transformer_engine/common/normalization/layernorm/ln_api.cpp @@ -67,14 +67,16 @@ void layernorm_fwd(const Tensor& x, // BxSxhidden_size bool is_aligned = true; #ifndef __HIP_PLATFORM_AMD__ bool cudnn_backend = use_cudnn_norm_fwd() || is_mxfp_scaling(z->scaling_mode); +#endif bool gamma_in_weight_dtype = false; +#ifndef __HIP_PLATFORM_AMD__ if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; gamma_in_weight_dtype = use_zero_centered_gamma_in_weight_dtype(); } else -#else +#endif { norm_backend = NVTE_Norm_Backend::Te; diff --git a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp index c783e1550..4eb5f7496 100644 --- a/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp +++ b/transformer_engine/common/normalization/rmsnorm/rmsnorm_api.cpp @@ -59,7 +59,7 @@ void rmsnorm_fwd(const Tensor &x, const Tensor &gamma, const float epsilon, Tens is_delayed_tensor_scaling(z->scaling_mode) || (z->columnwise_data).dptr != nullptr; bool gamma_in_weight_dtype = false; - #ifndef +#ifndef __HIP_PLATFORM_AMD__ if (cudnn_backend) { // TODO: add check for GPU ARCH norm_backend = NVTE_Norm_Backend::Cudnn; diff --git a/transformer_engine/pytorch/tensor/float8_tensor.py b/transformer_engine/pytorch/tensor/float8_tensor.py index fa8e29283..f43a6dd28 100644 --- a/transformer_engine/pytorch/tensor/float8_tensor.py +++ b/transformer_engine/pytorch/tensor/float8_tensor.py @@ -6,12 +6,8 @@ """Tensor class with FP8 data""" from __future__ import annotations -<<<<<<< HEAD import os -from typing import Optional, Tuple, Iterable -======= from typing import Optional, Tuple, Iterable, Union ->>>>>>> 6f5af6ae (Enhance recipe compatibility (#1724)) import warnings from torch.utils.cpp_extension import IS_HIP_EXTENSION From 0e1c8fe6d4e30791ed157527f5b5ef7e437115e7 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Tue, 7 Oct 2025 16:07:30 -0500 Subject: [PATCH 13/26] readd HIP data 
generation --- tests/cpp/test_common.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index e11b32689..9f4c9c3cb 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -731,6 +731,14 @@ std::pair getTolerances(const DType type) { template void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { +#ifdef __HIP_PLATFORM_AMD__ + // TODO: Introduce a parallel RNG library (Random123, PCG, rocRAND) + std::uniform_real_distribution<> dis(-2.0, 1.0); + for (int i = 0; i < size; i++) { + data[i] = static_cast(dis(*gen)); + } + gen->discard(size) +#else // Check how many RNG calls are required to generate one uniform random value int rng_calls_per_val = 0; { @@ -760,6 +768,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { } } gen->discard(size * rng_calls_per_val); +#endif } void fillUniform(Tensor *t) { From 758ed7e3159ba29a25fca12157291cd2633a8428 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Wed, 8 Oct 2025 09:50:33 -0500 Subject: [PATCH 14/26] Missing ; in test_common --- tests/cpp/test_common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 9f4c9c3cb..ccc8ae681 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -737,7 +737,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { for (int i = 0; i < size; i++) { data[i] = static_cast(dis(*gen)); } - gen->discard(size) + gen->discard(size); #else // Check how many RNG calls are required to generate one uniform random value int rng_calls_per_val = 0; From d1b8dba9514c3adcc74a8e839df7fb15e46bfda9 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 31 Oct 2025 16:19:28 -0500 Subject: [PATCH 15/26] [CI] Removed Jax jit workaround, replaced with XLA_FLAGS=--xla_gpu_enable_nccl_comm_splitting=false (#346) --- ci/jax.sh | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/ci/jax.sh b/ci/jax.sh index 80c61ce9b..cc080916c 100755 --- a/ci/jax.sh +++ b/ci/jax.sh @@ -66,33 +66,23 @@ run_test_config() { run_test_config_mgpu() { echo ==== Run mGPU with Fused attention backend: $_fus_attn ==== - _JAX_DISABLE_JIT_FLAG=${JAX_DISABLE_JIT:-0} _ver=$(pip show jaxlib | grep Version) case "$_ver" in *0.4.35*) - # Workaround for distributed tests hang with JIT enabled - JAX_DISABLE_JIT=1 run 3 test_distributed_fused_attn.py -k 'not (test_context_parallel_allgather_attn[BALANCED or test_context_parallel_ring_attn)' - _JAX_DISABLE_JIT_FLAG=1 - - # Run tests that fail with JIT disabled - #run_lbl "allgather_balanced" 3 test_distributed_fused_attn.py -k 'test_context_parallel_allgather_attn[BALANCED' - + # Workaround for distributed tests hang with xla_flag + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_ring_attn' + # Test ring attention with xla_flag --xla_experimental_ignore_channel_id only - # TODO: remove this flag after jax/xla update - XLA_FLAGS="--xla_experimental_ignore_channel_id" run_lbl "parallel_ring" 3 test_distributed_fused_attn.py -k test_context_parallel_ring_attn - ;; - *0.6.*) - # Workaround for distributed tests hang with JIT enabled - JAX_DISABLE_JIT=1 run 3 test_distributed_fused_attn.py -k 'not test_context_parallel_allgather_attn[BALANCED' - _JAX_DISABLE_JIT_FLAG=1 + XLA_FLAGS="--xla_experimental_ignore_channel_id" run_lbl "parallel_ring" 3 
test_distributed_fused_attn.py -k test_context_parallel_ring_attn ;; *) - run 3 test_distributed_fused_attn.py + # Workaround for distributed tests hang with xla_flag + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run 3 test_distributed_fused_attn.py ;; esac run_default_fa 3 test_distributed_layernorm.py - JAX_DISABLE_JIT=$_JAX_DISABLE_JIT_FLAG run_default_fa 3 test_distributed_layernorm_mlp.py + XLA_FLAGS="--xla_gpu_enable_nccl_comm_splitting=false" run_default_fa 3 test_distributed_layernorm_mlp.py run_default_fa 3 test_distributed_softmax.py run_default_fa 3 test_sanity_import.py From fa8615df0ab6ae89f36d1577ce5bdffda8253024 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 10 Oct 2025 16:37:32 -0400 Subject: [PATCH 16/26] CI hotfix: IFU test update (#329) --- ci/pytorch.sh | 2 +- tests/pytorch/test_cpu_offloading.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/pytorch.sh b/ci/pytorch.sh index e4f8380f5..207949ee5 100755 --- a/ci/pytorch.sh +++ b/ci/pytorch.sh @@ -58,7 +58,7 @@ run_test_config(){ run_default_fa 1 test_deferred_init.py run_default_fa 1 test_float8tensor.py run_default_fa 1 test_float8_current_scaling_exact.py - run_default_fa 1 test_cpu_offloading.py + test $_fus_attn = auto -o $_fus_attn = ck -o $_fus_attn = aotriton && NVTE_FLASH_ATTN=0 run 1 test_cpu_offloading.py run_default_fa 1 test_fused_rope.py run_default_fa 1 test_fusible_ops.py run_default_fa 3 test_gemm_autotune.py diff --git a/tests/pytorch/test_cpu_offloading.py b/tests/pytorch/test_cpu_offloading.py index ab4b7634b..816df12f6 100644 --- a/tests/pytorch/test_cpu_offloading.py +++ b/tests/pytorch/test_cpu_offloading.py @@ -29,7 +29,7 @@ # Flash attention saves some internal tensor for the backward pass # that cannot be offloaded to CPU. -assert os.getenv("NVTE_FLASH_ATTN") == "0" +assert os.getenv("NVTE_FLASH_ATTN", "1") == "0" # Offloading is supported for attention only for fused and flash attention backends, # so the use of bfloat16 is required. From 08bf8fc836eda32b7064dad2ec2ed7fe89b28b58 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Sat, 18 Oct 2025 23:20:02 -0400 Subject: [PATCH 17/26] Fix and add MXFP8 GEMM test failures (#326) * Fix MXFP8 GEMM test * Fix uninitialized var in GEMM code * Add Dequantize+GEMM test to check MXFP8 scaling tensor layout --- ci/core.sh | 4 +- tests/cpp/CMakeLists.txt | 3 +- tests/cpp/operator/test_cublaslt_gemm.cu | 375 ++++++++++++-------- tests/cpp/test_common.cu | 3 + tests/pytorch/test_gemm_autotune.py | 2 +- transformer_engine/common/gemm/rocm_gemm.cu | 2 +- 6 files changed, 239 insertions(+), 150 deletions(-) diff --git a/ci/core.sh b/ci/core.sh index 0953d7bde..35b4000e9 100755 --- a/ci/core.sh +++ b/ci/core.sh @@ -31,14 +31,14 @@ fi check_test_filter "nongemm" if [ $? -eq 0 ]; then echo ===== Run non GEMM tests ===== - ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "OperatorTest/GEMMTestSuite" + ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite" test $? -eq 0 || test_run_error "non-GEMM" fi check_test_filter "gemm" if [ $? -eq 0 ]; then echo ===== Run GEMM tests ===== - ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "OperatorTest/GEMMTestSuite" + ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite" test $? 
-eq 0 || test_run_error "GEMM" fi diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index da8a37ba8..4ab5fd237 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -64,8 +64,7 @@ else() project(transformer_engine_tests LANGUAGES HIP CXX) # Ask hcc to generate device code during compilation so we can use # host linker to link. - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fno-gpu-rdc -Wno-defaulted-function-deleted") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_FLAGS}") + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -fno-gpu-rdc -Wno-defaulted-function-deleted -Wno-unused-result") endif() add_subdirectory(../../3rdparty/googletest ${PROJECT_BINARY_DIR}/googletest) diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu index 7d0597ef7..b731cc701 100644 --- a/tests/cpp/operator/test_cublaslt_gemm.cu +++ b/tests/cpp/operator/test_cublaslt_gemm.cu @@ -3,17 +3,15 @@ * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/ +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "../test_common.h" using namespace transformer_engine; @@ -30,29 +28,17 @@ std::vector> test_case_sizes = { {29, 29, 17389}, //primes }; +std::vector> test_case_sizes_mxfp8 = { + {2304, 768, 4096}, +}; + // A, B, Bias, Gelu, D // Bias type choose as bf16 in use_fp8, D_type otherwise // Gelu type the same as Bias_Type -// {DType::kFloat32, DType::kFloat32, DType::kFloat32, DType::kFloat32, DType::kFloat32}, -// {DType::kFloat16, DType::kFloat16, DType::kFloat16, DType::kFloat16, DType::kFloat16}, -// {DType::kBFloat16, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E4M3, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E4M3, DType::kFloat8E5M2, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat32}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat16}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kBFloat16}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E4M3}, -// {DType::kFloat8E5M2, DType::kFloat8E4M3, DType::kBFloat16, DType::kBFloat16, DType::kFloat8E5M2}, -} // namespace +using fp32=float; +using fp8=fp8e4m3; +using bf8=fp8e5m2; using Layout = std::pair;// {transa, transb} static const Layout kNN{false,false}; @@ -61,10 
+47,9 @@ static const Layout kNT{false,true }; static const std::vector kLayouts = { kNN, kTN, kNT }; -// , -class GEMMTestSuite - : public ::testing::TestWithParam< - std::tuple, bool, bool, Layout, NVTEScalingMode>> {}; +using TShape = std::vector; +} // namespace + float ref_gelu(float x){ float cdf = 0.5f * (1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x)))); @@ -81,12 +66,14 @@ void compute_ref( const float d_scale, size_t m, size_t k, size_t n, D_Type* ref_d_data, - float* ref_d_amax, + float* ref_d_amax_ptr, Gelu_Type* ref_gelu_data, bool transa, bool transb){ - *ref_d_amax = 0; + float ref_d_amax = 0; + + #pragma omp parallel for schedule(static) collapse(2) reduction(max: ref_d_amax) proc_bind(spread) for(size_t ii = 0; ii < m; ii++){ for(size_t jj = 0; jj < n; jj++){ float val = 0; @@ -106,41 +93,45 @@ void compute_ref( // update ref_d_amax if in fp8 DType dtype = TypeInfo::dtype; if(isFp8Type(dtype)){ - *ref_d_amax = std::max(*ref_d_amax, std::fabs(val)); + ref_d_amax = std::max(ref_d_amax, std::fabs(val)); } } } + if (ref_d_amax_ptr) + { + *ref_d_amax_ptr = ref_d_amax; + } } template void compute_mxfp8_ref( const A_Type* a_data, const B_Type* b_data, - const NVTEShape& a_scale_inv_shape, const fp8e8m0* a_scale_inv_data, - const NVTEShape& b_scale_inv_shape, const fp8e8m0* b_scale_inv_data, const Bias_Type* bias_data, //bias is of dim m const float d_scale, size_t m, size_t k, size_t n, D_Type* ref_d_data, - float* ref_d_amax, + float* ref_d_amax_ptr, Gelu_Type* ref_gelu_data, bool transa, bool transb){ - *ref_d_amax = 0; + float ref_d_amax = 0; + + #pragma omp parallel for schedule(static) collapse(2) reduction(max: ref_d_amax) proc_bind(spread) for(size_t ii = 0; ii < m; ii++){ for(size_t jj = 0; jj < n; jj++){ float val = 0; for(size_t kk = 0; kk < k; kk++){ - float a_val = a_data[ii*k + kk]; - float b_val = b_data[kk + jj*k]; - float a_scale_inv_val = - (float)std::pow(2, a_scale_inv_data[ii * a_scale_inv_shape.data[1] + kk / 32] - 127); - float b_scale_inv_val = - (float)std::pow(2, b_scale_inv_data[kk / 32 + jj * b_scale_inv_shape.data[1]] - 127); - val += a_scale_inv_val * a_val * b_scale_inv_val * b_val; + size_t a_idx = transa ? (ii*k + kk) : (kk*m + ii); + size_t b_idx = transb ? (kk*n + jj) : (jj*k + kk); + float a_scale_inv_val = (float)std::pow(2, + a_scale_inv_data[transa ? a_idx/32 : (kk/32 * m + ii)] - 127); + float b_scale_inv_val = (float)std::pow(2, + b_scale_inv_data[transb ? (kk/32 * n + jj) : b_idx/32] - 127); + val += a_scale_inv_val * (float)a_data[a_idx] * b_scale_inv_val * (float)b_data[b_idx]; } if(bias_data){ val += (float)bias_data[ii]; @@ -153,10 +144,14 @@ void compute_mxfp8_ref( // update ref_d_amax if in fp8 DType dtype = TypeInfo::dtype; if(isFp8Type(dtype)){ - *ref_d_amax = std::max(*ref_d_amax, std::fabs(val)); + ref_d_amax = std::max(ref_d_amax, std::fabs(val)); } } } + if (ref_d_amax_ptr) + { + *ref_d_amax_ptr = ref_d_amax; + } } template @@ -172,6 +167,36 @@ void cpu_rowwise_to_columnwise( } } +std::pair getTestTolerances(const DType type, bool use_fp8, bool use_mxfp8) { + auto [atol, rtol] = getTolerances(type); + + //relax for certain prime number gemm + if (type == DType::kFloat32) { + atol = 1e-5; + } + // relax for certain FP8 gemm with hipblaslt + if (use_mxfp8) { + atol = 5e-4; + /*During hipifying std::max is converted to ::max + to w/a HIP bug with using std:: in device functions. 
+ W/o explicitlit , compiler uses non-templated int method variant from HIP headers + TODO: remove when switch to new hipify version after fixing HIP bug */ + rtol = std::max(rtol, 1e-3); + } + else if (use_fp8) { + atol = 1e-3; + //TODO: remove (see comment above) + rtol = std::max(rtol, 5e-3); + } + else if (type == DType::kBFloat16) { + //relax for certain prime number TN gemm + rtol = 5e-2; + } + else if (type == DType::kFloat32) { + rtol = 1e-5; + } + return {atol, rtol}; +} struct TestParams { size_t m; @@ -258,8 +283,13 @@ void performTest(const TestParams& params) { if (params.use_gelu && dtype == DType::kBFloat16) { GTEST_SKIP() << "BF16 GEMM with GELU is not supported in current config"; } - if (has_fp8 && params.use_bias && dtype == DType::kFloat32) { - GTEST_SKIP() << "FP8 GEMM with bias and FP32 output is not supported in current config"; + if constexpr ((std::is_same::value || std::is_same::value) && + std::is_same::value) + { + //GEMM with bias and fp32 output is not supported with bf8 A/B + if (params.use_bias) { + GTEST_SKIP() << "FP8 GEMM with bias is not supported in current config"; + } } } if (prop.major == 9 && prop.minor == 4) //gfx942 specific hipblasLt limitations @@ -273,49 +303,39 @@ void performTest(const TestParams& params) { } #endif - // pytorch tensor storage is row-major while cublas/hipblaslt is column-major - Tensor A; - if (params.transa){ - A = Tensor("A", std::vector{ params.m, params.k }, atype, true, false, params.scaling_mode); - }else { - // hipblaslt path need fp8-gemm with TN layout - A = Tensor("A", std::vector{ params.k, params.m }, atype, true, isFp8Type(atype), params.scaling_mode); - } - Tensor B; - if (params.transb){ - //hipblaslt path need fp8-gemm with TN layout - B = Tensor("B", std::vector{ params.k, params.n }, btype, true, isFp8Type(btype), params.scaling_mode); - }else { - B = Tensor("B", std::vector{ params.n, params.k }, btype, true, false, params.scaling_mode); - } - Tensor D("D", std::vector{ params.n, params.m }, dtype); + // FP8 GEMM path needs columnwise data for A/B tensor with non TN layout + const bool a_colwise = !params.transa && isFp8Type(atype); + const bool b_colwise = params.transb && isFp8Type(btype); + Tensor A("A", params.transa ? TShape{ params.m, params.k } : TShape{ params.k, params.m }, + atype, (!a_colwise || !use_mxfp8), a_colwise, params.scaling_mode); + Tensor B("B", params.transb ? 
TShape{ params.k, params.n } : TShape{ params.n, params.k }, + btype, (!b_colwise || !use_mxfp8), b_colwise, params.scaling_mode); + + Tensor D("D", TShape{ params.n, params.m }, dtype); Tensor bias; if(params.use_bias){ - bias = Tensor("bias", std::vector{params.m}, bias_type); + bias = Tensor("bias", TShape{params.m}, bias_type); } Tensor pre_gelu_out; if(params.use_gelu){ - pre_gelu_out = Tensor("pre_gelu_out", std::vector{ params.n, params.m }, gelu_type); + pre_gelu_out = Tensor("pre_gelu_out", TShape{ params.n, params.m }, gelu_type); } //initialize the data and scale inv of A, B + //fillUniform does not initialize columnwise data if rowwise data exist fillUniform(&A); - if (isFp8Type(atype) && !params.transa && !use_mxfp8) { + if (a_colwise && !use_mxfp8) { // A must be of shape k, m - cpu_rowwise_to_columnwise( - params.k, params.m, - A.rowwise_cpu_dptr(), - A.columnwise_cpu_dptr()); + cpu_rowwise_to_columnwise(params.k, params.m, + A.rowwise_cpu_dptr(), A.columnwise_cpu_dptr()); // sync the columnwise data on GPU as well A.from_cpu(); } fillUniform(&B); - if (isFp8Type(btype) && params.transb && !use_mxfp8) { - // B must be of shape k, m - cpu_rowwise_to_columnwise( - params.k, params.n, - B.rowwise_cpu_dptr(), - B.columnwise_cpu_dptr()); + if (b_colwise && !use_mxfp8) { + // B must be of shape k, n + cpu_rowwise_to_columnwise(params.k, params.n, + B.rowwise_cpu_dptr(), B.columnwise_cpu_dptr()); // sync the columnwise data on GPU as well B.from_cpu(); } @@ -335,7 +355,7 @@ void performTest(const TestParams& params) { workspace_size = 67108864; } #endif - Tensor Workspace("Workspace", std::vector{ workspace_size }, DType::kByte); + Tensor Workspace("Workspace", TShape{ workspace_size }, DType::kByte); //perform the gemm in GPU nvte_cublas_gemm(A.data(), @@ -370,28 +390,23 @@ void performTest(const TestParams& params) { const A_Type *a_data; const B_Type *b_data; const fp8e8m0 *a_scale_inv_data, *b_scale_inv_data; - NVTEShape a_scale_inv_shape, b_scale_inv_shape; if (params.transa) { a_data = A.rowwise_cpu_dptr(); a_scale_inv_data = A.rowwise_cpu_scale_inv_ptr(); - a_scale_inv_shape = A.rowwise_scale_inv_shape(); } else { a_data = A.columnwise_cpu_dptr(); a_scale_inv_data = A.columnwise_cpu_scale_inv_ptr(); - a_scale_inv_shape = A.columnwise_scale_inv_shape(); } if (params.transb) { b_data = B.columnwise_cpu_dptr(); b_scale_inv_data = B.columnwise_cpu_scale_inv_ptr(); - b_scale_inv_shape = B.columnwise_scale_inv_shape(); } else { b_data = B.rowwise_cpu_dptr(); b_scale_inv_data = B.rowwise_cpu_scale_inv_ptr(); - b_scale_inv_shape = B.rowwise_scale_inv_shape(); } compute_mxfp8_ref( - a_data, b_data, a_scale_inv_shape, a_scale_inv_data, b_scale_inv_shape, b_scale_inv_data, + a_data, b_data, a_scale_inv_data, b_scale_inv_data, params.use_bias ? bias.rowwise_cpu_dptr() : nullptr, D.scale(), params.m, params.k, params.n, ref_D.get(), &ref_amax_d, params.use_gelu ? ref_pre_gelu_out.get() : nullptr, @@ -416,49 +431,91 @@ void performTest(const TestParams& params) { compareResults("D_amax", D.amax(), ref_amax_d, atol_amax, rtol_amax); } - auto [atol, rtol] = getTolerances(dtype); - //relax for certain prime number gemm - if (dtype == DType::kFloat32) { - atol = 1e-5; - } -#ifdef __HIP_PLATFORM_AMD__ - // relax for certain FP8 gemm with hipblaslt - if (use_mxfp8) { - atol = 5e-4; - /*During hipifying std::max is converted to ::max - to w/a HIP bug with using std:: in device functions. 
- W/o explicitlit , compiler uses non-templated int method variant from HIP headers - TODO: remove when switch to new hipify version after fixing HIP bug */ - rtol = std::max(rtol, 1e-3); - } - else if (has_fp8) { - atol = 1e-3; - //TODO: remove (see comment above) - rtol = std::max(rtol, 5e-3); - } - else if (dtype == DType::kBFloat16) { - //relax for certain prime number TN gemm - rtol = 5e-2; - } - else if (dtype == DType::kFloat32) { - rtol = 1e-5; - } -#endif + auto [atol, rtol] = getTestTolerances(dtype, has_fp8, use_mxfp8); compareResults("D", D, ref_D.get(), true, atol, rtol); if(params.use_gelu){ - auto [atol, rtol] = getTolerances(gelu_type); - //relax for certain prime number gemm - if (dtype == DType::kFloat32) { - atol = 1e-5; - } + auto [atol, rtol] = getTestTolerances(gelu_type, false, false); compareResults("gelu", pre_gelu_out, ref_pre_gelu_out.get(), true, atol, rtol); } } -using fp32=float; -using fp8=fp8e4m3; -using bf8=fp8e5m2; +#ifdef __HIP_PLATFORM_AMD__ +template +void performDqTest(const TestParams ¶ms) { + DType atype = TypeInfo::dtype; + DType btype = TypeInfo::dtype; + DType dtype = TypeInfo::dtype; + + GTEST_ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input datatype is expected"; + GTEST_ASSERT_FALSE(isFp8Type(dtype)) << "Non FP8/BF8 output datatype is expected"; + + if (params.m % 32 != 0 || params.n % 32 != 0 || params.k % 32 != 0) { + GTEST_SKIP() << "MXFP8 requires M, N, K to be multiples of 32"; + } + + cudaDeviceProp prop; + (void)cudaGetDeviceProperties(&prop, 0); + + bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5); + if (!mxfp8_supported) { + GTEST_SKIP() << "MXFP8 is not supported in current config"; + } + + DType ref_type = dtype; + TShape a_shape = params.transa ? TShape{params.m, params.k} : TShape{params.k, params.m}; + TShape b_shape = params.transb ? 
TShape{params.k, params.n} : TShape{params.n, params.k}; + + Tensor A_src("A", a_shape, ref_type); + Tensor B_src("B", b_shape, ref_type); + //initialize A, B + fillUniform(&A_src); + fillUniform(&B_src); + + // FP8 GEMM path needs columnwise data for A/B tensor with non TN layout + Tensor A_fp8("A_fp8", a_shape, atype, params.transa, !params.transa, + NVTEScalingMode::NVTE_MXFP8_1D_SCALING); + Tensor B_fp8("B_fp8", b_shape, btype, !params.transb, params.transb, + NVTEScalingMode::NVTE_MXFP8_1D_SCALING); + nvte_quantize(A_src.data(), A_fp8.data(), 0); + nvte_quantize(B_src.data(), B_fp8.data(), 0); + + Tensor A_ref("A_ref", a_shape, ref_type); + Tensor B_ref("B_ref", b_shape, ref_type); + nvte_dequantize(A_fp8.data(), A_ref.data(), 0); + nvte_dequantize(B_fp8.data(), B_ref.data(), 0); + + Tensor bias; + Tensor pre_gelu_out; + + size_t workspace_size = 67108864; + Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte); + + //perform FP8 gemm and copy the output results from GPU memory to CPU memory + Tensor D("D", TShape{params.n, params.m}, dtype); + nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(), bias.data(), pre_gelu_out.data(), + params.transa, params.transb, false, Workspace.data(), false, false, + prop.multiProcessorCount, 0); + D.to_cpu(); + + + //perform non-FP8 gemm and copy the output results from GPU memory to CPU memory + Tensor D_ref("D", TShape{params.n, params.m}, dtype); + nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(), bias.data(), pre_gelu_out.data(), + params.transa, params.transb, false, Workspace.data(), false, false, + prop.multiProcessorCount, 0); + D_ref.to_cpu(); + + // check if error message happens in running + (void)cudaDeviceSynchronize(); + auto err = cudaGetLastError(); + ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); + + //compare results + auto [atol, rtol] = getTestTolerances(dtype, true, true); + compareResults("D", D, D_ref.rowwise_cpu_dptr(), true, atol, rtol); +} +#endif // __HIP_PLATFORM_AMD__ #define MAKE_TEST_PARAMS(P_) \ TestParams P_ = {.m = std::get<0>(std::get<0>(GetParam())), \ @@ -472,10 +529,13 @@ using bf8=fp8e5m2; ? NVTEScalingMode::NVTE_MXFP8_1D_SCALING \ : NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING} +// , use_bias, use_gelu, Layout, fp8_scalinig +class GEMMTestSuite + : public ::testing::TestWithParam< + std::tuple, bool, bool, Layout, NVTEScalingMode>> {}; + #define MAKE_GEMM_TEST(NAME_, A_, B_, BIAS_, GELU_, D_) \ TEST_P(GEMMTestSuite, NAME_) { \ - using namespace transformer_engine; \ - using namespace test; \ MAKE_TEST_PARAMS(test_params); \ using A_Type = A_; \ using B_Type = B_; \ @@ -523,24 +583,51 @@ MAKE_GEMM_TEST(Testbf8xfp8xbf16xbf16xbf8, bf8, fp8, bf16, bf16, bf8); MAKE_GEMM_TEST(Testfp8xfp8xfp16xfp16xfp8, fp8, fp8, fp16, fp16, fp8); -INSTANTIATE_TEST_SUITE_P( - OperatorTest, - GEMMTestSuite, - ::testing::Combine( - ::testing::ValuesIn(test_case_sizes), - ::testing::Values(false, true), //use bias - ::testing::Values(false, true), //use_gelu - ::testing::ValuesIn(kLayouts), //transa,transb - ::testing::Values(false, true)), //use mxfp8 - [](const testing::TestParamInfo& info) { - auto TN = [](bool v){ return v ? 
"T" : "N"; }; - const auto layout = std::get<3>(info.param); - std::string name = std::to_string(std::get<0>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<1>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<2>(std::get<0>(info.param))) + "X" + - std::to_string(std::get<1>(info.param)) + "X" + - std::to_string(std::get<2>(info.param)) + "X" + - TN(layout.first) + TN(layout.second) + "X" + - (std::get<4>(info.param) ? "M" : "S"); - return name; - }); +static inline auto TN(const Layout& layout) { + static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}}; + return std::string(map[layout.first][layout.second]); +} + +static inline auto MKN(const std::tuple& shape) { + return std::to_string(std::get<0>(shape)) + "x" + std::to_string(std::get<1>(shape)) + "x" + + std::to_string(std::get<2>(shape)); +} + +INSTANTIATE_TEST_SUITE_P(OperatorTest, GEMMTestSuite, + ::testing::Combine(::testing::ValuesIn(test_case_sizes), + ::testing::Values(false, true), //use bias + ::testing::Values(false, true), //use_gelu + ::testing::ValuesIn(kLayouts), //transa,transb + ::testing::Values(false, true)), //use mxfp8 + [](const testing::TestParamInfo& info) { + return MKN(std::get<0>(info.param)) + "x" + + std::to_string(std::get<1>(info.param)) + "x" + + std::to_string(std::get<2>(info.param)) + "x" + + TN(std::get<3>(info.param)) + "x" + + (std::get<4>(info.param) ? "M" : "S"); + }); + +#ifdef __HIP_PLATFORM_AMD__ +class DqGEMMTestSuite: public GEMMTestSuite {}; + +#define MAKE_DQ_GEMM_TEST(NAME_, A_, B_, D_) \ + TEST_P(DqGEMMTestSuite, NAME_) { \ + MAKE_TEST_PARAMS(test_params); \ + using A_Type = A_; \ + using B_Type = B_; \ + using D_Type = D_; \ + performDqTest(test_params); \ + } + +MAKE_DQ_GEMM_TEST(Testfp8xfp8xfp16, fp8, fp8, fp16) + +INSTANTIATE_TEST_SUITE_P(OperatorTest, DqGEMMTestSuite, + ::testing::Combine(::testing::ValuesIn(test_case_sizes_mxfp8), + ::testing::Values(false), // bias - unused + ::testing::Values(false), // gelu - unused + ::testing::ValuesIn(kLayouts), //transa,transb + ::testing::Values(true)), //use mxfp8 + [](const testing::TestParamInfo& info) { + return MKN(std::get<0>(info.param)) + "x" + TN(std::get<3>(info.param)); + }); +#endif // __HIP_PLATFORM_AMD__ diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index ccc8ae681..d37900a1f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -454,6 +454,9 @@ void Tensor::set_scale_inv(float scale_inv) { columnwise_cpu_scale_inv_ptr()[0] = scale_inv; } else { std::uniform_int_distribution dis(0, 127); + if (rowwise_) { + from_cpu(); //Need it because scale_inv_ptr getting does to_cpu() + } auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr(); for (size_t i = 0; i < num_scales; i++) { scale_inv_ptr[i] = dis(gen_); diff --git a/tests/pytorch/test_gemm_autotune.py b/tests/pytorch/test_gemm_autotune.py index 562581364..1b54e8464 100644 --- a/tests/pytorch/test_gemm_autotune.py +++ b/tests/pytorch/test_gemm_autotune.py @@ -34,7 +34,7 @@ def analyse_storage(fname): next(reader) head = reader.fieldnames assert ("m" in head and "algo_id" in head and "ws_min" in head and "ws_max" in head - and "aidx" in head), "Invalid CSV format" + ), "Invalid CSV format" return head def read_storage(fname): diff --git a/transformer_engine/common/gemm/rocm_gemm.cu b/transformer_engine/common/gemm/rocm_gemm.cu index dcba674e4..574e8ab7e 100644 --- a/transformer_engine/common/gemm/rocm_gemm.cu +++ b/transformer_engine/common/gemm/rocm_gemm.cu @@ -1089,7 +1089,7 @@ void hipblaslt_gemm(const Tensor 
*inputA, // Note: gelu fusion is available for certain config from rocm 7.0 // amax(D) either (next op is high precision). #if HIPBLASLT_VERSION_MAJOR > 0 || HIPBLASLT_VERSION_MINOR >= 15 - hipblasLtMatmulMatrixScale_t scaling_mode; + hipblasLtMatmulMatrixScale_t scaling_mode = (hipblasLtMatmulMatrixScale_t)0; #else constexpr int scaling_mode = 0; #endif From c6a2c65c2f6a99f1c61f9b7541a1fad933e01549 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:05:35 -0400 Subject: [PATCH 18/26] Fix FFI import. Add distributed tests hang workaround (#347) --- build_tools/jax.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build_tools/jax.py b/build_tools/jax.py index ae8e696c8..4e587b965 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -21,7 +21,12 @@ def xla_path() -> str: Throws FileNotFoundError if XLA source is not found.""" try: - from jax.extend import ffi + import jax + from packaging import version + if version.parse(jax.__version__) >= version.parse("0.5.0"): + from jax import ffi + else: + from jax.extend import ffi except ImportError: if os.getenv("XLA_HOME"): xla_home = Path(os.getenv("XLA_HOME")) From 499d2d86eda09def6b977ef92fddd820bf29a6b1 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:30:36 -0400 Subject: [PATCH 19/26] Make TE ROCm wheels building image directly from manylinix image (#340) * Build ROCm wheels directly from manylinix image * Fix build on top of the latest Manylinix image * Fix build after switching to AITER --- .../wheel_utils/Dockerfile.rocm.manylinux.x86 | 34 ++++++++++++------- build_tools/wheel_utils/build_wheels.sh | 20 +++++++---- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 b/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 index 2b78544df..cf5dbb3bc 100644 --- a/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 +++ b/build_tools/wheel_utils/Dockerfile.rocm.manylinux.x86 @@ -2,22 +2,32 @@ # # See LICENSE for license information. -# This Dockerfile is used to build TransformerEngine wheels for ROCm on x86_64 architecture. -# It is based on the manylinux_2_28_x86_64 based image with ROCm installed. -ARG BASE_IMAGE=quay.io/pypa/manylinux_2_28_x86_64:non_existent_rocm_tag +# This Dockerfile is used to build TransformerEngine wheels for ROCm on x86_64 architecture +# on top of the manylinux_2_28_x86_64 base image. + +# Build args: +# BASE_IMAGE - Base manylinux image to use. Default: quay.io/pypa/manylinux_2_28_x86_64 +# ROCM_REPO_URL - ROCm repository URL. Default: https://repo.radeon.com/rocm/rhel8/latest/main/ +# GPU_TARGETS - Semicolon separated list of target GPU architectures. Default: "gfx942;gfx950" +# TARGET_BRANCH - Target branch for TransformerEngine. Default: none (use git default) +# GPU_TARGETS and TARGET_BRANCH can be overriden when start a container with NVTE_ROCM_ARCH and TARGET_BRANCH environment variables. 
+ +# Set base image +ARG BASE_IMAGE=quay.io/pypa/manylinux_2_28_x86_64 FROM $BASE_IMAGE -# Setup the build_system repo -RUN echo -e "[build_system]\nname=ROCm\nbaseurl=https://repo.almalinux.org/build_system/8/x86_64/\nenabled=1\ngpgcheck=0" >/etc/yum.repos.d/build_system.repo +ARG ROCM_REPO_URL=https://repo.radeon.com/rocm/rhel8/latest/main/ -# Add and enable repos -RUN dnf update -y || true -RUN dnf install -y epel-release elrepo-release -RUN dnf config-manager --set-enabled build_system powertools extras epel elrepo +# Set up ROCm repo +RUN echo -e "[rocm]\nname=ROCm\nbaseurl=${ROCM_REPO_URL}\nenabled=1\ngpgcheck=0" > /etc/yum.repos.d/rocm.repo + +# Setup packages +RUN dnf install -y --disablerepo=epel rocm-dev hipblaslt hipblaslt-devel hipcub hipcub-devel +RUN dnf group install -y "Development Tools" && dnf install -y git cmake llvm-toolset gcc-toolset-12 + +#Uncomment the next line for ROCm 6.4 cmake workaround: remove newer incomnpatible cmake preinstalled on base image +#RUN rm /usr/local/bin/cmake || true -# Setup dev packages -RUN dnf group install -y "Development Tools" && \ - dnf install -y git cmake llvm-toolset hipblaslt hipblaslt-devel gcc-toolset-12 RUN dnf clean all RUN rm -rf /var/cache/dnf/* diff --git a/build_tools/wheel_utils/build_wheels.sh b/build_tools/wheel_utils/build_wheels.sh index 5320b8a39..5d37ae1d9 100644 --- a/build_tools/wheel_utils/build_wheels.sh +++ b/build_tools/wheel_utils/build_wheels.sh @@ -30,11 +30,17 @@ fi ROCM_BUILD=`${PYBINDIR}python -c "import build_tools.utils as u; print(int(u.rocm_build()))"` +if [ "$LOCAL_TREE_BUILD" != "1" ]; then + if [ "$ROCM_BUILD" = "1" ]; then + git pull + fi + git checkout $TARGET_BRANCH + git submodule update --init --recursive +fi + if [ "$ROCM_BUILD" = "1" ]; then - git pull + ${PYBINDIR}pip install setuptools wheel fi -git checkout $TARGET_BRANCH -git submodule update --init --recursive if $BUILD_METAPACKAGE ; then cd /TransformerEngine @@ -50,10 +56,10 @@ if $BUILD_COMMON ; then WHL_BASE="transformer_engine-${VERSION}" if [ "$ROCM_BUILD" = "1" ]; then TE_CUDA_VERS="rocm" - ${PYBINDIR}pip install ninja dataclasses - if [ -n "$PYBINDIR" ]; then - PATH="$PYBINDIR:$PATH" #hipify expects python in PATH - fi + #dataclasses, psutil are needed for AITER + ${PYBINDIR}pip install ninja dataclasses psutil + #hipify expects python in PATH, also ninja may be installed to python bindir + test -n "$PYBINDIR" && PATH="$PYBINDIR:$PATH" || true else TE_CUDA_VERS="cu12" PYBINDIR=/opt/python/cp38-cp38/bin/ From 235b9b6525eb57ab6b1b8f34d1c36b05c0746de5 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar <125210283+VeeraRajasekhar@users.noreply.github.com> Date: Fri, 31 Oct 2025 10:42:03 -0500 Subject: [PATCH 20/26] [CI] Hotfix test_gemm_autotune update (#353) --- transformer_engine/common/gemm/rocm_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/rocm_gemm.cu b/transformer_engine/common/gemm/rocm_gemm.cu index 574e8ab7e..9de4cfad7 100644 --- a/transformer_engine/common/gemm/rocm_gemm.cu +++ b/transformer_engine/common/gemm/rocm_gemm.cu @@ -750,8 +750,8 @@ protected: std::getline(is, type_b, csv_sep); std::getline(is, type_d, csv_sep); std::getline(is, bias_type, csv_sep); - is >> cfg.lda >> c >> cfg.ldb >> c >> cfg.ldd >> c >> cfg.scaling_mode >> c; std::getline(is, aux_type, csv_sep); + is >> cfg.lda >> c >> cfg.ldb >> c >> cfg.ldd >> c >> cfg.scaling_mode >> c; std::getline(is, epi, csv_sep); std::getline(is, comp, csv_sep); std::getline(is, scale, csv_sep); From 
bcae45934fd5f2133801d5c3094e5001d7a41131 Mon Sep 17 00:00:00 2001 From: alextmagro Date: Fri, 31 Oct 2025 10:43:35 -0500 Subject: [PATCH 21/26] MXFP8 test scale off by 1 fix (#338) * MXFP8 test scale off by 1 fix --- tests/cpp/operator/test_cast_mxfp8.cu | 47 ++++++++++++- .../operator/test_cast_mxfp8_gated_swiglu.cu | 44 +++++++++++- tests/cpp/test_common.cu | 68 +++++++++++++++++++ tests/cpp/test_common.h | 9 +++ 4 files changed, 163 insertions(+), 5 deletions(-) diff --git a/tests/cpp/operator/test_cast_mxfp8.cu b/tests/cpp/operator/test_cast_mxfp8.cu index 33a9b8629..6855c9487 100644 --- a/tests/cpp/operator/test_cast_mxfp8.cu +++ b/tests/cpp/operator/test_cast_mxfp8.cu @@ -76,12 +76,12 @@ void scale_block(const ProcessingMethod processing_method, continue; } amax = std::max(amax, std::abs(elt)); -#else +#else // #ifdef __HIP_PLATFORM_AMD__ if (std::isinf(elt) || std::isnan(elt)) { continue; } amax = fmaxf(amax, fabsf(elt)); -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ } } @@ -312,6 +312,23 @@ void performTest_x1(const ProcessingMethod processing_method, block_size_cols, scales_stride); + +#ifdef __HIP_PLATFORM_AMD__ + if (processing_method != ProcessingMethod::CAST_ONLY) { + std::vector> mismatch_idx; + compare_e8m0_scaling_factors("scales", output_c, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, rowwise, mismatch_idx); + + if (mismatch_idx.size()) { + adjust_ref(mismatch_idx, ref_output_c.get(), unpadded_blocks_Y, unpadded_blocks_X, rows, cols, otype); + } + + auto [atol, rtol] = getTolerances(otype); + compareResults("output_c", output_c, ref_output_c.get(), rowwise, atol, rtol); + } + else +#endif // #ifdef __HIP_PLATFORM_AMD__ + { auto [atol, rtol] = getTolerances(otype); compareResults("output_c", output_c, ref_output_c.get(), rowwise, atol, rtol); @@ -321,6 +338,7 @@ void performTest_x1(const ProcessingMethod processing_method, compare_e8m0_scaling_factors("scales", gpu_scales_ptr, ref_output_scales.get(), unpadded_blocks_Y, unpadded_blocks_X, scales_stride); + } if (processing_method == ProcessingMethod::CAST_DBIAS || processing_method == ProcessingMethod::CAST_DBIAS_DACT) { auto [atol_dbias, rtol_dbias] = getTolerances(itype); @@ -454,7 +472,29 @@ void performTest_x2(const ProcessingMethod processing_method, block_size_cols, scales_stride_rowwise, scales_stride_colwise); +#ifdef __HIP_PLATFORM_AMD__ + if (processing_method != ProcessingMethod::CAST_ONLY) { + std::vector> mismatch_idx_r; + compare_e8m0_scaling_factors("scales_rowwise", output, ref_scales_rowwise.get(), + unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, scales_stride_rowwise, 0.01, true, mismatch_idx_r); + + if (mismatch_idx_r.size()) { + adjust_ref(mismatch_idx_r, ref_output_c_rowwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, rows, cols, otype); + } + std::vector> mismatch_idx_c; + compare_e8m0_scaling_factors("scales_colwise", output, ref_scales_colwise.get(), + unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise, 0.01, false, mismatch_idx_c); + + if (mismatch_idx_c.size()) { + adjust_ref(mismatch_idx_c, ref_output_c_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, rows, cols, otype); + } + auto [atol, rtol] = getTolerances(otype); + compareResults("output_c_rowwise", output, ref_output_c_rowwise.get(), true, atol, rtol); + compareResults("output_c_colwise", output, ref_output_c_colwise.get(), false, atol, rtol); + } else +#endif // #ifdef __HIP_PLATFORM_AMD__ + { auto [atol, rtol] = 
getTolerances(otype); compareResults("output_c_rowwise", output, ref_output_c_rowwise.get(), true, atol, rtol); compareResults("output_c_colwise", output, ref_output_c_colwise.get(), false, atol, rtol); @@ -464,6 +504,7 @@ void performTest_x2(const ProcessingMethod processing_method, compare_e8m0_scaling_factors("scales_colwise", output.columnwise_cpu_scale_inv_ptr(), ref_scales_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise); + } if (processing_method == ProcessingMethod::CAST_DBIAS || processing_method == ProcessingMethod::CAST_DBIAS_DACT) { auto [atol_dbias, rtol_dbias] = getTolerances(itype); @@ -563,7 +604,7 @@ TEST_P(FusedCastMXFP8TestSuite, TestFusedCastMXFP8) { if (getDeviceComputeCapability() < blackwellComputeCapability) { GTEST_SKIP(); } -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ using namespace transformer_engine; using namespace test; diff --git a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu index 0e43c2c9d..96663e752 100644 --- a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu @@ -262,9 +262,24 @@ void performTest_x1(const size_t rows, block_size_rows, block_size_cols, scales_stride); +#ifdef __HIP_PLATFORM_AMD__ + std::vector> mismatch_idx; + if (rowwise) { + compare_e8m0_scaling_factors("rowwise scales", output, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, true, mismatch_idx); + } else { + compare_e8m0_scaling_factors("colwise scales", output, ref_output_scales.get(), + unpadded_blocks_Y, unpadded_blocks_X, scales_stride, 0.01, false, mismatch_idx); + } + if (mismatch_idx.size()) { + adjust_ref(mismatch_idx, ref_output.get(), unpadded_blocks_Y, unpadded_blocks_X, rows, cols, otype); + } auto [atol, rtol] = getTolerances(otype); compareResults("output", output, ref_output.get(), rowwise, atol, rtol); +#else // #ifdef __HIP_PLATFORM_AMD__ + auto [atol, rtol] = getTolerances(otype); + compareResults("output", output, ref_output.get(), rowwise, atol, rtol); const uint8_t * const gpu_scales_ptr = rowwise ? 
output.rowwise_cpu_scale_inv_ptr() @@ -276,6 +291,7 @@ void performTest_x1(const size_t rows, compare_e8m0_scaling_factors("colwise scales", gpu_scales_ptr, ref_output_scales.get(), unpadded_blocks_Y, unpadded_blocks_X, scales_stride); } +#endif // #ifdef __HIP_PLATFORM_AMD__ } /** @@ -361,17 +377,41 @@ void performTest_x2(const size_t rows, block_size_cols, scales_stride_rowwise, scales_stride_colwise); +#ifdef __HIP_PLATFORM_AMD__ + std::vector> mismatch_idx_r; + compare_e8m0_scaling_factors("scales_rowwise", output, + ref_scales_rowwise.get(), unpadded_blocks_Y_rowwise, + unpadded_blocks_X_rowwise, scales_stride_rowwise, 0.01, true, mismatch_idx_r); + + if (mismatch_idx_r.size()) { + adjust_ref(mismatch_idx_r, ref_output_colwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, rows, cols, otype); + } + + std::vector> mismatch_idx_c; + compare_e8m0_scaling_factors("scales_colwise", output, + ref_scales_colwise.get(), unpadded_blocks_Y_colwise, + unpadded_blocks_X_colwise, scales_stride_colwise, 0.01, false, mismatch_idx_c); + + if (mismatch_idx_c.size()) { + adjust_ref(mismatch_idx_c, ref_output_rowwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, rows, cols, otype); + } auto [atol, rtol] = getTolerances(otype); auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("output_c_rowwise", output, ref_output_rowwise.get(), true, atol, rtol); compareResults("output_c_colwise", output, ref_output_colwise.get(), false, atol, rtol); +#else // #ifdef __HIP_PLATFORM_AMD__ + auto [atol, rtol] = getTolerances(otype); + auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); + compareResults("output_c_rowwise", output, ref_output_rowwise.get(), true, atol, rtol); + compareResults("output_c_colwise", output, ref_output_colwise.get(), false, atol, rtol); compare_e8m0_scaling_factors("scales_rowwise", output.rowwise_cpu_scale_inv_ptr(), ref_scales_rowwise.get(), unpadded_blocks_Y_rowwise, unpadded_blocks_X_rowwise, scales_stride_rowwise); compare_e8m0_scaling_factors("scales_colwise", output.columnwise_cpu_scale_inv_ptr(), ref_scales_colwise.get(), unpadded_blocks_Y_colwise, unpadded_blocks_X_colwise, scales_stride_colwise); +#endif // #ifdef __HIP_PLATFORM_AMD__ } std::vector> matrix_sizes = { @@ -418,12 +458,12 @@ class CastMXFP8_GatedActTestSuite : public ::testing::TestWithParam TEST_P(CastMXFP8_GatedActTestSuite, TestCastMXFP8Swiglu) { #ifdef __HIP_PLATFORM_AMD__ omp_set_num_threads(std::min(128, omp_get_max_threads())); // Using threads = # of vcpus causes occasional errors. -#else +#else // #ifdef __HIP_PLATFORM_AMD__ // Skip tests for pre-Blackwell architectures if (getDeviceComputeCapability() < blackwellComputeCapability) { GTEST_SKIP(); } -#endif +#endif // #ifdef __HIP_PLATFORM_AMD__ using namespace transformer_engine; diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index d37900a1f..d3dd6e95f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -714,6 +714,74 @@ void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, } } +#ifdef __HIP_PLATFORM_AMD__ +void compare_e8m0_scaling_factors(const std::string &name, Tensor &output, const uint8_t *ref, + const size_t row_blocks, const size_t col_blocks, const size_t stride, + double tol, bool rowwise, std::vector> &mismatch_idx) { + const uint8_t *const test = rowwise ? 
output.rowwise_cpu_scale_inv_ptr() + : output.columnwise_cpu_scale_inv_ptr(); + + const float scale_tol = std::max(1.f, row_blocks * col_blocks * tol); + + for (int i = 0; i < row_blocks; i++) { + for (int j = 0; j < col_blocks; j++) { + const int idx = i * stride + j; + if (test[idx] != ref[idx]) { + int t_scale = static_cast(test[idx]); + int r_scale = static_cast(ref[idx]); + if (std::abs(t_scale - r_scale) == 1) { + mismatch_idx.emplace_back(i, j, r_scale-t_scale); + } else { + GTEST_FAIL() << "Error in " << name << std::endl + << "Mismatch: " << t_scale << " vs " + << r_scale << " at index " << idx; + } + } + } + } + const size_t scale_mismatches = mismatch_idx.size(); + + ASSERT_FALSE(scale_mismatches > scale_tol) + << "Error in " << name << std::endl << std::setprecision(4) + << "Total scale mismatches: " << scale_mismatches << " (" << 100.*(double)scale_mismatches/(double)(row_blocks*col_blocks) + << "%) Exceeds tolerance of " << scale_tol << " (" << 100.*tol << "%) mismatches"; + + if (scale_mismatches) { + std::cout << "\x1b[33mWARNING:\x1b[0m " << scale_mismatches + << " scale mismatches were found. This does not imply an accuracy issue." << std::endl; + } +} + +void adjust_ref(std::vector> mismatch_idx, void *ref, const size_t row_blocks, + const size_t col_blocks, const size_t rows, const size_t cols, DType otype) { + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( otype, T, + T *ref_data = reinterpret_cast(ref); + double scale_val; + const size_t col_blocks_size = cols / col_blocks; + const size_t row_blocks_size = rows / row_blocks; + for (const auto &[i, j, scale_diff] : mismatch_idx) { + if (scale_diff == 1) { + scale_val = 2.; + } else if (scale_diff == -1) { + scale_val = .5; + } else { // Shouldn't ever reach this + GTEST_FAIL() << "Error in adjust_ref, |scale_diff| > 1"; + } + size_t ii_min = i * row_blocks_size; + const size_t ii_max = std::min(ii_min + row_blocks_size, rows); + for (; ii_min < ii_max; ii_min++) { + size_t jj_min = j * col_blocks_size; + const size_t jj_max = std::min(jj_min + col_blocks_size, cols); + for (; jj_min < jj_max; jj_min++) { + const size_t data_idx = ii_min * cols + jj_min; + ref_data[data_idx] = static_cast(static_cast(ref_data[data_idx]) * scale_val); + } + } + } + ); // NOLINT(*) +} +#endif // #ifdef __HIP_PLATFORM_AMD__ + std::pair getTolerances(const DType type) { switch(type) { case DType::kFloat32: diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 7ac2b75a6..6b9514d38 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -19,6 +19,7 @@ #else #include #include "amd_detail/hip_float8.h" +#include #endif #include @@ -461,6 +462,14 @@ void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const size_t row_blocks, const size_t col_blocks, const size_t stride); void compare_e8m0_scaling_factors(const std::string &name, const uint8_t *test, const uint8_t *ref, const size_t N); +#ifdef USE_ROCM +void compare_e8m0_scaling_factors(const std::string &name, Tensor &output, const uint8_t *ref, + const size_t row_blocks, const size_t col_blocks, const size_t stride, + double tol, bool rowwise, std::vector> &mismatch_idx); + +void adjust_ref(std::vector> mismatch_idx, void *ref, const size_t row_blocks, + const size_t col_blocks, const size_t rows, const size_t cols, DType otype); +#endif std::array get_scale_tensor_dims(const size_t rows, const size_t cols, const size_t block_size_rows, const size_t block_size_cols); From 34b1a3495da1f0f57a16707ec5bbd1018ee773d7 Mon Sep 17 00:00:00 2001 From: 
ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Fri, 7 Nov 2025 12:46:55 -0500 Subject: [PATCH 22/26] CI: allow numpy 2.0 (#366) (cherry picked from commit 6b8a47d16a6b5b7ba162238ef05ca0214621ef3d) --- ci/pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pytorch.sh b/ci/pytorch.sh index 207949ee5..93b9ded7f 100755 --- a/ci/pytorch.sh +++ b/ci/pytorch.sh @@ -12,7 +12,7 @@ TEST_DIR=${TE_PATH}tests/pytorch #: ${TEST_WORKERS:=4} install_prerequisites() { - pip install 'numpy>=1.22.4,<2.0' pandas + pip install 'numpy>=1.22.4' pandas rc=$? if [ $rc -ne 0 ]; then script_error "Failed to install test prerequisites" From 736ab30857d9e26510fcffa77a05a7360b7831c9 Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:48:05 -0500 Subject: [PATCH 23/26] Relax tolerance to pass 29x29x17389NT GEMM on MI350 (#365) (cherry picked from commit 9a987f8d391a8b3dbad21d279899b53cbcbe55b7) --- tests/cpp/operator/test_cublaslt_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu index b731cc701..1ef3f7ee3 100644 --- a/tests/cpp/operator/test_cublaslt_gemm.cu +++ b/tests/cpp/operator/test_cublaslt_gemm.cu @@ -186,7 +186,7 @@ std::pair getTestTolerances(const DType type, bool use_fp8, bool else if (use_fp8) { atol = 1e-3; //TODO: remove (see comment above) - rtol = std::max(rtol, 5e-3); + rtol = std::max(rtol, 1e-2); } else if (type == DType::kBFloat16) { //relax for certain prime number TN gemm From baed0d1e146ddfc3246e104868d2a9cfd596597f Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Sun, 12 Oct 2025 10:17:59 -0400 Subject: [PATCH 24/26] Bring back aiter solib with aiter update (#327) * AITER solib with commit fc3c0420 * [ROCm] api call fix and disable v3 fwd with swa (#331) * [ROCm] update aiter commit with gfx950 fix and swa fwd fix --------- Co-authored-by: Ye Wang (cherry picked from commit b08a1ed9273ccf641d58fe0a7093e1e1dcf6c2b2) --- .gitignore | 2 - 3rdparty/aiter | 2 +- setup.py | 30 +-- transformer_engine/common/CMakeLists.txt | 13 +- .../common/ck_fused_attn/CMakeLists.txt | 176 +++--------------- .../ck_fused_attn/src/ck_fused_attn_bwd.cpp | 4 +- .../ck_fused_attn/src/ck_fused_attn_fwd.cpp | 25 ++- .../common/fused_attn_rocm/fused_attn_ck.cpp | 1 + 8 files changed, 55 insertions(+), 198 deletions(-) diff --git a/.gitignore b/.gitignore index 44de0a19e..874eed018 100644 --- a/.gitignore +++ b/.gitignore @@ -52,7 +52,5 @@ compile_commands.json **/profiler_outputs/ **/times.csv tensor_dumps/ -aiter/ transformer_engine/build_info.txt transformer_engine/common/util/hip_nvml.* -transformer_engine/aiter/ diff --git a/3rdparty/aiter b/3rdparty/aiter index a2ca1b460..74e71eb8e 160000 --- a/3rdparty/aiter +++ b/3rdparty/aiter @@ -1 +1 @@ -Subproject commit a2ca1b460f097a309ee5a128c7454b1c419dc331 +Subproject commit 74e71eb8ee8a663d5e33c0cfd8b4dad7708ae84b diff --git a/setup.py b/setup.py index 0012844a8..b7b234ba3 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ from setuptools.command.build_ext import build_ext as BuildExtension -from setuptools.command.develop import develop as _develop os.environ["NVTE_PROJECT_BUILDING"] = "1" @@ -48,26 +47,6 @@ if not rocm_build(): archs = cuda_archs() -# A custom develop command only used for ROCm builds -class develop(_develop): - def run(self): - super().run() - if ( - int(os.getenv("NVTE_FUSED_ATTN_CK", "1")) and - 
int(os.getenv("NVTE_FUSED_ATTN", "1")) - ): - # Ensure that the AITER ASM kernels are properly available at runtime - # by creating a symlink to them. This is only necessary for editable - # mode since our C++ code assumes the AITER ASM kernel paths relative - # to trasnformer_engine.so, which is different in editable installs. - project_dir = Path(__file__).parent - asm_src_dir = project_dir / 'transformer_engine' / 'aiter' - # Must be synced with - # TransformerEngine/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp - asm_target_dir = project_dir / 'aiter' - if asm_src_dir.is_dir() and not asm_target_dir.is_dir(): - asm_target_dir.symlink_to(asm_src_dir) - class TimedBdist(bdist_wheel): """Helper class to measure build time""" @@ -89,7 +68,7 @@ def setup_common_extension() -> CMakeExtension: cmake_flags.append(f"-DCK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT={os.getenv('NVTE_CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT', 3)}") if os.getenv("NVTE_CK_FUSED_ATTN_PATH"): ck_path = Path(os.getenv("NVTE_CK_FUSED_ATTN_PATH")) - cmake_flags.append(f"-DCK_FUSED_ATTN_PATH={ck_path}") + cmake_flags.append(f"-DAITER_MHA_PATH={ck_path}") if int(os.getenv("NVTE_FUSED_ATTN_AOTRITON", "1"))==0 or int(os.getenv("NVTE_FUSED_ATTN", "1"))==0: cmake_flags.append("-DUSE_FUSED_ATTN_AOTRITON=OFF") if int(os.getenv("NVTE_FUSED_ATTN_CK", "1"))==0 or int(os.getenv("NVTE_FUSED_ATTN", "1"))==0: @@ -192,7 +171,6 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: with open("README.rst", encoding="utf-8") as f: long_description = f.read() - cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist} # Settings for building top level empty package for dependency management. if bool(int(os.getenv("NVTE_BUILD_METAPACKAGE", "0"))): assert bool( @@ -200,6 +178,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: ), "NVTE_RELEASE_BUILD env must be set for metapackage build." 
te_cuda_vers = "rocm" if rocm_build() else "cu12" ext_modules = [] + cmdclass = {} package_data = {} include_package_data = False setup_requires = [] @@ -211,8 +190,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: else: setup_requires, install_requires, test_requires = setup_requirements() ext_modules = [setup_common_extension()] - if rocm_build(): - cmdclass["develop"] = develop + cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist} package_data = {"": ["VERSION.txt"]} include_package_data = True extras_require = {"test": test_requires} @@ -255,7 +233,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]: long_description=long_description, long_description_content_type="text/x-rst", ext_modules=ext_modules, - cmdclass=cmdclass, + cmdclass={"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist}, python_requires=">=3.8, <3.13", classifiers=[ "Programming Language :: Python :: 3.8", diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index a9e2e056e..f70c9f8bb 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -351,18 +351,7 @@ else() endif() if(USE_FUSED_ATTN_CK) - if(NOT DEFINED CK_FUSED_ATTN_PATH) - set(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT ${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT} CACHE STRING "ck float to bf16 conversion rounding") - add_subdirectory(ck_fused_attn ${CMAKE_CURRENT_BINARY_DIR}/ck_fused_attn) - else() - # Use CK built during initial TE building/installation - # When only need rebuild TE library itself - unset(CK_FUSED_ATTN_LIB CACHE) - find_library(CK_FUSED_ATTN_LIB NAMES ck_fused_attn PATHS ${CK_FUSED_ATTN_PATH}/lib REQUIRED NO_DEFAULT_PATH) - add_library( ck_fused_attn STATIC IMPORTED ) - set_target_properties( ck_fused_attn PROPERTIES IMPORTED_LOCATION ${CK_FUSED_ATTN_LIB} ) - target_include_directories(ck_fused_attn INTERFACE ${CK_FUSED_ATTN_PATH}/include) - endif() + add_subdirectory(ck_fused_attn ${CMAKE_CURRENT_BINARY_DIR}/ck_fused_attn) endif() find_package(hip) diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt index 2a2afa328..c44a930e6 100644 --- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt +++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt @@ -1,20 +1,15 @@ # Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: MIT -#TODO: compile to a shared library -cmake_minimum_required(VERSION 3.28) -set(CMAKE_CXX_STANDARD 20) -#TODO: remove after figuring out how to install clang-scan-deps -set(CMAKE_CXX_SCAN_FOR_MODULES OFF) +cmake_minimum_required(VERSION 3.21) +set(CMAKE_CXX_STANDARD 17) project(ck_fused_attn LANGUAGES HIP CXX) -# remove files that should be regenerated -file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp ${CMAKE_CURRENT_BINARY_DIR}/gen_src/blob_list.txt) -# create gen_src and gen_src/tmp directories if needed -file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp) +set(AITER_MHA_INSTALL_PREFIX "transformer_engine" CACHE STRING "aiter mha shared lib install prefix in TE") set(__AITER_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../3rdparty/aiter") +set(__AITER_TEST_DIR "${__AITER_SOURCE_DIR}/op_tests/cpp/mha") set(__CK_SOURCE_DIR "${__AITER_SOURCE_DIR}/3rdparty/composable_kernel") # so far, there are only gfx942 and gfx950 v3 kernels @@ -37,82 +32,22 @@ message(STATUS "AITER V3_ASM_ARCHS: ${V3_ASM_ARCHS}") list(JOIN V3_ASM_ARCHS ";" V3_ASM_ARCHS_STR) set(ENV{GPU_ARCHS} "${V3_ASM_ARCHS_STR}") -# generate v2 (CK) kernels -# fwd kernels list -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_blob_list.txt --receipt 600 -) -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd_splitkv --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_splitkv_blob_list.txt --receipt 600 -) -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api batch_prefill --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_batch_prefill_blob_list.txt --receipt 600 -) - -# bwd kernels list -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/gen_src/bwd_blob_list.txt --receipt 600 -) - -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_splitkv_blob_list.txt FMHA_FWD_SPLITKV_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/fwd_batch_prefill_blob_list.txt FMHA_FWD_BATCH_PREFILL_GEN_BLOBS) -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gen_src/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS) - -# generate the actual fwd kernel cpp files -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api fwd_splitkv --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api batch_prefill --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -# generate the aiter fwd interface cpp file -execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/csrc/cpp_itfs/mha_fwd_generate.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 5 -) - -# generate the actual bwd kernel cpp files -execute_process( - COMMAND python3 ${__CK_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py - --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp --receipt 600 -) - -# generate the aiter bwd interface cpp file -execute_process( - COMMAND python3 
${__AITER_SOURCE_DIR}/csrc/py_itfs_cu/fmha_bwd_pre_post_kernel_generate.py - --filter *@*_ndeterministic@*_nbias*_dropout*_ndeterministic* --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp -) - -execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/csrc/cpp_itfs/mha_bwd_generate.py - --receipt 3 --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp -) - -# generate fwd/bwd v3 kernels for each requested rocm arch -foreach(CK_TARGET_ARCH IN LISTS V3_ASM_ARCHS) - execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/hsa/${CK_TARGET_ARCH}/fmha_v3_fwd/codegen.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp - ) +if(NOT DEFINED AITER_MHA_PATH) + # delete the existing aiter/jit/build dir for a clean build + file(REMOVE_RECURSE "${__AITER_SOURCE_DIR}/aiter/jit/build") + # compile the libmha_fwd.so and libmha_bwd.so + set(ENV{AITER_LOG_MORE} 1) + # fp32 to bf16 cvt env still required for MI300X + set(ENV{CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT} ${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT}) execute_process( - COMMAND python3 ${__AITER_SOURCE_DIR}/hsa/${CK_TARGET_ARCH}/fmha_v3_bwd/codegen.py - --output_dir ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp + COMMAND python3 ${__AITER_TEST_DIR}/compile.py ) -endforeach() + # libmha_fwd.so and libmha_bwd.so will be under 3rdparty/aiter/op_tests/cpp/mha + set(__AITER_MHA_PATH ${__AITER_TEST_DIR}) +else() + # use pre-built libmha_fwd.so libmha_bwd.so + set(__AITER_MHA_PATH ${AITER_MHA_PATH}) +endif() set(ck_fused_attn_SOURCES) list(APPEND ck_fused_attn_SOURCES @@ -120,75 +55,18 @@ list(APPEND ck_fused_attn_SOURCES src/ck_fused_attn_bwd.cpp src/ck_fused_attn_utils.cpp) -foreach(blob ${FMHA_FWD_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_GEN_BLOBS}) - -foreach(blob ${FMHA_FWD_SPLITKV_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_SPLITKV_GEN_BLOBS}) - -foreach(blob ${FMHA_FWD_BATCH_PREFILL_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_FWD_BATCH_PREFILL_GEN_BLOBS}) - -foreach(blob ${FMHA_BWD_GEN_BLOBS}) - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${blob}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${blob} ONLY_IF_DIFFERENT) -endforeach() -list(APPEND ck_fused_attn_SOURCES ${FMHA_BWD_GEN_BLOBS}) - -# add generated cpp files into ck_fused_attn_sources -set(MHA_BWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/mha_bwd.cpp") -set(MHA_FWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/mha_fwd.cpp") - -file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${MHA_BWD_SRC}) -file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${MHA_BWD_SRC} ONLY_IF_DIFFERENT) - -file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${MHA_FWD_SRC}) -file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${MHA_FWD_SRC} ONLY_IF_DIFFERENT) - -list(APPEND ck_fused_attn_SOURCES ${MHA_BWD_SRC} ${MHA_FWD_SRC}) - -foreach(CK_TARGET_ARCH IN LISTS V3_ASM_ARCHS) - set(ASM_MHA_FWD_SRC 
"${CMAKE_CURRENT_BINARY_DIR}/gen_src/asm_fmha_fwd_v3_${CK_TARGET_ARCH}.cpp") - set(ASM_MHA_BWD_SRC "${CMAKE_CURRENT_BINARY_DIR}/gen_src/asm_fmha_bwd_v3_${CK_TARGET_ARCH}.cpp") - - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${ASM_MHA_BWD_SRC}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${ASM_MHA_BWD_SRC} ONLY_IF_DIFFERENT) - - file(RELATIVE_PATH blob_path ${CMAKE_CURRENT_BINARY_DIR}/gen_src ${ASM_MHA_FWD_SRC}) - file(COPY_FILE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp/${blob_path} ${ASM_MHA_FWD_SRC} ONLY_IF_DIFFERENT) - list(APPEND ck_fused_attn_SOURCES ${ASM_MHA_BWD_SRC} ${ASM_MHA_FWD_SRC}) -endforeach() - -# remove all previously generated temporary files -file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/gen_src/tmp) - message(STATUS "Found the following fused attention files:") foreach(file ${ck_fused_attn_SOURCES}) message(STATUS " ${file}") endforeach() -add_library(ck_fused_attn STATIC ${ck_fused_attn_SOURCES}) +add_library(ck_fused_attn SHARED ${ck_fused_attn_SOURCES}) set(CK_FUSED_ATTN_COMPILE_OPTIONS) list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS - -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -DCK_TILE_FMHA_FWD_SPLITKV_API=1-DCK_TILE_FMHA_FWD_APPENDKV_API=0 - -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT} - -fgpu-flush-denormals-to-zero -ftemplate-backtrace-limit=0 -fPIC - -Wno-undefined-func-template -Wno-float-equal -Wno-gnu-line-marker -Wunused-variable -Wuninitialized - "SHELL:-mllvm -enable-post-misched=0" "SHELL:-mllvm -amdgpu-early-inline-all=true" - "SHELL:-mllvm -amdgpu-function-calls=false" "SHELL:-mllvm -amdgpu-coerce-illegal-types=1" - "SHELL:-mllvm --amdgpu-kernarg-preload-count=16") + -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT}) -foreach(CK_TARGET_ARCH IN LISTS CMAKE_HIP_ARCHITECTURES) - list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS --offload-arch=${CK_TARGET_ARCH}) +foreach(ARCH IN LISTS V3_ASM_ARCHS) + list(APPEND CK_FUSED_ATTN_COMPILE_OPTIONS --offload-arch=${ARCH}) endforeach() set(CK_INCLUDE_DIR "${__CK_SOURCE_DIR}/include") @@ -216,18 +94,22 @@ target_include_directories(ck_fused_attn PRIVATE ${CK_INCLUDE_DIR} ${__CK_SOURCE target_include_directories(ck_fused_attn PRIVATE ${AITER_INCLUDE_DIR}) find_package(hip) -list(APPEND ck_fused_attn_LINKER_LIBS hip::host hip::device roctx64) +list(APPEND ck_fused_attn_LINKER_LIBS hip::host hip::device roctx64 ${__AITER_MHA_PATH}/libmha_fwd.so ${__AITER_MHA_PATH}/libmha_bwd.so) target_link_libraries(ck_fused_attn PUBLIC ${ck_fused_attn_LINKER_LIBS}) target_compile_options(ck_fused_attn PRIVATE ${CK_FUSED_ATTN_COMPILE_OPTIONS}) +set_target_properties(ck_fused_attn PROPERTIES INSTALL_RPATH "$ORIGIN") +install(FILES ${__AITER_MHA_PATH}/libmha_fwd.so ${__AITER_MHA_PATH}/libmha_bwd.so DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) +install(TARGETS ck_fused_attn DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib) # copy v3 kernels to destination foreach(ARCH IN LISTS V3_ASM_ARCHS) install(DIRECTORY ${__AITER_SOURCE_DIR}/hsa/${ARCH}/fmha_v3_fwd - DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine/aiter/${ARCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib/aiter/${ARCH}/ PATTERN "codegen.py" EXCLUDE) install(DIRECTORY ${__AITER_SOURCE_DIR}/hsa/${ARCH}/fmha_v3_bwd - DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine/aiter/${ARCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/${AITER_MHA_INSTALL_PREFIX}/lib/aiter/${ARCH}/ PATTERN "codegen.py" EXCLUDE) endforeach() + diff --git 
a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp index 840db7b86..2b717ace0 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp @@ -920,8 +920,8 @@ hipError_t ck_attn_varlen_bwd( cu_seqlen_q_ptr,//cu_seqlen_q cu_seqlen_kv_ptr,//cu_seqlen_kv nullptr, /* seqlen_k_ptr */ - 0, //seqlen_q, unused in group mode - 0, //seqlen_kv, unused in group mode + max_seqlen_q, //seqlen_q, unused in group mode + max_seqlen_k, //seqlen_kv, unused in group mode batch, max_seqlen_q, max_seqlen_k, diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp index 2829175ab..c87a3db6c 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp @@ -209,9 +209,13 @@ hipError_t ck_attn_fwd( nullptr,//rand_val_ptr lse_ptr, o_ptr, - nullptr,//cu_seqlen_q - nullptr,//cu_seqlen_kv - nullptr, /* seqlen_k_ptr */ + nullptr, //cu_seqlen_q + nullptr, //cu_seqlen_kv + nullptr, //seqstart_q_ptr + nullptr, //seqstart_k_ptr + nullptr, //seqlen_k_ptr + nullptr, //seqstart_padded_q_ptr + nullptr, //seqstart_padded_k_ptr max_seqlen_q, max_seqlen_k, batch, @@ -308,6 +312,7 @@ hipError_t ck_attn_varlen_fwd( ck_tile::index_t nhead_k = hg; ck_tile::index_t hdim_v = d_v; ck_tile::index_t max_seqlen_q = s_q; + ck_tile::index_t max_seqlen_kv = s_kv; float scale_s = scaling_factor; float scale_p = 1.f; @@ -379,11 +384,15 @@ hipError_t ck_attn_varlen_fwd( nullptr,//rand_val_ptr lse_thd_ptr, o_ptr, - cu_seqlen_q_ptr,//cu_seqlen_q - cu_seqlen_kv_ptr,//cu_seqlen_kv - nullptr, /* seqlen_k_ptr */ - 0, //seqlen_q, unused in group mode - 0, //seqlen_kv, unused in group mode + nullptr, //cu_seqlen_q + nullptr, //cu_seqlen_kv + cu_seqlen_q_ptr, //seqstart_q_ptr + cu_seqlen_kv_ptr, //seqstart_k_ptr + nullptr, //seqlen_k_ptr + nullptr, //seqstart_padded_q_ptr + nullptr, //seqstart_padded_k_ptr + max_seqlen_q, //seqlen_q, unused in group mode + max_seqlen_kv, //seqlen_kv, unused in group mode batch, max_seqlen_q, hdim_q, diff --git a/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp b/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp index 72696fbd9..b38249f5b 100644 --- a/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp +++ b/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp @@ -557,6 +557,7 @@ void fused_attn_ck_fwd_impl( nvte_log_ck_config = true; } bool nvte_ck_uses_fwd_v3 = getenv("NVTE_CK_USES_FWD_V3", 0); + bool is_ragged = nvte_get_qkv_format(layout)==NVTE_QKV_Format::NVTE_THD; // extract the qkv and o storage bytes to allocate buffer for padding removing From cc5b35667ec290eed362fd0890e74e89faa9282a Mon Sep 17 00:00:00 2001 From: Ye Wang Date: Wed, 22 Oct 2025 16:33:41 -0500 Subject: [PATCH 25/26] [ROCm] update AITER to support aiter shared lib for multi-gpu (PRs 1196,1230) (#337) * [ROCm] include AITER PR 1196 to support aiter shared lib for multi-gpu * [ROCm] update aiter commit to remove pandas requirement (cherry picked from commit 63b4ce9339ca54deb0a13bee67270854031139cc) --- 3rdparty/aiter | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/aiter b/3rdparty/aiter index 74e71eb8e..1b00a0e8a 160000 --- a/3rdparty/aiter +++ b/3rdparty/aiter @@ -1 +1 @@ -Subproject commit 74e71eb8ee8a663d5e33c0cfd8b4dad7708ae84b 
+Subproject commit 1b00a0e8a54be0411490a69a5d7042abd33a56d9 From 08344fe062340c2f93e2f265ed2dc1fe0085fb6f Mon Sep 17 00:00:00 2001 From: ipanfilo <145064111+ipanfilo@users.noreply.github.com> Date: Wed, 12 Nov 2025 17:40:18 -0500 Subject: [PATCH 26/26] Use .info/version for ROCm version (#368) (cherry picked from commit e9c736190c3080db9202b872d96f7171f1f93aa5) --- transformer_engine/common/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index f70c9f8bb..9a4187378 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -487,8 +487,16 @@ install(TARGETS transformer_engine DESTINATION .) set_target_properties(transformer_engine PROPERTIES INSTALL_RPATH "$ORIGIN/lib;$ORIGIN/transformer_engine/lib") if (USE_ROCM) + if("$ENV{ROCM_PATH}" STREQUAL "") + set(ROCM_PATH "/opt/rocm") + else() + set(ROCM_PATH "$ENV{ROCM_PATH}") + endif() + file(READ "${ROCM_PATH}/.info/version" ROCM_VER) + string(STRIP "${ROCM_VER}" ROCM_VER) + string(REGEX MATCH "^[0-9]+\\.[0-9]+" ROCM_VER "${ROCM_VER}") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/build_info.txt" - "ROCM_VERSION: ${hip_VERSION_MAJOR}.${hip_VERSION_MINOR}\n" + "ROCM_VERSION: ${ROCM_VER}\n" "GPU_TARGETS: ${CMAKE_HIP_ARCHITECTURES}\n" ) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/build_info.txt" DESTINATION "transformer_engine/")
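With this change, build_info.txt records the ROCm release found on disk rather than the HIP runtime version CMake reports. For illustration only (the values below are hypothetical and depend on the installed ROCm and the requested architectures): in a container whose ${ROCM_PATH}/.info/version reads something like "7.0.0-<build>", with CMAKE_HIP_ARCHITECTURES set to gfx942;gfx950, the installed build_info.txt would look roughly like:

ROCM_VERSION: 7.0
GPU_TARGETS: gfx942;gfx950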