6 files changed: +10 -3
Changed directories: tensorrt_llm/_torch/auto_deploy/custom_ops (1 file), tests/integration/test_lists/test-db (5 files)

tensorrt_llm/_torch/auto_deploy/custom_ops:
@@ -476,9 +476,6 @@ def update_input_ids_with_new_tokens(
         idx = self.previous_batch_indices_cuda[: len(previous_batch_indices)]
         idx.copy_(host_idx, non_blocking=True)

-        # sort them so that masked_scatter_ lines up correctly
-        idx, _ = idx.sort()
-
         # gather the exact values you want to write
         src = new_tokens[0, idx, 0]

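Note on the hunk above: masked_scatter_ writes its source values into the True positions of the mask in ascending position order, which is why the old code sorted idx to keep the gathered src aligned. A direct indexed write pairs src[i] with idx[i] by construction, so the sort becomes redundant. A minimal sketch of the difference (toy tensors and names, not the PR's actual code):

    import torch

    dst = torch.zeros(6, dtype=torch.long)
    idx = torch.tensor([4, 1, 3])     # unsorted indices, as they may arrive here
    src = torch.tensor([40, 10, 30])  # values gathered with that same idx

    # masked_scatter_ fills True positions in ascending order, so with an
    # unsorted idx the (index, value) pairs misalign:
    mask = torch.zeros(6, dtype=torch.bool)
    mask[idx] = True
    a = dst.clone()
    a.masked_scatter_(mask, src)      # positions 1, 3, 4 receive 40, 10, 30

    # A direct indexed write keeps src[i] paired with idx[i], no sort required:
    b = dst.clone()
    b[idx] = src                      # positions 4, 1, 3 receive 40, 10, 30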

tests/integration/test_lists/test-db (l0_b200):
@@ -69,6 +69,8 @@ l0_b200:
   - unittest/_torch/modeling -k "modeling_deepseek"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
   - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
+  # ------------- AutoDeploy tests ---------------
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db (l0_dgx_b200):
@@ -89,3 +89,5 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
+  # ------------- AutoDeploy tests ---------------
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype

tests/integration/test_lists/test-db (l0_dgx_h100):
@@ -61,6 +61,8 @@ l0_dgx_h100:
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
   - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism
+  # ------------- AutoDeploy tests ---------------
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db (l0_dgx_h200):
@@ -34,6 +34,8 @@ l0_dgx_h200:
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
+  # ------------- AutoDeploy tests ---------------
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db (l0_h100):
@@ -102,6 +102,8 @@ l0_h100:
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
+  # ------------- AutoDeploy tests ---------------
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype
 - condition:
     ranges:
       system_gpu_count: