diff --git a/examples/configs/grpo_math_1B_sglang.yaml b/examples/configs/grpo_math_1B_sglang.yaml new file mode 100644 index 0000000000..17b30f3ef5 --- /dev/null +++ b/examples/configs/grpo_math_1B_sglang.yaml @@ -0,0 +1,25 @@ +defaults: grpo_math_1B.yaml + +grpo: + val_batch_size: 128 + +policy: + generation: + backend: "sglang" + sglang_cfg: + # SGLang specific configuration + model_path: ${policy.model_name} + gpus_per_server: 1 + dtype: ${policy.precision} + context_length: 512 # Maximum context length + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +logger: + wandb_enabled: true diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml new file mode 100644 index 0000000000..8428b1cd96 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml @@ -0,0 +1,48 @@ +defaults: ../../grpo_math_1B.yaml + +grpo: + max_num_steps: 450 + +checkpointing: + checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + +policy: + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct + tokenizer: + name: Qwen/Qwen2.5-Math-1.5B-Instruct + dynamic_batching: + enabled: true + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + backend: "sglang" + max_new_tokens: 512 + sglang_cfg: + model_path: ${policy.model_name} + gpus_per_server: 8 + dtype: ${policy.precision} + context_length: 512 + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +data: + max_input_seq_length: 512 + +logger: + log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + +cluster: + gpus_per_node: 8 + diff --git a/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml b/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml new file mode 100644 index 0000000000..30c6f5f76c --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml @@ -0,0 +1,49 @@ +defaults: ../../grpo_math_1B.yaml + +grpo: + max_num_steps: 500 + val_batch_size: 128 + +checkpointing: + checkpoint_dir: results/grpo-qwen3-0.6b-1n8g-sglang + +policy: + model_name: Qwen/Qwen3-0.6B + tokenizer: + name: Qwen/Qwen3-0.6B + dynamic_batching: + enabled: true + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + backend: "sglang" + max_new_tokens: 512 + sglang_cfg: + model_path: ${policy.model_name} + gpus_per_server: 8 + dtype: ${policy.precision} + context_length: 512 + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +data: + max_input_seq_length: 512 + +logger: + log_dir: logs/grpo-qwen3-0.6b-1n8g-sglang + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-0.6b-1n8g-sglang + +cluster: + gpus_per_node: 8 + diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 8ab62d00fb..3651521d9f 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -61,6 +61,7 @@ 
run_multi_turn_rollout, ) from nemo_rl.models.generation.interfaces import GenerationInterface +from nemo_rl.models.generation.sglang import SGLangConfig, SGLangGeneration from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.interfaces import ColocatablePolicyInterface @@ -482,9 +483,77 @@ def init_vllm(): pg.finish_generation() return pg, time.perf_counter() - t0 - # Handle backend-specific setup + def init_sglang(): + """Initialize SGLang generation workers.""" + t0 = time.perf_counter() + pg = SGLangGeneration(cluster=inference_cluster, config=generation_config) + pg.finish_generation() + return pg, time.perf_counter() - t0 + + def initialize_generation_with_policy( + init_generation_fn, + generation_name: str, + init_time_key: str, + colocated_inference: bool, + worker_init_timing_metrics: dict, + ): + """Generic function to initialize a generation engine (vLLM or SGLang) along with policy. + + Args: + init_generation_fn: Function that initializes the generation engine (init_vllm or init_sglang) + generation_name: Name of the generation engine ("vLLM" or "SGLang") + init_time_key: Key name for storing initialization time in metrics ("vllm_init_time_s" or "sglang_init_time_s") + colocated_inference: Whether inference is colocated with training + worker_init_timing_metrics: Dictionary to store timing metrics + + Returns: + Tuple of (policy_generation, policy) + """ + # Determine if parallel initialization is possible (non-colocated mode) + use_parallel_init = not colocated_inference + + if use_parallel_init: + # Parallel initialization: Generation engine and Policy can initialize simultaneously + print( + " ⚡ Using parallel worker initialization (non-colocated mode)", + flush=True, + ) + + # Execute both initializations in parallel + parallel_start_time = time.perf_counter() + with ThreadPoolExecutor(max_workers=2) as executor: + generation_future = executor.submit(init_generation_fn) + policy_future = executor.submit(init_policy) + policy_generation, generation_time = generation_future.result() + policy, policy_time = policy_future.result() + parallel_wall_time = time.perf_counter() - parallel_start_time + + # Store timing metrics + worker_init_timing_metrics[init_time_key] = generation_time + worker_init_timing_metrics["policy_init_time_s"] = policy_time + worker_init_timing_metrics["parallel_wall_time_s"] = parallel_wall_time + worker_init_timing_metrics["parallel_init_enabled"] = True + + else: + # Sequential initialization: colocated mode (GPU memory requires generation engine first) + print( + " ⚙️ Using sequential worker initialization (colocated mode)", + flush=True, + ) + + # Initialize generation engine first (clean GPU memory), then policy + policy_generation, generation_time = init_generation_fn() + worker_init_timing_metrics[init_time_key] = generation_time + + policy, policy_time = init_policy() + worker_init_timing_metrics["policy_init_time_s"] = policy_time + worker_init_timing_metrics["parallel_init_enabled"] = 0.0 + + return policy_generation, policy + + # Handle generation-specific setup if backend == "megatron": - # Megatron backend: policy_generation is None, only initialize policy + # Megatron generation: policy_generation is None, only initialize policy policy_generation = None print( f" ✓ Using {backend} backend for generation with {policy_config['model_name']}", @@ -495,7 +564,7 @@ def init_vllm(): worker_init_timing_metrics["policy_init_time_s"] = policy_time elif backend 
== "vllm": - # vLLM backend: setup config, then decide parallel vs sequential init + # vLLM generation: setup config, then initialize with policy generation_config = cast(VllmConfig, generation_config) if generation_config["vllm_cfg"]["precision"] == "fp8": assert loss_config["use_importance_sampling_correction"] is True, ( @@ -523,48 +592,36 @@ def init_vllm(): "hf_config_overrides", {} ) - # Determine if parallel initialization is possible (non-colocated mode) - use_parallel_init = not colocated_inference - - if use_parallel_init: - # Parallel initialization: vLLM and Policy can initialize simultaneously - print( - " ⚡ Using parallel worker initialization (non-colocated mode)", - flush=True, - ) - - # Execute both initializations in parallel - parallel_start_time = time.perf_counter() - with ThreadPoolExecutor(max_workers=2) as executor: - vllm_future = executor.submit(init_vllm) - policy_future = executor.submit(init_policy) - policy_generation, vllm_time = vllm_future.result() - policy, policy_time = policy_future.result() - parallel_wall_time = time.perf_counter() - parallel_start_time + policy_generation, policy = initialize_generation_with_policy( + init_generation_fn=init_vllm, + generation_name="vLLM", + init_time_key="vllm_init_time_s", + colocated_inference=colocated_inference, + worker_init_timing_metrics=worker_init_timing_metrics, + ) - # Store timing metrics - worker_init_timing_metrics["vllm_init_time_s"] = vllm_time - worker_init_timing_metrics["policy_init_time_s"] = policy_time - worker_init_timing_metrics["parallel_wall_time_s"] = parallel_wall_time - worker_init_timing_metrics["parallel_init_enabled"] = True + print( + f" ✓ Using vLLM backend for generation with {policy_config['model_name']}", + flush=True, + ) - else: - # Sequential initialization: colocated mode (GPU memory requires vLLM first) - print( - " ⚙️ Using sequential worker initialization (colocated mode)", - flush=True, - ) + elif backend == "sglang": + generation_config = cast(SGLangConfig, generation_config) - # Initialize vLLM first (clean GPU memory), then policy - policy_generation, vllm_time = init_vllm() - worker_init_timing_metrics["vllm_init_time_s"] = vllm_time + # Set model_path if not already set + if "model_path" not in generation_config["sglang_cfg"]: + generation_config["sglang_cfg"]["model_path"] = policy_config["model_name"] - policy, policy_time = init_policy() - worker_init_timing_metrics["policy_init_time_s"] = policy_time - worker_init_timing_metrics["parallel_init_enabled"] = 0.0 + policy_generation, policy = initialize_generation_with_policy( + init_generation_fn=init_sglang, + generation_name="SGLang", + init_time_key="sglang_init_time_s", + colocated_inference=colocated_inference, + worker_init_timing_metrics=worker_init_timing_metrics, + ) print( - f" ✓ Using vLLM backend for generation with {policy_config['model_name']}", + f" ✓ Using SGLang backend for generation with {policy_config['model_name']}", flush=True, ) @@ -945,16 +1002,37 @@ def refit_policy_generation( policy.get_free_memory_bytes() * float(memory_ratio) ) - futures_train = policy.stream_weights_via_ipc_zmq( - buffer_size_bytes=buffer_size_bytes, kv_scales=kv_scales - ) - futures_inference = policy_generation.update_weights_via_ipc_zmq() - # wait for all futures to complete - ray.get(futures_train) - results = ray.get(futures_inference) - update_success = all(result for result in results if result is not None) + if isinstance(policy_generation, SGLangGeneration): + sglang_url_to_gpu_uuids = ( + 
policy_generation.get_sglang_url_to_gpu_uuids() + ) + # Stream weights via HTTP + flush_success = policy_generation.invalidate_kv_cache() + if not flush_success: + print("SGLang KV cache invalidation failed before weight update. ") + futures_train = policy.stream_weights_via_http( + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + ) + # Wait for all workers to complete + ray.get(futures_train) + update_success = True + else: + # Original ZMQ IPC path for vLLM + futures_train = policy.stream_weights_via_ipc_zmq( + buffer_size_bytes=buffer_size_bytes + ) + futures_inference = policy_generation.update_weights_via_ipc_zmq() + # wait for all futures to complete + ray.get(futures_train) + results = ray.get(futures_inference) + update_success = all(result for result in results if result is not None) else: # update weights through nccl + # SGLang haven't implemented non-colocated inference mode. + if isinstance(policy_generation, SGLangGeneration): + raise NotImplementedError( + "SGLang haven't implemented non-colocated inference mode. " + ) futures_train = policy.broadcast_weights_for_collective(kv_scales=kv_scales) futures_inference = policy_generation.update_weights_from_collective() # wait for all futures to complete @@ -1148,11 +1226,9 @@ def grpo_train( dynamic_sampling_num_gen_batches += 1 with timer.time("generation"): - # Clear vLLM logger metrics for each generation step - if policy_generation is not None and hasattr( - policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics for each generation step + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Use NeMo-Gym rollouts if enabled. We cascade NeMo-Gym first since NeMo-Gym requires async rollouts. 
if _should_use_nemo_gym(master_config): generation_config = master_config["policy"]["generation"] @@ -1202,16 +1278,12 @@ def grpo_train( greedy=False, ) policy_generation.finish_generation() - # Collect vLLM logger metrics for performance reporting after each generation step - # inflight batch sizes and num pending samples are collected from each vLLM worker - if policy_generation is not None and hasattr( - policy_generation, "get_vllm_logger_metrics" - ): - vllm_logger_metrics = ( - policy_generation.get_vllm_logger_metrics() + # Collect generation logger metrics for performance reporting after each generation step + # inflight batch sizes and num pending samples are collected from each worker + if policy_generation is not None: + generation_logger_metrics = ( + policy_generation.get_logger_metrics() ) - else: - vllm_logger_metrics = {} repeated_batch = scale_rewards( repeated_batch, master_config["grpo"]["reward_scaling"] @@ -1460,7 +1532,7 @@ def grpo_train( metrics[k] = np.sum(v).item() metrics.update(rollout_metrics) - metrics["vllm_logger_metrics"] = vllm_logger_metrics + metrics["generation_logger_metrics"] = generation_logger_metrics total_valid_tokens += metrics["global_valid_toks"] ## Checkpointing @@ -1583,7 +1655,7 @@ def grpo_train( "enable_vllm_metrics_logger", False ) and master_config.get("logger", {}).get("wandb_enabled", False): log_generation_metrics_to_wandb( - vllm_logger_metrics, + generation_logger_metrics, total_steps + 1, master_config["policy"]["generation"]["vllm_cfg"][ "vllm_metrics_logger_interval" @@ -2051,12 +2123,9 @@ def async_grpo_train( trajectory_collector.resume.remote() print("✅ All setup complete, starting buffer wait...") - - # Clear vLLM logger metrics after at start of training - if policy_generation is not None and hasattr( - policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics at start of training + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Wait for initial buffer fill print( @@ -2296,23 +2365,19 @@ def async_grpo_train( train_results = policy.train(train_data, loss_fn) print("🔄 Synchronizing policy weights to trajectory collector…") - vllm_logger_metrics = None + generation_logger_metrics = None if NEED_REFIT: # Measure pending-generation wait as exposed_generation time print("🔄 Coordinating with trajectory collector before refit...") with timer.time("exposed_generation"): ray.get(trajectory_collector.prepare_for_refit.remote()) - # Collect vLLM logger metrics for performance reporting - # inflight batch sizes and num pending samples are collected from each vLLM worker - if policy_generation is not None and hasattr( - policy_generation, "get_vllm_logger_metrics" - ): - vllm_logger_metrics = ( - policy_generation.get_vllm_logger_metrics() + # Collect generation logger metrics for performance reporting + # inflight batch sizes and num pending samples are collected from each worker + if policy_generation is not None: + generation_logger_metrics = ( + policy_generation.get_logger_metrics() ) - else: - vllm_logger_metrics = {} # Only the actual refit/weight transfer should be counted as weight_sync print("🔄 Performing policy generation refit...") @@ -2327,11 +2392,9 @@ def async_grpo_train( trajectory_collector.set_weight_version.remote(weight_version) trajectory_collector.resume_after_refit.remote() - # Clear vLLM logger metrics after each refit (weight sync), starting a new logging cycle - if policy_generation is not None and hasattr( - 
policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics after each refit (weight sync), starting a new logging cycle + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Validation val_metrics, validation_timings = None, None @@ -2424,8 +2487,8 @@ def async_grpo_train( else: metrics[k] = np.sum(v).item() metrics.update(rollout_metrics) - if vllm_logger_metrics is not None: - metrics["vllm_logger_metrics"] = vllm_logger_metrics + if generation_logger_metrics is not None: + metrics["generation_logger_metrics"] = generation_logger_metrics total_valid_tokens += metrics["global_valid_toks"] # Checkpointing (same as sync version) @@ -2532,7 +2595,7 @@ def async_grpo_train( "enable_vllm_metrics_logger", False ) and master_config.get("logger", {}).get("wandb_enabled", False): log_generation_metrics_to_wandb( - vllm_logger_metrics, + generation_logger_metrics, step + 1, master_config["policy"]["generation"]["vllm_cfg"][ "vllm_metrics_logger_interval" diff --git a/nemo_rl/algorithms/utils.py b/nemo_rl/algorithms/utils.py index 17c69e479a..428252e1f2 100644 --- a/nemo_rl/algorithms/utils.py +++ b/nemo_rl/algorithms/utils.py @@ -521,46 +521,47 @@ def visualize_per_worker_timeline( "generation" ].get("vllm_cfg", {}).get("async_engine", False) if is_vllm_metrics_logger_enabled: - vllm_logger_metrics = metrics["vllm_logger_metrics"] - # vllm_logger_me trics: dict[str (metric_name), dict[int (dp_idx), list[int] (metric_values)]] + vllm_logger_metrics = metrics.get("generation_logger_metrics", {}) + # vllm_logger_metrics: dict[str (metric_name), dict[int (dp_idx), list[int] (metric_values)]] # metric_name: "inflight_batch_sizes" or "num_pending_samples" - assert "inflight_batch_sizes" in vllm_logger_metrics, ( - "inflight_batch_sizes not found in vllm_logger_metrics" - ) - assert "num_pending_samples" in vllm_logger_metrics, ( - "num_pending_samples not found in vllm_logger_metrics" - ) - assert isinstance(vllm_logger_metrics["inflight_batch_sizes"], dict), ( - "inflight_batch_sizes must be a dictionary" - ) - assert isinstance(vllm_logger_metrics["num_pending_samples"], dict), ( - "num_pending_samples must be a dictionary" - ) - - vllm_metrics_logger_interval = master_config["policy"]["generation"][ - "vllm_cfg" - ]["vllm_metrics_logger_interval"] - print(" • vLLM Logger Metrics:") - # Visualize the inflight batch sizes timeline - if len(vllm_logger_metrics["inflight_batch_sizes"].values()) > 0: - visualize_per_worker_timeline( - vllm_logger_metrics["inflight_batch_sizes"], - "Inflight Batch Sizes", - vllm_metrics_logger_interval, + if vllm_logger_metrics: + assert "inflight_batch_sizes" in vllm_logger_metrics, ( + "inflight_batch_sizes not found in vllm_logger_metrics" ) - if len(vllm_logger_metrics["num_pending_samples"].values()) > 0: - max_num_pending_samples = max( - (max(v) if v else 0) - for v in vllm_logger_metrics["num_pending_samples"].values() + assert "num_pending_samples" in vllm_logger_metrics, ( + "num_pending_samples not found in vllm_logger_metrics" ) - # If there is at least one pending sample, visualize the timeline - if max_num_pending_samples > 0: + assert isinstance(vllm_logger_metrics["inflight_batch_sizes"], dict), ( + "inflight_batch_sizes must be a dictionary" + ) + assert isinstance(vllm_logger_metrics["num_pending_samples"], dict), ( + "num_pending_samples must be a dictionary" + ) + + vllm_metrics_logger_interval = master_config["policy"]["generation"][ + "vllm_cfg" + 
]["vllm_metrics_logger_interval"] + print(" • vLLM Logger Metrics:") + # Visualize the inflight batch sizes timeline + if len(vllm_logger_metrics["inflight_batch_sizes"].values()) > 0: visualize_per_worker_timeline( - vllm_logger_metrics["num_pending_samples"], - "Num Pending Samples", - None, + vllm_logger_metrics["inflight_batch_sizes"], + "Inflight Batch Sizes", + vllm_metrics_logger_interval, ) + if len(vllm_logger_metrics["num_pending_samples"].values()) > 0: + max_num_pending_samples = max( + (max(v) if v else 0) + for v in vllm_logger_metrics["num_pending_samples"].values() + ) + # If there is at least one pending sample, visualize the timeline + if max_num_pending_samples > 0: + visualize_per_worker_timeline( + vllm_logger_metrics["num_pending_samples"], + "Num Pending Samples", + None, + ) # ===================================================== # Throughputs diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 4190062ec6..cdda4a625f 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -20,6 +20,9 @@ VLLM_EXECUTABLE = ( PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.VLLM ) +SGLANG_EXECUTABLE = ( + PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.SGLANG +) MCORE_EXECUTABLE = ( PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.MCORE ) @@ -27,10 +30,11 @@ ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE, "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE, + "nemo_rl.models.generation.sglang.sglang_worker.SGLangGenerationWorker": SGLANG_EXECUTABLE, # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE, - "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": PY_EXECUTABLES.AUTOMODEL, + "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": SGLANG_EXECUTABLE, "nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.vlm_environment.VLMEnvironment": PY_EXECUTABLES.SYSTEM, diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index 3021b760e4..3f472e6d61 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -58,6 +58,9 @@ class PY_EXECUTABLES: # Use NeMo-Gym dependencies NEMO_GYM = f"uv run --locked --extra nemo_gym --directory {git_root}" + # Use NeMo-RL direct dependencies and SGLang. + SGLANG = f"uv run --locked --extra automodel --extra sglang --directory {git_root}" + @ray.remote # pragma: no cover def _get_node_ip_and_free_port() -> tuple[str, int]: diff --git a/nemo_rl/models/generation/interfaces.py b/nemo_rl/models/generation/interfaces.py index d134027bdf..80f4ced95e 100644 --- a/nemo_rl/models/generation/interfaces.py +++ b/nemo_rl/models/generation/interfaces.py @@ -257,3 +257,22 @@ def update_weights_from_collective(self) -> list[ray.ObjectRef]: # (e.g., vLLM prefix/KV caches) after weight updates. 
def invalidate_kv_cache(self) -> bool: return False + + def clear_logger_metrics(self) -> None: + """Clear logger metrics for performance reporting. + + This is an optional method that backends can implement to clear + telemetry metrics. Default implementation does nothing. + """ + pass + + def get_logger_metrics(self) -> dict[str, Any]: + """Get logger metrics for performance reporting. + + This is an optional method that backends can implement to collect + telemetry metrics. Default implementation returns empty dict. + + Returns: + Dictionary of metrics. Format may vary by backend. + """ + return {} diff --git a/nemo_rl/models/generation/sglang/__init__.py b/nemo_rl/models/generation/sglang/__init__.py new file mode 100644 index 0000000000..76deb56ebd --- /dev/null +++ b/nemo_rl/models/generation/sglang/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nemo_rl.models.generation.sglang.config import SGLangConfig +from nemo_rl.models.generation.sglang.sglang_generation import SGLangGeneration + +__all__ = [ + "SGLangConfig", + "SGLangGeneration", + "SGLangGenerationWorker", +] + + +def __getattr__(name: str): + if name == "SGLangGenerationWorker": + from nemo_rl.models.generation.sglang.sglang_worker import ( + SGLangGenerationWorker, + ) + + return SGLangGenerationWorker + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/nemo_rl/models/generation/sglang/config.py b/nemo_rl/models/generation/sglang/config.py new file mode 100644 index 0000000000..9e1ea45253 --- /dev/null +++ b/nemo_rl/models/generation/sglang/config.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, NotRequired, TypedDict + +from nemo_rl.models.generation.interfaces import GenerationConfig + + +class SglangSpecificArgs(TypedDict): + """SGLang-specific configuration arguments. + + Most fields below map directly to SGLang's ServerArgs (see: + https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py). 
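+
+    A minimal, illustrative sglang_cfg (values taken from the example recipes
+    added in this change; adjust for your model and hardware):
+
+        sglang_cfg:
+          model_path: Qwen/Qwen2.5-Math-1.5B-Instruct
+          gpus_per_server: 8
+          dtype: ${policy.precision}
+          context_length: 512
+          mem_fraction_static: 0.7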
+ """ + + model_path: NotRequired[str] + gpus_per_server: NotRequired[int] + random_seed: NotRequired[int] + skip_tokenizer_init: NotRequired[bool] + disable_cuda_graph: NotRequired[bool] + disable_radix_cache: NotRequired[bool] + disable_cuda_graph_padding: NotRequired[bool] + enable_nccl_nvls: NotRequired[bool] + disable_outlines_disk_cache: NotRequired[bool] + disable_custom_all_reduce: NotRequired[bool] + disable_overlap_schedule: NotRequired[bool] + enable_mixed_chunk: NotRequired[bool] + enable_dp_attention: NotRequired[bool] + enable_ep_moe: NotRequired[bool] + enable_torch_compile: NotRequired[bool] + torch_compile_max_bs: NotRequired[int] + cuda_graph_max_bs: NotRequired[int | None] + cuda_graph_bs: NotRequired[list[int] | None] + torchao_config: NotRequired[str] + enable_nan_detection: NotRequired[bool] + enable_p2p_check: NotRequired[bool] + triton_attention_reduce_in_fp32: NotRequired[bool] + triton_attention_num_kv_splits: NotRequired[int] + num_continuous_decode_steps: NotRequired[int] + enable_memory_saver: NotRequired[bool] + allow_auto_truncate: NotRequired[bool] + attention_backend: NotRequired[str | None] + enable_multimodal: NotRequired[bool] + sampling_backend: NotRequired[str | None] + context_length: NotRequired[int | None] + mem_fraction_static: NotRequired[float | None] + max_running_requests: NotRequired[int | None] + chunked_prefill_size: NotRequired[int | None] + max_prefill_tokens: NotRequired[int] + schedule_policy: NotRequired[str] + schedule_conservativeness: NotRequired[float] + cpu_offload_gb: NotRequired[int] + dtype: NotRequired[str] + kv_cache_dtype: NotRequired[str] + dp_size: NotRequired[int] # only used for dp attention + pp_size: NotRequired[int] # pipeline parallel size + ep_size: NotRequired[int] + # lora + enable_lora: NotRequired[bool | None] + max_lora_rank: NotRequired[int | None] + lora_target_modules: NotRequired[list[str] | None] + lora_paths: NotRequired[list[str] | None] + max_loaded_loras: NotRequired[int] + max_loras_per_batch: NotRequired[int] + lora_backend: NotRequired[str] + # logging + log_level: NotRequired[str] + log_level_http: NotRequired[str | None] + log_requests: NotRequired[bool] + log_requests_level: NotRequired[int] + show_time_cost: NotRequired[bool] + enable_metrics: NotRequired[bool] # Exports Prometheus-like metrics + # The interval (in decoding iterations) to log throughput + # and update prometheus metrics + decode_log_interval: NotRequired[int] + # Extra loader arguments + enable_multithread_load: NotRequired[bool] + enable_fast_load: NotRequired[bool] + # Server warmup + skip_server_warmup: NotRequired[bool] + + +class SGLangConfig(GenerationConfig): + """Configuration for SGLang runtime.""" + + sglang_cfg: SglangSpecificArgs + sglang_kwargs: NotRequired[dict[str, Any]] diff --git a/nemo_rl/models/generation/sglang/sglang_generation.py b/nemo_rl/models/generation/sglang/sglang_generation.py new file mode 100644 index 0000000000..85122779ee --- /dev/null +++ b/nemo_rl/models/generation/sglang/sglang_generation.py @@ -0,0 +1,384 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import ( + Any, + Optional, + Union, +) + +import numpy as np +import ray + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict, SlicedDataDict +from nemo_rl.distributed.named_sharding import NamedSharding +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.distributed.worker_groups import RayWorkerBuilder, RayWorkerGroup +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationInterface, + GenerationOutputSpec, +) +from nemo_rl.models.generation.sglang.config import SGLangConfig + +# Global thresholds for top_k and top_p validation. +# While top-k/p are not supported, these values allow for token filtering while the logprobs should be compatible. +# See https://github.com/NVIDIA-NeMo/RL/issues/69 and https://github.com/NVIDIA-NeMo/RL/issues/237 for more details. +TOP_K_THRESHOLD = 8000 # Allow top_k >= 8000 (effectively no filtering) +TOP_P_THRESHOLD = 0.99 # Allow top_p >= 0.99 (close to 1.0) + +logger = logging.getLogger(__name__) + + +class SGLangGeneration(GenerationInterface): + def __init__( + self, + cluster: RayVirtualCluster, + config: SGLangConfig, + name_prefix: str = "sglang_policy", + workers_per_node: Optional[Union[int, list[int]]] = None, + ): + """Initialize a SGLang policy with distributed workers. + + SGLang server manages TP/PP internally, but we still need to: + 1. Manage data parallel distribution across multiple servers + 2. Assign GPU bundles to each server + + Each server will see logical GPUs 0-N (via CUDA_VISIBLE_DEVICES set by Ray), + so we just need to tell SGLang how many GPUs to use (tp_size). + """ + # Store config + self.cfg = config + self.sglang_cfg = config["sglang_cfg"] + + gpus_per_server = self.sglang_cfg.get("gpus_per_server", None) + if gpus_per_server is None: + raise ValueError("gpus_per_server must be set in SGLangConfig.sglang_cfg.") + + # Calculate number of servers based on available resources + total_gpus = cluster.world_size() + num_servers = total_gpus // gpus_per_server + + if num_servers == 0: + raise ValueError( + f"Not enough GPUs. Need at least {gpus_per_server} GPUs per server, " + f"but only have {total_gpus} GPUs total." + ) + + if total_gpus % gpus_per_server != 0: + logger.warning( + f"[WARNING] Total GPUs ({total_gpus}) is not divisible by GPUs per server ({gpus_per_server}). " + f"Will use {num_servers} servers, leaving {total_gpus % gpus_per_server} GPUs unused." + ) + + self.dp_size = num_servers + self.gpus_per_server = gpus_per_server + + # Create sharding annotations + # Even though SGLang manages TP internally, we include it in the layout to support + # RayWorkerGroup's worker management (which creates one worker per GPU bundle). + # The TP dimension becomes a "free axis" in run_all_workers_sharded_data, ensuring + # only the primary workers (TP rank 0) are called. 
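+        # Illustrative example: with 8 total GPUs and gpus_per_server=4 this gives
+        # num_servers=2 and a 2x4 layout [[0, 1, 2, 3], [4, 5, 6, 7]] over the
+        # ("data_parallel", "tensor_parallel") axes.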
+ total_workers = num_servers * gpus_per_server + self.sharding_annotations = NamedSharding( + layout=np.arange(total_workers).reshape(num_servers, gpus_per_server), + names=["data_parallel", "tensor_parallel"], + ) + + # Initialize placement groups + # For SGLang, we use PACK strategy to keep bundles together + # colocated is always at top level, not in sglang_cfg + strategy = None if self.cfg["colocated"]["enabled"] else "PACK" + cluster._init_placement_groups( + strategy=strategy, + use_unified_pg=False, # SGLang servers don't need cross-node model parallelism + ) + + # Create worker builder for SGLangGenerationWorker + worker_cls = ( + "nemo_rl.models.generation.sglang.sglang_worker.SGLangGenerationWorker" + ) + worker_builder = RayWorkerBuilder(worker_cls, config) + + env_vars = {} + global_cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if global_cvd: + # Explicitly pass CUDA_VISIBLE_DEVICES to workers via env_vars + # This ensures all workers see the same global value, even though + env_vars["CUDA_VISIBLE_DEVICES"] = global_cvd + + # Allocate bundles for each server + # Each server gets consecutive bundles + bundle_indices_list = self._allocate_bundles_for_servers( + cluster, num_servers, gpus_per_server + ) + + # Create worker group with explicit bundle allocation + self.worker_group = RayWorkerGroup( + cluster, + worker_builder, + name_prefix=name_prefix, + bundle_indices_list=bundle_indices_list, + sharding_annotations=self.sharding_annotations, + env_vars=env_vars, + ) + + # Verify data parallel size matches + assert self.dp_size == self.worker_group.dp_size, ( + f"Data parallel size mismatch. Expected {self.dp_size}, got {self.worker_group.dp_size}" + ) + + # Used to track the round-robin selection of worker groups for generate_async + self.current_generate_dp_shard_idx = 0 + + def _allocate_bundles_for_servers( + self, + cluster: RayVirtualCluster, + num_servers: int, + gpus_per_server: int, + ) -> list[tuple[int, list[int]]]: + """Allocate GPU bundles to each SGLang server. + + Each server gets consecutive bundles within the same placement group (node). + Ray will automatically set CUDA_VISIBLE_DEVICES so each server sees logical GPUs 0, 1, 2, ..., gpus_per_server-1. 
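+
+        For example (illustrative): with two 8-GPU placement groups and
+        gpus_per_server=4, this returns
+        [(0, [0, 1, 2, 3]), (0, [4, 5, 6, 7]), (1, [0, 1, 2, 3]), (1, [4, 5, 6, 7])].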
+ + Args: + cluster: The Ray virtual cluster + num_servers: Total number of SGLang servers to create + gpus_per_server: Number of GPUs each server needs + + Returns: + List of (node_idx, [bundle_indices]) tuples for each server + """ + placement_groups = cluster.get_placement_groups() + + if not placement_groups: + raise ValueError("No placement groups available in the cluster") + + bundle_indices_list = [] + + # Each server's bundles must be within the same placement group (node) + server_idx = 0 + for pg_idx, pg in enumerate(placement_groups): + if pg.bundle_count == 0: + continue + + # Calculate how many servers can fit in this placement group + num_servers_in_pg = pg.bundle_count // gpus_per_server + + # Allocate servers within this placement group + for local_server_idx in range(num_servers_in_pg): + if server_idx >= num_servers: + break + + # Calculate which bundles this server gets (consecutive within the PG) + start_bundle = local_server_idx * gpus_per_server + server_bundles = list( + range(start_bundle, start_bundle + gpus_per_server) + ) + + # Each server gets a tuple of (node_idx, [local_bundle_indices]) + bundle_indices_list.append((pg_idx, server_bundles)) + server_idx += 1 + + if server_idx >= num_servers: + break + + if len(bundle_indices_list) < num_servers: + total_available = sum( + pg.bundle_count // gpus_per_server + for pg in placement_groups + if pg.bundle_count > 0 + ) + raise ValueError( + f"Not enough bundles to allocate all {num_servers} servers. " + f"Only {total_available} servers can be allocated " + f"(each server needs {gpus_per_server} GPUs)." + ) + + return bundle_indices_list + + def init_collective( + self, ip: str, port: int, world_size: int, *, train_world_size: int + ) -> list[ray.ObjectRef]: + """Initialize the collective communication. + + TODO: if weight updates via NCCL are needed in the future. 
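+
+        Currently a no-op that returns an empty list of futures: this backend streams
+        weights over HTTP during refit (see refit_policy_generation) instead of using
+        NCCL collectives.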
+ """ + return [] + + def generate( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data using SGLang.""" + assert isinstance(data, BatchedDataDict), ( + f"data must be a BatchedDataDict, got type: {type(data)}" + ) + assert "input_ids" in data and "input_lengths" in data, ( + "input_ids and input_lengths are required in data for SGLang generation" + ) + + # Shard the data across the data parallel servers + dp_size = self.sharding_annotations.get_axis_size("data_parallel") + sharded_data: list[SlicedDataDict] = data.shard_by_batch_size( + dp_size, allow_uneven_shards=True + ) + future_bundle = self.worker_group.run_all_workers_sharded_data( + "generate", + data=sharded_data, + in_sharded_axes=["data_parallel"], + replicate_on_axes=None, + output_is_replicated=None, + common_kwargs={"greedy": greedy}, + ) + + # Get results from the workers + results = self.worker_group.get_all_worker_results(future_bundle) + + # Combine results from all servers + combined: BatchedDataDict[GenerationOutputSpec] = BatchedDataDict.from_batches( + results, pad_value_dict={"output_ids": self.cfg["_pad_token_id"]} + ) + + # Verify the output has all required fields + required_keys = [ + "output_ids", + "generation_lengths", + "unpadded_sequence_lengths", + "logprobs", + ] + missing_keys = [key for key in required_keys if key not in combined] + if missing_keys: + raise ValueError( + f"Missing required keys for GenerationOutputSpec: {missing_keys}" + ) + + return combined + + def prepare_refit_info(self, state_dict_info: dict[str, Any]) -> None: + pass + + def update_weights_via_ipc_zmq(self) -> list[ray.ObjectRef]: + return [] + + def update_weights_from_collective(self) -> list[ray.ObjectRef]: + return [] + + def get_sglang_server_urls(self) -> list[str]: + """Get base URLs of all SGLang servers. + + Returns: + List of base URLs (e.g., ["http://localhost:30000", "http://localhost:30001"]) + """ + if not self.worker_group or not self.worker_group.workers: + raise RuntimeError("Worker group is not initialized") + + # Get base URLs from all workers (only primary workers, TP rank 0) + # Use run_rank_0_only_axes to only get URLs from primary workers + futures = self.worker_group.run_all_workers_single_data( + "get_base_url", + run_rank_0_only_axes=["tensor_parallel"], + ) + urls = ray.get(futures) + # Filter out None values and return unique URLs + return list(set(url for url in urls if url is not None)) + + def get_sglang_url_to_gpu_uuids(self) -> dict[str, list[str]]: + """Get mapping from SGLang server URL to list of GPU UUIDs it uses. 
+ + Returns: + Dict mapping server URL to list of GPU UUIDs + e.g., {"http://localhost:30000": ["GPU-aaa", "GPU-bbb"], ...} + """ + if not self.worker_group or not self.worker_group.workers: + raise RuntimeError("Worker group is not initialized") + + # Get base URLs and GPU UUIDs from all primary workers (TP rank 0) + futures_url = self.worker_group.run_all_workers_single_data( + "get_base_url", + run_rank_0_only_axes=["tensor_parallel"], + ) + futures_uuids = self.worker_group.run_all_workers_single_data( + "get_gpu_uuids", + run_rank_0_only_axes=["tensor_parallel"], + ) + + urls = ray.get(futures_url) + uuids_list = ray.get(futures_uuids) + + # Create mapping + url_to_uuids = {} + for url, uuids in zip(urls, uuids_list): + if url is not None and uuids is not None: + url_to_uuids[url] = uuids + + return url_to_uuids + + def prepare_for_generation(self, *args: Any, **kwargs: Any) -> bool: + """Wake workers up for colocated inference.""" + pass + + def finish_generation(self, *args: Any, **kwargs: Any) -> bool: + """Sleep workers and reset prefix cache.""" + pass + + def shutdown(self) -> bool: + """Shut down all SGLang workers and clean up resources.""" + try: + # Use the worker group's shutdown method with the worker's cleanup method + return self.worker_group.shutdown(cleanup_method="shutdown") + except Exception as e: + logger.error(f"Error during SGLang policy shutdown: {e}") + return False + + def __del__(self) -> None: + """Shuts down the worker groups when the object is deleted or is garbage collected. + + This is an extra safety net in case the user forgets to call shutdown() and the pointer to + the object is lost due to leaving a function scope. It's always recommended that the + user calls shutdown(). + """ + self.shutdown() + + def invalidate_kv_cache(self) -> bool: + """Invalidate KV cache before weight updates (Megatron-style). + + This flushes the cache before weight updates to clear stale cache. + Only primary workers (TP rank 0, model owners) will flush their cache. + + Returns: + bool: True if all caches were flushed successfully, False otherwise + """ + try: + futures = self.worker_group.run_all_workers_single_data( + "invalidate_kv_cache", + run_rank_0_only_axes=["tensor_parallel"], + ) + results = ray.get(futures) + results = [r for r in results if r is not None] + success = all(result for result in results) if results else True + if success: + logger.info( + "[sglang refit] All SGLang server caches flushed successfully" + ) + else: + logger.warning( + "[sglang refit] WARNING - Some SGLang server caches failed to flush" + ) + return success + except Exception as e: + logger.error(f"[sglang refit] Error flushing SGLang caches: {e}") + return False diff --git a/nemo_rl/models/generation/sglang/sglang_worker.py b/nemo_rl/models/generation/sglang/sglang_worker.py new file mode 100644 index 0000000000..6f15cba1fc --- /dev/null +++ b/nemo_rl/models/generation/sglang/sglang_worker.py @@ -0,0 +1,804 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import multiprocessing +import os +import time +from typing import Any, Optional + +import aiohttp +import ray +import requests +import torch + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import _get_free_port_local, _get_node_ip_local +from nemo_rl.distributed.worker_group_utils import get_nsight_config_if_pattern_matches +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationOutputSpec, + verify_right_padding, +) +from nemo_rl.models.generation.sglang.config import SGLangConfig +from nemo_rl.models.generation.sglang.utils import AsyncLoopThread +from nemo_rl.utils.nsys import wrap_with_nvtx_name + +logger = logging.getLogger(__name__) + + +def _require_sglang(): + """Import `sglang` lazily so test collection works without the optional extra.""" + try: + from sglang.srt.entrypoints.http_server import launch_server + from sglang.srt.server_args import ServerArgs + from sglang.srt.utils import kill_process_tree + except ModuleNotFoundError as e: # pragma: no cover + raise ModuleNotFoundError( + "Optional dependency `sglang` is required for the SGLang generation backend.\n" + "Install it via the project extra (e.g. `uv run --extra sglang ...`) to use " + "`SGLangGenerationWorker`." + ) from e + + return launch_server, ServerArgs, kill_process_tree + + +@ray.remote( + runtime_env={**get_nsight_config_if_pattern_matches("sglang_generation_worker")} +) # pragma: no cover +class SGLangGenerationWorker: + def __repr__(self) -> str: + """Customizes the actor's prefix in the Ray logs. + + This makes it easier to identify which worker is producing specific log messages. + """ + return f"{self.__class__.__name__}" + + @staticmethod + def configure_worker( + num_gpus: int | float, bundle_indices: Optional[tuple[int, list[int]]] = None + ) -> tuple[dict[str, Any], dict[str, str], dict[str, Any]]: + """Provides complete worker configuration for SGLang server. + + This method configures the worker based on bundle_indices which tells us + how many GPUs this server should use. + + Args: + num_gpus: Original GPU allocation for this worker based on the placement group + bundle_indices: Tuple of (node_idx, local_bundle_indices) for this server + + Returns: + tuple with complete worker configuration: + - 'resources': Resource allocation (e.g., num_gpus) + - 'env_vars': Environment variables for this worker + - 'init_kwargs': Parameters to pass to __init__ of the worker + """ + # Initialize configuration + resources: dict[str, Any] = {"num_gpus": num_gpus} + init_kwargs: dict[str, Any] = {} + env_vars: dict[str, str] = {} + + local_bundle_indices = None + if bundle_indices is not None: + node_idx = bundle_indices[0] + local_bundle_indices = bundle_indices[1] + init_kwargs["bundle_indices"] = local_bundle_indices + + # Calculate a unique seed from node_idx and bundle_indices + if len(local_bundle_indices) == 1: + seed = node_idx * 1024 + local_bundle_indices[0] + else: + bundle_id = local_bundle_indices[0] // len(local_bundle_indices) + seed = node_idx * 1024 + bundle_id + + init_kwargs["seed"] = seed + + # Check if this worker is part of a parallel group (multiple GPUs per server). + # A worker with local rank =0 owns the server(local_bundle_indices is not None ) + # otherwise it is a placeholder for Ray's resource management (local_bundle_indices is None). 
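+        # Illustrative example: local_bundle_indices=[0, 1, 2, 3] marks the primary
+        # worker that owns a 4-GPU server, while local_bundle_indices=None marks a
+        # placeholder worker that only reserves its GPU bundle for Ray.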
+ is_part_of_parallel_workers = ( + local_bundle_indices is not None and len(local_bundle_indices) > 1 + ) or local_bundle_indices is None + + if is_part_of_parallel_workers: + # For parallel workers, we manage GPU assignment via base_gpu_id + # All workers see the same global CUDA_VISIBLE_DEVICES, but use different + # logical GPU ranges via base_gpu_id + resources["num_gpus"] = 0 + env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1" + init_kwargs["fraction_of_gpus"] = num_gpus + else: + env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1" + + return resources, env_vars, init_kwargs + + def __init__( + self, + config: SGLangConfig, + bundle_indices: Optional[list[int]] = None, + fraction_of_gpus: float = 1.0, + seed: Optional[int] = None, + ): + """Initialize a SGLang worker for distributed inference. + + Args: + config: Configuration dictionary for the policy + bundle_indices: List of local bundle indices for this server. + The length of this list determines tp_size (number of GPUs per server). + Only needed for the first worker in each server group (model owner). + fraction_of_gpus: Fraction of GPUs to use for this worker + seed: Random seed for initialization, if None, then defaults to the config's seed + """ + self.cfg = config + self.is_model_owner = bundle_indices is not None + self.global_rank = int(os.environ.get("RANK", "0")) + self.sglang_cfg = config["sglang_cfg"] + + # Create a dedicated event loop thread for async operations + # there will be issues if we use the event loop in the main thread + self.async_loop_thread = AsyncLoopThread() + + # temp: Maximum concurrent requests per server + # we may remove this limit in the future + self.max_concurrent_requests = config.get("max_concurrent_requests", 999999) + + # Only the primary worker (local_rank=0) in each server group starts the SGLang server + # Secondary workers (local_rank!=0) just returns + if not self.is_model_owner: + return + + # `sglang` is an optional dependency; import only when we actually start a server. 
+ _, ServerArgs, _ = _require_sglang() + + # Determine tp_size from bundle_indices length + tp_size = len(bundle_indices) + + base_gpu_id = bundle_indices[0] if bundle_indices else 0 + + # Get the global CUDA_VISIBLE_DEVICES (all engines see the same global value) + global_cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + logger.info( + f"[SGLang Server] Rank {self.global_rank}: " + f"base_gpu_id={base_gpu_id}, tp_size={tp_size}, " + f"bundle_indices={bundle_indices}, global_cvd={global_cvd}" + ) + + # Get current node IP and a free port for the server + node_ip = _get_node_ip_local() + free_port = _get_free_port_local() + + # Build SGLang server arguments + kwargs = { + "model_path": self.sglang_cfg["model_path"], + "trust_remote_code": True, + "random_seed": seed + if seed is not None + else self.sglang_cfg.get("random_seed", 1), + # Memory settings + "enable_memory_saver": self.sglang_cfg["enable_memory_saver"], + "gpu_id_step": 1, + "base_gpu_id": base_gpu_id, + # Parallel settings + "tp_size": tp_size, + "dp_size": self.sglang_cfg["dp_size"], + "pp_size": self.sglang_cfg["pp_size"], + "ep_size": self.sglang_cfg["ep_size"], + # Always skip warmup to prevent warmup timeout + "skip_server_warmup": self.sglang_cfg.get("skip_server_warmup", True), + # Server network settings - listen on all interfaces, use the free port we found + "host": "0.0.0.0", + "port": free_port, + "torchao_config": "", + } + + for key in [ + "dtype", + "kv_cache_dtype", + "context_length", + "max_running_requests", + "chunked_prefill_size", + "max_prefill_tokens", + "schedule_policy", + "schedule_conservativeness", + "cpu_offload_gb", + "log_level", + "mem_fraction_static", + "allow_auto_truncate", + ]: + if key in self.sglang_cfg: + kwargs[key] = self.sglang_cfg[key] + + server_args = ServerArgs(**kwargs) + # Save server_args and base_url for use in generate() and _make_request() + self.server_args = server_args + self.base_url = f"http://{node_ip}:{free_port}" + + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Starting on {self.base_url}, CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}, base_gpu_id: {base_gpu_id}" + ) + + self.session = None + self.connector = None + + self.server_process = self._launch_server_process(server_args) + + def get_base_url(self) -> str: + """Get the base URL of this SGLang server.""" + return self.base_url + + def invalidate_kv_cache(self) -> bool: + """Invalidate KV cache before weight updates (Megatron-style). + + This flushes the cache before weight updates to clear stale cache. + Uses retry logic to handle cases where there are pending requests. 
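+        Concretely, it polls the server's /flush_cache endpoint, sleeping 1 second
+        between attempts, until the flush succeeds or the attempt budget is exhausted.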
+ + Returns: + bool: True if flush was successful, False otherwise + """ + if not self.is_model_owner: + return True + + url = f"{self.base_url}/flush_cache" + max_attempts = 60 + connection_retry_limit = 5 + + # flush_cache will not return status_code 200 when there are pending requests + for attempt in range(max_attempts): + try: + response = requests.get(url, timeout=10) + if response.status_code == 200: + if attempt > 0: + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Cache flushed successfully " + f"(attempt {attempt + 1})" + ) + return True + except requests.exceptions.ConnectionError: + # Server might not be ready yet - only retry for first few attempts + if attempt >= connection_retry_limit: + logger.warning( + f"[SGLang Worker] Rank {self.global_rank} Connection failed after " + f"{connection_retry_limit} attempts" + ) + return False + except Exception as e: + # For other errors, log and retry (except on last attempt) + if attempt == max_attempts - 1: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Failed to flush cache after " + f"{max_attempts} attempts: {e}" + ) + return False + + time.sleep(1) + + # All attempts exhausted without success + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Timeout: Cache flush failed after " + f"{max_attempts} attempts. Server may have pending requests." + ) + return False + + def get_gpu_uuids(self) -> list[str]: + """Get list of GPU UUIDs used by this SGLang server. + + Returns: + List of GPU UUIDs (e.g., ["GPU-xxxxx", "GPU-yyyyy"]) + """ + from nemo_rl.utils.nvml import get_device_uuid + + # Get all GPU UUIDs used by this server + # SGLang server uses GPUs starting from base_gpu_id with tp_size GPUs + gpu_uuids = [] + for i in range(self.server_args.tp_size): + gpu_id = self.server_args.base_gpu_id + i + uuid = get_device_uuid(gpu_id) + gpu_uuids.append(uuid) + + return gpu_uuids + + def _merge_stop_strings(self, batch_stop_strings): + """Merge stop strings from config and batch. + + Args: + batch_stop_strings: List of stop strings from batch (one per sample) + + Returns: + List of merged stop strings (one per sample) + """ + stop_set: set[str] = set() + + # Add stop strings from config + if self.cfg.get("stop_strings"): + stop_set.update(self.cfg["stop_strings"]) + + # Merge stop strings from batch + merged_stop_strings = [] + for sample_ss in batch_stop_strings: + sample_stop_set = stop_set.copy() + if sample_ss: + if isinstance(sample_ss, str): + sample_stop_set.add(sample_ss) + elif isinstance(sample_ss, list): + sample_stop_set.update(sample_ss) + + merged_stop_strings.append( + list(sample_stop_set) if sample_stop_set else None + ) + + return merged_stop_strings + + def _build_sampling_params( + self, + *, + greedy: bool, + stop_strings, + max_new_tokens: Optional[int] = None, + input_len: Optional[int] = None, + context_length: Optional[int] = None, + sample_index: Optional[int] = None, + ) -> dict[str, Any]: + """Build sampling parameters dictionary for SGLang API. 
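+
+        Example of a returned dict (illustrative, assuming greedy decoding,
+        max_new_tokens=512, top_p left at its default, and no stop_token_ids
+        configured): {"temperature": 0.0, "top_p": 1.0, "max_new_tokens": 512,
+        "top_k": 1}.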
+ + Args: + greedy: Whether to use greedy decoding (temperature=0.0) + stop_strings: Merged stop strings (not used here, handled per sample) + max_new_tokens: Override max_new_tokens from config if provided + input_len: Input length for this sample (used for context_length adjustment) + context_length: Maximum context length (if provided, adjusts max_new_tokens) + sample_index: Sample index (used for warning messages, 0-indexed) + + Returns: + Dictionary of sampling parameters compatible with SGLang API + """ + top_k_cfg = self.cfg.get("top_k") + top_k_val = 1 if greedy else (top_k_cfg if top_k_cfg is not None else -1) + temperature = 0.0 if greedy else self.cfg["temperature"] + + base_max_tokens = ( + max_new_tokens if max_new_tokens is not None else self.cfg["max_new_tokens"] + ) + + # TODO: check if this is needed + final_max_tokens = base_max_tokens + if context_length is not None and input_len is not None: + max_allowed_new_tokens = max(0, context_length - input_len - 1) + if base_max_tokens > max_allowed_new_tokens: + final_max_tokens = max_allowed_new_tokens + if sample_index == 0: + logger.warning( + f"[SGLang Worker] Rank {self.global_rank} Warning: " + f"Sample {sample_index} input length ({input_len}) + max_new_tokens ({base_max_tokens}) " + f"would exceed context_length ({context_length}). " + f"Reducing max_new_tokens to {final_max_tokens} for this sample." + ) + + # Build sampling params dict + sampling_params = { + "temperature": temperature, + "top_p": self.cfg.get("top_p", 1.0), + "max_new_tokens": final_max_tokens, + } + + if top_k_val != -1: + sampling_params["top_k"] = top_k_val + + stop_token_ids = self.cfg.get("stop_token_ids") + if stop_token_ids is not None: + sampling_params["stop_token_ids"] = stop_token_ids + + return sampling_params + + async def _ensure_session(self): + if self.session is None: + # Create connector with connection pool limit + self.connector = aiohttp.TCPConnector(limit=512, limit_per_host=512) + # Create session with timeout + timeout = aiohttp.ClientTimeout(total=300) # 5 minutes timeout + self.session = aiohttp.ClientSession( + connector=self.connector, timeout=timeout + ) + return self.session + + async def _generate_single_sample( + self, + input_ids: list[int], + sampling_params: dict[str, Any], + stop_string: Optional[str] = None, + ) -> tuple[list[int], list[float]]: + """Generate a single sample using SGLang API (async function). + + Args: + input_ids: List of input token IDs (without padding) + sampling_params: Dictionary of sampling parameters (temperature, top_p, max_new_tokens, etc.) 
+ stop_string: Optional stop string for this sample + + Returns: + Tuple of (generated_tokens, logprobs): + - generated_tokens: List of generated token IDs + - logprobs: List of log probabilities for generated tokens + """ + # Prepare payload for SGLang API + # Note: stop should be in sampling_params, not in payload top level + # TODO: double check this + if stop_string is not None: + # stop can be a string or list of strings + sampling_params = sampling_params.copy() # Don't modify the original + sampling_params["stop"] = stop_string + + payload = { + "sampling_params": sampling_params, + "return_logprob": True, + "input_ids": input_ids, + } + + url = f"{self.base_url}/generate" + headers = { + "Content-Type": "application/json; charset=utf-8", + } + + session = await self._ensure_session() + + try: + async with session.post(url, json=payload, headers=headers) as response: + response.raise_for_status() + result = await response.json() + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Request failed for input_len={len(input_ids)}: {e}" + ) + raise + + # Extract generated tokens and logprobs + meta_info = result.get("meta_info", {}) + output_token_logprobs = meta_info.get("output_token_logprobs", []) + + if output_token_logprobs: + new_tokens = [item[1] for item in output_token_logprobs] + new_logprobs = [item[0] for item in output_token_logprobs] + else: + # Fallback: empty if token logprobs not available + new_tokens = [] + new_logprobs = [] + + return new_tokens, new_logprobs + + async def _generate_async(self, tasks): + """Execute generation tasks with concurrency control. + + TEMP: Uses a semaphore to limit the number of concurrent requests per server, preventing server overload. + A router based solution is preffered in the future. + """ + semaphore = asyncio.Semaphore(self.max_concurrent_requests) + + async def wrap(idx, coro): + async with semaphore: + try: + result = await coro + return idx, result + except Exception as e: + raise + + wrapped = [wrap(i, t) for i, t in enumerate(tasks)] + results = [None] * len(tasks) + count = 0 + + for fut in asyncio.as_completed(wrapped): + idx, value = await fut + results[idx] = value + count += 1 + if count % 50 == 0 or count == len(tasks): + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} Completed {count}/{len(tasks)} tasks" + ) + + return results + + def _launch_server_process(self, server_args: Any) -> multiprocessing.Process: + """Launch the SGLang server process and wait for it to be ready.""" + # Ensure `sglang` is importable when we actually start a server. 
+ launch_server, _, kill_process_tree = _require_sglang() + p = multiprocessing.Process(target=launch_server, args=(server_args,)) + p.start() + + # Wait for server to be ready by checking health endpoint + # Use the base_url we stored earlier + headers = { + "Content-Type": "application/json; charset=utf-8", + } + + max_wait_time = 300 # 5 minutes timeout + start_time = time.time() + with requests.Session() as session: + while True: + if time.time() - start_time > max_wait_time: + kill_process_tree(p.pid) + raise TimeoutError( + f"[SGLang Server] Rank {self.global_rank} Server failed to start within {max_wait_time}s" + ) + try: + response = session.get( + f"{self.base_url}/health_generate", headers=headers, timeout=10 + ) + if response.status_code == 200: + logger.info( + f"[SGLang Server] Rank {self.global_rank} Server is ready at {self.base_url}" + ) + break + except requests.RequestException: + pass + + if not p.is_alive(): + raise RuntimeError( + f"[SGLang Server] Rank {self.global_rank} Server process terminated unexpectedly." + ) + + time.sleep(2) + return p + + @wrap_with_nvtx_name("sglang_genertion_worker/generate") + def generate( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data using SGLang generation. + + Args: + data: BatchedDataDict containing input_ids and input_lengths tensors + greedy: Whether to use greedy decoding instead of sampling + + Returns: + BatchedDataDict conforming to GenerationOutputSpec: + - output_ids: input + generated token IDs with proper padding + - logprobs: Log probabilities for tokens + - generation_lengths: Lengths of each response + - unpadded_sequence_lengths: Lengths of each input + generated sequence + """ + # Handle empty input case + if len(data["input_ids"]) == 0: + return BatchedDataDict[GenerationOutputSpec]( + { + "output_ids": torch.zeros((0, 0), dtype=torch.long), + "logprobs": torch.zeros((0, 0), dtype=torch.float), + "generation_lengths": torch.zeros(0, dtype=torch.long), + "unpadded_sequence_lengths": torch.zeros(0, dtype=torch.long), + } + ) + + input_ids = data["input_ids"] + input_lengths = data["input_lengths"] + batch_stop_strings = data.get("stop_strings", [None] * len(input_lengths)) + stop_strings = self._merge_stop_strings(batch_stop_strings) + batch_size = len(input_lengths) + pad_token_id = self.cfg["_pad_token_id"] + + # Verify inputs have correct padding + verify_right_padding(data, pad_value=pad_token_id) + + # Original input length with padding + padded_input_length = input_ids.size(1) + + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} batch_size: {batch_size}, padded_input_length: {padded_input_length}" + ) + + if batch_size == 0: + raise ValueError("Empty batch received") + + context_length = self.sglang_cfg.get("context_length", None) + + # Create async tasks for all samples + tasks = [] + for i in range(batch_size): + input_len = input_lengths[i].item() + + # Truncate input if it exceeds context_length + if context_length is not None and input_len >= context_length: + input_len = context_length - 1 + + valid_input_ids = input_ids[i, :input_len].tolist() + + # Build sampling params for this sample (with context_length adjustment) + sample_sampling_params = self._build_sampling_params( + greedy=greedy, + stop_strings=stop_strings, + max_new_tokens=None, + input_len=input_len, + context_length=context_length, + sample_index=i, + ) + + tasks.append( + self._generate_single_sample( + input_ids=valid_input_ids, + 
sampling_params=sample_sampling_params, + stop_string=stop_strings[i], + ) + ) + + # Execute all requests concurrently using the dedicated event loop thread + try: + all_results = self.async_loop_thread.run(self._generate_async(tasks)) + except Exception as e: + raise + + total_generated_tokens = sum(len(tokens) for tokens, _ in all_results) + avg_generation_length = ( + total_generated_tokens / batch_size if batch_size > 0 else 0 + ) + + # Process results + output_ids_list = [] + logprobs_list = [] + generation_lengths_list = [] + unpadded_sequence_lengths_list = [] + max_length = 0 + + # First pass: calculate max_length + for i, (new_tokens, new_logprobs) in enumerate(all_results): + input_len = input_lengths[i].item() + generation_length = len(new_tokens) + unpadded_length = input_len + generation_length + max_length = max(max_length, unpadded_length) + + total_length = max(max_length, padded_input_length) + + for i, (new_tokens, new_logprobs) in enumerate(all_results): + input_len = input_lengths[i].item() + generation_length = len(new_tokens) + unpadded_length = input_len + generation_length + + full_output = torch.full( + (total_length,), pad_token_id, dtype=input_ids.dtype + ) + full_output[:input_len] = input_ids[i][:input_len] + + # Add generated tokens after the original input + if new_tokens: + full_output[input_len : input_len + len(new_tokens)] = torch.tensor( + new_tokens, dtype=input_ids.dtype + ) + + # Construct logprobs: zeros for input tokens, actual logprobs for generated tokens + full_logprobs = torch.zeros(total_length, dtype=torch.float32) + if new_logprobs: + for idx, logprob in enumerate(new_logprobs): + position = input_len + idx + full_logprobs[position] = logprob + + output_ids_list.append(full_output) + logprobs_list.append(full_logprobs) + generation_lengths_list.append(generation_length) + unpadded_sequence_lengths_list.append(unpadded_length) + + # Stack into tensors + output_ids = torch.stack(output_ids_list) + logprobs = torch.stack(logprobs_list) + generation_lengths = torch.tensor(generation_lengths_list, dtype=torch.long) + unpadded_sequence_lengths = torch.tensor( + unpadded_sequence_lengths_list, dtype=torch.long + ) + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} Generated {total_generated_tokens} tokens across {batch_size} samples (avg: {avg_generation_length:.1f} tokens/sample)" + ) + return BatchedDataDict[GenerationOutputSpec]( + { + "output_ids": output_ids, + "generation_lengths": generation_lengths, + "unpadded_sequence_lengths": unpadded_sequence_lengths, + "logprobs": logprobs, + } + ) + + def sleep(self): + # TODO + pass + + def wake_up(self, **kwargs): + # TODO + pass + + def shutdown(self) -> bool: + """Shutdown the SGLang server process and cleanup async resources. + + Returns: + bool: True if shutdown was successful, False otherwise + """ + if not self.is_model_owner: + if hasattr(self, "async_loop_thread"): + try: + self.async_loop_thread.shutdown() + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Async loop thread shut down." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error shutting down async loop thread: {e}" + ) + return True + + try: + # Only model owners started a server process; they require sglang for shutdown. 
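+            # Shutdown order: close the aiohttp session, shut down the async loop thread, then terminate the server process tree.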
+ _, _, kill_process_tree = _require_sglang() + if hasattr(self, "session") and self.session is not None: + try: + + async def close_session(): + await self.session.close() + if self.connector is not None: + await self.connector.close() + + self.async_loop_thread.run(close_session()) + logger.info( + f"[SGLang Worker] Rank {self.global_rank} aiohttp session closed." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error closing aiohttp session: {e}" + ) + + # Shutdown async loop thread after session cleanup + if hasattr(self, "async_loop_thread"): + try: + self.async_loop_thread.shutdown() + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Async loop thread shut down." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error shutting down async loop thread: {e}" + ) + + if not hasattr(self, "server_process") or self.server_process is None: + return True + + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Shutting down server at {self.base_url}..." + ) + + if self.server_process.is_alive(): + kill_process_tree(self.server_process.pid) + + # Wait for the process to terminate + self.server_process.join(timeout=5.0) + + if self.server_process.is_alive(): + return False + return True + + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error during shutdown: {e}" + ) + return False + + def _make_request(self, endpoint: str, payload: Optional[dict] = None): + """Make a POST request to the specified endpoint with the given payload. + + Args: + endpoint: The API endpoint to call + payload: The JSON payload to send (default: empty dict) + + Returns: + The JSON response from the server + """ + # Use the stored base_url instead of constructing from server_args + url = f"{self.base_url}/{endpoint}" + headers = { + "Content-Type": "application/json; charset=utf-8", + } + response = requests.post(url, json=payload or {}, headers=headers, timeout=60) + response.raise_for_status() + return response.json() diff --git a/nemo_rl/models/generation/sglang/utils.py b/nemo_rl/models/generation/sglang/utils.py new file mode 100644 index 0000000000..7460302b5a --- /dev/null +++ b/nemo_rl/models/generation/sglang/utils.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import threading + + +class AsyncLoopThread: + """A background event loop thread for running async operations in Ray actors. + + This class creates a dedicated thread with its own event loop, allowing + synchronous Ray actor methods to execute async coroutines without blocking + the main actor thread. This is necessary because run_coroutine_threadsafe + requires the event loop to be in a different thread. 
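+
+    Example (illustrative usage; `some_coroutine` is a placeholder):
+        loop_thread = AsyncLoopThread()
+        result = loop_thread.run(some_coroutine())  # blocks until the coroutine finishes
+        loop_thread.shutdown()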
+ """ + + def __init__(self): + self.loop = asyncio.new_event_loop() + self._ready = threading.Event() + self._thread = threading.Thread(target=self._start_loop, daemon=True) + self._thread.start() + if not self._ready.wait(timeout=5.0): + raise RuntimeError("Event loop thread failed to start within 5 seconds") + + def _start_loop(self): + """Run the event loop in the background thread.""" + asyncio.set_event_loop(self.loop) + self._ready.set() + self.loop.run_forever() + + def run(self, coro): + """Schedule a coroutine onto the loop and block until it's done. + + Args: + coro: The coroutine to execute + + Returns: + The result of the coroutine + """ + if not self.loop.is_running(): + raise RuntimeError("Event loop is not running") + future = asyncio.run_coroutine_threadsafe(coro, self.loop) + result = future.result() + return result + + def shutdown(self): + """Shutdown the event loop and wait for the thread to finish.""" + if self.loop.is_running(): + self.loop.call_soon_threadsafe(self.loop.stop) + self._thread.join(timeout=2.0) + if not self.loop.is_closed(): + self.loop.close() diff --git a/nemo_rl/models/generation/vllm/vllm_generation.py b/nemo_rl/models/generation/vllm/vllm_generation.py index 93540ebe82..1366ce28c5 100644 --- a/nemo_rl/models/generation/vllm/vllm_generation.py +++ b/nemo_rl/models/generation/vllm/vllm_generation.py @@ -876,6 +876,14 @@ def clear_vllm_logger_metrics(self) -> None: ) ray.get(futures) + def clear_logger_metrics(self) -> None: + """Clear logger metrics for performance reporting.""" + self.clear_vllm_logger_metrics() + + def get_logger_metrics(self) -> dict[str, Any]: + """Get logger metrics for performance reporting.""" + return self.get_vllm_logger_metrics() + def __del__(self) -> None: """Shuts down the worker groups when the object is deleted or is garbage collected. diff --git a/nemo_rl/models/policy/interfaces.py b/nemo_rl/models/policy/interfaces.py index 144b0c517d..6e64c6289b 100644 --- a/nemo_rl/models/policy/interfaces.py +++ b/nemo_rl/models/policy/interfaces.py @@ -182,6 +182,18 @@ def stream_weights_via_ipc_zmq( ) -> list[ray.ObjectRef]: pass + def stream_weights_via_http( + self, sglang_url_to_gpu_uuids: dict[str, list[str]] + ) -> list[ray.ObjectRef]: + """Stream model weights to SGLang servers via HTTP API. + + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + raise NotImplementedError( + "stream_weights_via_http is not implemented for this policy worker" + ) + @abstractmethod def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 144683c95c..1f908824fe 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -768,6 +768,20 @@ def stream_weights_via_ipc_zmq( ) return futures + def stream_weights_via_http( + self, sglang_url_to_gpu_uuids: dict[str, list[str]] + ) -> list[ray.ObjectRef]: + """Send the weights to SGLang servers via HTTP API. 
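+
+        Dispatches the call to every policy worker and returns the resulting Ray futures.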
+ + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + futures = self.worker_group.run_all_workers_single_data( + "stream_weights_via_http", + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + ) + return futures + def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None ) -> list[ray.ObjectRef]: diff --git a/nemo_rl/models/policy/utils.py b/nemo_rl/models/policy/utils.py index 7ad33708a2..ad79f1a1d8 100644 --- a/nemo_rl/models/policy/utils.py +++ b/nemo_rl/models/policy/utils.py @@ -16,9 +16,11 @@ import os import traceback from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, cast +import requests import torch +import torch.distributed as dist import zmq from torch.multiprocessing.reductions import rebuild_cuda_tensor from transformers import ( @@ -473,3 +475,268 @@ def rebuild_cuda_tensor_from_ipc( list_args = list(args) list_args[6] = device_id return func(*list_args) + + +def stream_weights_via_http_impl( + params_generator, + sglang_url_to_gpu_uuids: dict[str, list[str]], + rank: int, + worker_name: str, + current_device_uuid: str, +) -> None: + """Stream weights to SGLang servers via HTTP API (update_weights_from_tensor). + + Flow: Each rank creates IPC handler → gather handlers in rank order → send list → SGLang matches by tp_rank index + + Key points: + - Each rank creates handler on its own GPU + - Handlers are gathered in rank order: [rank0_handler, rank1_handler, ...] + - List index = rank = GPU ID + - SGLang automatically matches: handler = serialized_handlers[tp_rank] + + Args: + params_generator: Generator yielding (name, tensor) pairs + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + rank: Worker rank for logging + worker_name: Name of the worker for logging + current_device_uuid: UUID of the current training worker's GPU + """ + from sglang.srt.utils import MultiprocessingSerializer # type: ignore[import-error] + + try: + from sglang.srt.utils.patch_torch import ( + monkey_patch_torch_reductions, # type: ignore[import-error] + ) + except ImportError: + from sglang.srt.patch_torch import ( + monkey_patch_torch_reductions, # type: ignore[import-error] + ) + print("[sglang refit details] entering stream_weights_via_http_impl") + + monkey_patch_torch_reductions() + + target_urls = [ + url + for url, uuids in sglang_url_to_gpu_uuids.items() + if current_device_uuid in uuids + ] + + if not target_urls: + raise RuntimeError( + f"{worker_name} (rank {rank}): No matching SGLang server found for GPU UUID {current_device_uuid}. " + f"Available servers: {list(sglang_url_to_gpu_uuids.keys())}" + ) + + if len(target_urls) > 1: + print( + f"[WARNING] {worker_name} (rank {rank}): GPU UUID {current_device_uuid} matches multiple SGLang servers: {target_urls}. 
" + f"Using the first one: {target_urls[0]}" + ) + target_urls = [target_urls[0]] + + base_url = target_urls[0] + url = f"{base_url}/update_weights_from_tensor" + sglang_gpu_uuids = sglang_url_to_gpu_uuids[base_url] + + ipc_gather_group, ipc_gather_src, matching_ranks = _setup_ipc_gather_group( + rank, current_device_uuid, sglang_gpu_uuids, sglang_url_to_gpu_uuids + ) + print( + f"[sglang refit] {worker_name} (rank {rank}): ipc_gather_group={ipc_gather_group}, ipc_gather_src={ipc_gather_src}, matching_ranks={matching_ranks}" + ) + tensor_count = 0 + + try: + tensor_list = list(params_generator) + total_tensors = len(tensor_list) + + if rank == ipc_gather_src: + print( + f"[sglang refit details] {worker_name}: Starting weight update - " + f"Total parameters to update: {total_tensors}", + flush=True, + ) + + for idx, (name, tensor) in enumerate(tensor_list): + torch.cuda.current_stream().synchronize() + tensor = tensor.contiguous().cuda() + + named_tensors = [(name, tensor)] + serialized_handler = MultiprocessingSerializer.serialize( + named_tensors, output_str=True + ) + # output_str=True ensures the return type is str + serialized_handler_str = cast(str, serialized_handler) + + gathered_handlers = _gather_ipc_handlers( + serialized_handler_str, + ipc_gather_group, + ipc_gather_src, + rank, + matching_ranks, + ) + + if rank == ipc_gather_src and gathered_handlers is not None: + _send_tensor_to_sglang( + url, + name, + gathered_handlers, + tensor.shape, + str(tensor.dtype), + flush_cache=False, + ) + tensor_count += 1 + + del tensor, serialized_handler + if rank == ipc_gather_src: + del gathered_handlers + torch.cuda.empty_cache() + + if rank == ipc_gather_src: + print( + f"[sglang refit details] {worker_name}: Weight update completed - " + f"Successfully updated {tensor_count}/{total_tensors} parameters to SGLang server: {base_url}", + flush=True, + ) + if tensor_count != total_tensors: + print( + f"[sglang refit details] {worker_name}: WARNING - Expected {total_tensors} tensors, " + f"but only sent {tensor_count}", + flush=True, + ) + + except Exception as e: + print( + f"{worker_name} (rank {rank}): Error during HTTP weight streaming: {e}.\n" + f"{traceback.format_exc()}" + ) + raise + + finally: + gc.collect() + torch.cuda.empty_cache() + + +def _setup_ipc_gather_group( + rank: int, + current_device_uuid: str, + sglang_gpu_uuids: list[str], + sglang_url_to_gpu_uuids: dict[str, list[str]], +) -> tuple[Optional[dist.ProcessGroup], Optional[int], Optional[list[int]]]: + """Setup gather configuration for IPC handlers. 
+ + Returns: + Tuple of (gather_group, gather_src_rank, matching_ranks) + - gather_group: None (use default FSDP group) + - gather_src_rank: The rank that will collect and send to SGLang server + - matching_ranks: List of ranks that belong to the same SGLang server + """ + if not dist.is_initialized(): + return None, None, None + + world_size = dist.get_world_size() + my_rank = dist.get_rank() + + all_ranks_uuids = [None] * world_size + dist.all_gather_object(all_ranks_uuids, current_device_uuid) + + matching_ranks = [ + r for r, uuid in enumerate(all_ranks_uuids) if uuid in sglang_gpu_uuids + ] + + if len(matching_ranks) == 0: + return None, None, None + + matching_ranks = sorted(matching_ranks) + gather_src = matching_ranks[0] + + return None, gather_src, matching_ranks + + +def _gather_ipc_handlers( + serialized_handler: str, + gather_group: Optional[dist.ProcessGroup], + gather_src: Optional[int], + rank: int, + matching_ranks: Optional[list[int]] = None, +) -> Optional[list[str]]: + """Gather IPC handlers from all ranks in the default FSDP group, then filter by server. + + Args: + serialized_handler: Serialized IPC handler from this rank + gather_group: Process group (None means use default FSDP group) + gather_src: Rank that will collect and filter handlers + rank: Current rank + matching_ranks: List of ranks that belong to the same SGLang server + + Returns: + List of serialized handlers in rank order (only on gather_src rank), None otherwise + The list contains handlers from matching_ranks only, in rank order + """ + if gather_src is None: + return None + + if not dist.is_initialized(): + return None + + world_size = dist.get_world_size() + + all_handlers: list[Optional[str]] = [None for _ in range(world_size)] + dist.all_gather_object(all_handlers, serialized_handler) + all_handlers_str = cast(list[str], all_handlers) + + if rank == gather_src and matching_ranks is not None: + filtered_handlers: list[str] = [all_handlers_str[r] for r in matching_ranks] + return filtered_handlers + else: + return None + + +def _send_tensor_to_sglang( + url: str, + tensor_name: str, + gathered_handlers: list[str], + shape: torch.Size, + dtype: str, + flush_cache: bool = False, +) -> None: + """Send gathered IPC handlers to SGLang server via HTTP. + + Key: gathered_handlers are in rank order [rank0, rank1, ...] 
+ SGLang will automatically match: handler = serialized_handlers[tp_rank] + + Args: + url: SGLang server URL + tensor_name: Name of the tensor + gathered_handlers: List of serialized IPC handlers in rank order + shape: Tensor shape + dtype: Tensor dtype + flush_cache: Whether to flush cache after this tensor (for last tensor) + """ + payload = { + "serialized_named_tensors": gathered_handlers, + "flush_cache": flush_cache, + } + + try: + response = requests.post( + url, + json=payload, + headers={"Content-Type": "application/json"}, + timeout=120, + ) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + error_msg = f"Failed to send tensor '{tensor_name}' to {url}: {e}" + try: + error_detail = response.text + error_msg += f"\nResponse status: {response.status_code}" + error_msg += f"\nResponse body: {error_detail[:500]}" + except: + pass + print(f"[sglang refit] {error_msg}", flush=True) + raise RuntimeError(error_msg) from e + except Exception as e: + raise RuntimeError( + f"Failed to send tensor '{tensor_name}' to {url}: {e}" + ) from e diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py index 785568cc76..76613dfb8a 100644 --- a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py @@ -1771,6 +1771,53 @@ def dtensor_params_generator(): worker_name=str(self), ) + @torch.no_grad() + @wrap_with_nvtx_name("dtensor_policy_worker_v2/stream_weights_via_http") + def stream_weights_via_http( + self, + sglang_url_to_gpu_uuids: dict[str, list[str]], + ) -> None: + """Stream model weights to SGLang servers via HTTP API. + + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + # Manually move model to cuda for cpu offload case + if self.cpu_offload: + self.model = self.move_to_cuda(self.model) + + from nemo_rl.models.policy.utils import stream_weights_via_http_impl + + # Get current GPU UUID + current_device_uuid = self.report_device_id() + + def dtensor_params_generator(): + """Generator that yields (name, tensor) pairs, converting DTensors to local tensors.""" + state_dict_items = sorted( + self.model.state_dict().items(), key=lambda x: x[0] + ) + for name, tensor in state_dict_items: + if isinstance(tensor, DTensor): + # Convert DTensor to full tensor for streaming + full_tensor = tensor.full_tensor() + # Convert to target dtype + yield ( + name, + full_tensor.to(self.dtype, non_blocking=True).contiguous(), + ) + else: + # Convert to target dtype + yield name, tensor.to(self.dtype, non_blocking=True).contiguous() + + # Use the HTTP implementation + stream_weights_via_http_impl( + params_generator=dtensor_params_generator(), + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + rank=self.rank, + worker_name=str(self), + current_device_uuid=current_device_uuid, + ) + @torch.no_grad() def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None diff --git a/pyproject.toml b/pyproject.toml index 19916dbf6c..80c3286e7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,11 @@ dependencies = [ "setuptools", "pip", # Required for frozen environments; uv venv --seed may not reliably install pip "ninja", # for flash-attn parallel build - "torch==2.9.0", + "torch==2.8.0", "triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", "colored==2.2.3", "ray[default]==2.49.2", - "transformers==4.57.1", + 
"transformers>=4.55.4", "wandb", "numpy", "datasets>=4.0.0", @@ -49,6 +49,7 @@ dependencies = [ "nvidia-nvshmem-cu12; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # for deep_ep build "swanlab", "pyzmq", + "coverage>=7.10.4", ] [project.optional-dependencies] @@ -58,13 +59,10 @@ automodel = [ # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 - "vllm==0.11.2", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved + "vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved "flash-attn==2.8.1", "mamba-ssm", "causal-conv1d", - "nv-grouped-gemm", - "transformer-engine[pytorch]==2.8.0", - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", ] vllm = [ "cuda-python", @@ -72,8 +70,8 @@ vllm = [ # deep_ep also needs libibverbs-dev # sudo apt-get update # sudo apt-get install libibverbs-dev - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", - "vllm==0.11.2", + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@e3908bf5bd0cc6265bcb225d15cd8c996d4759ef", + "vllm==0.11.0", "num2words>=0.5.14", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved "flash-attn==2.8.1", @@ -82,6 +80,26 @@ vllm = [ # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved "causal-conv1d", ] +sglang = [ + "sglang>=0.4.1", + "pybase64", + "orjson", + "uvloop", + "requests", + "openai", + "partial-json-parser", + "sentencepiece", + "sgl-kernel==0.3.17.post1", + "compressed-tensors", + "msgspec", + "python-multipart", + "torchao", + "xgrammar", + "interegular", + "openai-harmony", + "torch-memory-saver", + "einops", +] mcore = [ # also need cudnn (https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_network) # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb @@ -96,7 +114,7 @@ mcore = [ "megatron-core", "megatron-bridge", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "vllm==0.11.2", + "vllm==0.11.0", # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 @@ -109,7 +127,7 @@ nemo_gym = ["nemo_gym"] # This is a default group so that we install these even with bare `uv sync` build = [ # Build requirement for TE - "torch==2.9.0", + "torch==2.8.0", # Build requirement for TE "setuptools", "packaging", @@ -170,7 +188,6 @@ triton = [ ] causal-conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d", tag = "v1.5.0.post8" } mamba-ssm = { git = "https://github.com/state-spaces/mamba.git", rev = "2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" } -nv-grouped-gemm = { git = "https://github.com/fanshiqing/grouped_gemm", tag = "v1.1.4.post7" } [tool.uv.workspace] members = [ @@ -179,7 +196,7 @@ members = [ "3rdparty/Megatron-Bridge-workspace", 
"3rdparty/Gym-workspace", # Research projects are also added here in order for them to share the global root level uv.lock. - # If we don't do this, the research projects do not see the global uv.lock, and may mistakenly + # If we don't do this, the research projects do not see the global uv.lock, and may mistakenly # install numpy>=2.0 because nemo-rl's core [dependencies] do not pin numpy, but when you inspect # nemo-rl's uv.lock you'll see it's 1.X b/c megatron mandates 1.X in the optional dependencies, so # globally we must choose 1.X otherwise we run into pickle issues from ray. @@ -219,11 +236,12 @@ default-groups = ["dev", "build"] link-mode = "copy" # The TE override is needed because automodel/mbridge we are on is still on 2.5.0 # The opencv-python-headless override is needed because automodel pins it to 4.10.0.84, whereas vllm>=0.11.0 needs >= 4.11.0 +# The transformers override is needed since automodel is still on <=4.55.4 # The timm override is needed because current automodel pins to 1.0.16. This can be removed once we move ToT automodel -# The nvidia-modelopt override is needed because mcore is still on 0.33 override-dependencies = [ "transformer-engine[pytorch]==2.8.0", "opencv-python-headless>=4.11.0", + "transformers>=4.57.1", "timm<=1.0.22", "nvidia-modelopt[torch]>=0.39.0", ] @@ -267,7 +285,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[tool.uv.dependency-metadata]] name = "deep_ep" # This version has to match the version in the commit/rev/tag used -version = "v1.2.1+bfded34" +version = "v1.1.0+e3908bf" requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] @@ -279,7 +297,7 @@ requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] name = "nv-grouped-gemm" # This version has to match the version in the commit/rev/tag used -version = "v1.1.4.post7" +version = "1.1.4.post6" requires-dist = ["setuptools", "wheel", "torch", "numpy"] [tool.black] @@ -303,6 +321,7 @@ markers = [ "hf_gated: marks tests that require HuggingFace token access for gated models", "automodel: marks tests that require the automodel extra", "vllm: marks tests that require the vllm extra", + "sglang: marks tests that require the sglang extra", ] [tool.pyrefly] diff --git a/pyrefly.toml b/pyrefly.toml index 74f0f29ed9..e4476c03ea 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -103,6 +103,8 @@ project-includes = [ "nemo_rl/models/generation/vllm/config.py", "nemo_rl/models/generation/vllm/utils.py", "nemo_rl/models/generation/vllm/vllm_backend.py", + "nemo_rl/models/generation/sglang/__init__.py", + "nemo_rl/models/generation/sglang/config.py", "nemo_rl/models/huggingface/__init__.py", "nemo_rl/models/megatron/__init__.py", "nemo_rl/models/megatron/community_import.py", diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh index ec7527f583..095a01c447 100644 --- a/tests/functional/L1_Functional_Tests_GPU.sh +++ b/tests/functional/L1_Functional_Tests_GPU.sh @@ -31,6 +31,7 @@ time uv run --no-sync bash ./tests/functional/grpo_megatron.sh time uv run --no-sync bash ./tests/functional/grpo_megatron_generation.sh time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh time uv run --no-sync bash ./tests/functional/grpo_non_colocated.sh +time uv run --no-sync bash ./tests/functional/grpo_sglang.sh time uv run --no-sync bash ./tests/functional/dpo.sh time uv run --no-sync bash ./tests/functional/rm.sh time uv run --no-sync bash ./tests/functional/eval.sh diff --git 
a/tests/functional/grpo_sglang.sh b/tests/functional/grpo_sglang.sh new file mode 100755 index 0000000000..8e7d7608bd --- /dev/null +++ b/tests/functional/grpo_sglang.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) +# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run --extra sglang coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \ + $PROJECT_ROOT/examples/run_grpo_math.py \ + --config $PROJECT_ROOT/examples/configs/grpo_math_1B_sglang.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + grpo.num_prompts_per_step=2 \ + grpo.num_generations_per_prompt=4 \ + policy.train_global_batch_size=4 \ + policy.train_micro_batch_size=1 \ + cluster.gpus_per_node=1 \ + policy.generation.sglang_cfg.gpus_per_server=1 \ + grpo.max_num_steps=2 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'max(data["train/token_mult_prob_error"]) < 1.05' + diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh new file mode 100755 index 0000000000..47fd7eb186 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh @@ -0,0 +1,43 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +MAX_STEPS=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +# Using the same metrics thresholds as the vllm version to verify alignment +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["450"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 25' +fi + + diff --git a/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh b/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh new file mode 100755 index 0000000000..69c35eb54c --- /dev/null +++ 
b/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 30' +fi + diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index ee1fda01b1..24d16d1c62 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -7,6 +7,10 @@ tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh +# SGLang backend +tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh + # Dtensor (Qwen/Qwen2.5-7B-Instruct) tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh diff --git a/tests/unit/L0_Unit_Tests_Generation.sh b/tests/unit/L0_Unit_Tests_Generation.sh index e7b7a6e2ca..d30e051c66 100644 --- a/tests/unit/L0_Unit_Tests_Generation.sh +++ b/tests/unit/L0_Unit_Tests_Generation.sh @@ -45,3 +45,11 @@ if [[ $exit_code -eq 5 ]]; then else uv run --extra vllm bash -x ./tests/run_unit.sh unit/models/generation/ --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only fi + +# Check and run sglang tests +exit_code=$(uv run --extra sglang pytest tests/unit/models/generation/ --collect-only --hf-gated --sglang-only -q >/dev/null 2>&1; echo $?) 
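+# pytest exits with code 5 when no tests are collected; in that case there is nothing to run for sglang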
+if [[ $exit_code -eq 5 ]]; then + echo "No sglang tests to run" +else + uv run --extra sglang bash -x ./tests/run_unit.sh unit/models/generation/ --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only +fi diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ab3368185c..ebc5569f86 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -57,6 +57,12 @@ def pytest_addoption(parser): default=False, help="Run ONLY vllm tests", ) + parser.addoption( + "--sglang-only", + action="store_true", + default=False, + help="Run ONLY sglang tests", + ) def pytest_collection_modifyitems(config, items): @@ -65,12 +71,18 @@ def pytest_collection_modifyitems(config, items): run_mcore_only = config.getoption("--mcore-only") run_automodel_only = config.getoption("--automodel-only") run_vllm_only = config.getoption("--vllm-only") + run_sglang_only = config.getoption("--sglang-only") # Check for mutually exclusive options - exclusive_options = [run_mcore_only, run_automodel_only, run_vllm_only] + exclusive_options = [ + run_mcore_only, + run_automodel_only, + run_vllm_only, + run_sglang_only, + ] if sum(exclusive_options) > 1: raise ValueError( - "--mcore-only, --automodel-only, and --vllm-only are mutually exclusive" + "--mcore-only, --automodel-only, --vllm-only, and --sglang-only are mutually exclusive" ) marker_expr = config.getoption("-m", default="") @@ -140,6 +152,24 @@ def pytest_collection_modifyitems(config, items): # Exclude vllm tests by default new_items = [item for item in new_items if not item.get_closest_marker("vllm")] + # Filter by sglang marker + if run_sglang_only: + # Validate that sglang is available + try: + import sglang # noqa: F401 + except ImportError: + raise ImportError( + "Cannot run sglang tests: sglang is not available.\n" + "Please run tests with: uv run --extra sglang --group test pytest ..." + ) + # Include only sglang tests + new_items = [item for item in new_items if item.get_closest_marker("sglang")] + else: + # Exclude sglang tests by default + new_items = [ + item for item in new_items if not item.get_closest_marker("sglang") + ] + # Ensure run_first tests are prioritized new_items.sort(key=lambda item: 0 if item.get_closest_marker("run_first") else 1) diff --git a/tests/unit/models/generation/test_sglang_generation.py b/tests/unit/models/generation/test_sglang_generation.py new file mode 100644 index 0000000000..299bd8e3d6 --- /dev/null +++ b/tests/unit/models/generation/test_sglang_generation.py @@ -0,0 +1,927 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for SGLang generation backend. + +These tests verify that the SGLang generation backend produces sane outputs. +While not true unit tests, they validate the generation quality in unit test runs. 
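+
+These tests carry the `sglang` pytest marker and are expected to run with the sglang
+extra installed, for example:
+    uv run --extra sglang --group test pytest tests/unit/models/generation/ --sglang-only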
+""" + +import gc +from copy import deepcopy + +import pytest +import ray +import torch + +from nemo_rl.algorithms.utils import get_tokenizer +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.generation.sglang import SGLangConfig, SGLangGeneration + +model_name = "Qwen/Qwen3-0.6B" + +# Define basic SGLang test config +basic_sglang_test_config: SGLangConfig = { + "backend": "sglang", + "model_name": model_name, + "model_path": model_name, + "tokenizer": { + "name": model_name, + }, + "dtype": "bfloat16", + "max_new_tokens": 5, # Small number of tokens for testing + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "sglang_cfg": { + "model_path": model_name, + "gpus_per_server": 2, + "dtype": "bfloat16", + "context_length": 1024, + "log_level": "warning", + "skip_server_warmup": True, + "enable_memory_saver": False, + "dp_size": 1, + "pp_size": 1, + "ep_size": 1, + "mem_fraction_static": 0.7, + }, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, + }, + "sglang_kwargs": {}, +} + +# Basic DTensor test config for Policy tests +basic_dtensor_test_config = { + "model_name": model_name, + "tokenizer": { + "name": model_name, + }, + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "offload_optimizer_for_logprob": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": { + "_v2": True, # Use DTensorPolicyWorkerV2 for stream_weights_via_http + "enabled": True, + "cpu_offload": False, + "sequence_parallel": False, + "activation_checkpointing": False, + "tensor_parallel_size": 2, + "context_parallel_size": 1, + "custom_parallel_plan": None, + }, + "dynamic_batching": { + "enabled": True, + "train_mb_tokens": 40, + "logprob_mb_tokens": 40, + "sequence_length_round": 4, + }, + "sequence_packing": { + "enabled": False, + }, + "max_grad_norm": 1.0, + "make_sequence_length_divisible_by": 1, + "generation": deepcopy(basic_sglang_test_config), +} + + +def configure_sglang_config( + config: SGLangConfig, tokenizer, is_eval=True +) -> SGLangConfig: + """Apply specific configurations to SGLang config.""" + config = deepcopy(config) + config["_pad_token_id"] = tokenizer.pad_token_id + if config["stop_token_ids"] is None: + config["stop_token_ids"] = [tokenizer.eos_token_id] + return config + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing with 2 GPUs.""" + virtual_cluster = RayVirtualCluster( + bundle_ct_per_node_list=[2], + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=2, + name="sglang-test-cluster", + ) + yield virtual_cluster + virtual_cluster.shutdown() + + +@pytest.fixture(scope="function") +def tokenizer(): + """Initialize tokenizer for the test model.""" + tokenizer = get_tokenizer(basic_sglang_test_config["tokenizer"]) + return tokenizer + + +@pytest.fixture(scope="function") +def policy(cluster, tokenizer): + """Initialize the SGLang policy.""" + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + p = SGLangGeneration(cluster, sglang_config) + yield p + try: + p.shutdown() + gc.collect() + 
torch.cuda.empty_cache() + except Exception as e: + print(f"Error during policy cleanup: {e}") + + +@pytest.fixture(scope="function") +def test_input_data(tokenizer): + """Create test input data for inference.""" + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + # Tokenize prompts + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + # Calculate input lengths from attention mask + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + # Create input data dictionary + return BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + +@pytest.fixture(scope="function") +def policy_cluster_separate(): + """Create a virtual cluster for the Policy, using 2 GPUs.""" + cluster = RayVirtualCluster( + bundle_ct_per_node_list=[2], + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=2, + name="sglang-test-policy-cluster-separate", + ) + yield cluster + try: + cluster.shutdown() + except Exception as e: + print(f"Error during policy_cluster_separate shutdown: {e}") + + +def get_generation_cluster_separate(num_gpus_per_node: int = 2) -> RayVirtualCluster: + """Create a virtual cluster for the SGLangGeneration policy.""" + return RayVirtualCluster( + bundle_ct_per_node_list=[num_gpus_per_node], + use_gpus=True, + max_colocated_worker_groups=1, + num_gpus_per_node=num_gpus_per_node, + name="sglang-test-generation-cluster-separate", + ) + + +# ============================================================================= +# Basic Configuration Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(120) +def test_sglang_missing_required_config_key(cluster, tokenizer): + """Test that an error is raised when a required config key is missing.""" + # SGLang requires sglang_cfg to be present + incomplete_config = deepcopy(basic_sglang_test_config) + incomplete_config = configure_sglang_config(incomplete_config, tokenizer) + del incomplete_config["sglang_cfg"] + + with pytest.raises((KeyError, ValueError, AssertionError, TypeError)): + SGLangGeneration(cluster, incomplete_config) + + +@pytest.mark.sglang +def test_sglang_top_p_top_k_validation(cluster, tokenizer): + """Test that top_p and top_k values are accepted by SGLang. + + Note: SGLang may have different validation thresholds than vLLM. + This test verifies that reasonable sampling parameters are accepted. 
+ """ + # Test that reasonable top_p and top_k values are accepted + config = deepcopy(basic_sglang_test_config) + config["top_p"] = 0.95 + config["top_k"] = 50 + config = configure_sglang_config(config, tokenizer) + + policy = None + try: + policy = SGLangGeneration(cluster, config) + print("Successfully initialized with top_p=0.95 and top_k=50") + except Exception as e: + pytest.fail(f"Should not raise error with reasonable sampling params: {e}") + finally: + if policy: + policy.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +# ============================================================================= +# Basic Generation Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_policy_generation(policy, test_input_data, tokenizer): + """Test SGLang policy generation capabilities.""" + print("Testing SGLang generation...") + outputs = policy.generate(test_input_data) + + # Validate outputs format + assert "output_ids" in outputs, "output_ids not found in generation output" + assert "logprobs" in outputs, "logprobs not found in generation output" + assert "generation_lengths" in outputs, ( + "generation_lengths not found in generation output" + ) + assert "unpadded_sequence_lengths" in outputs, ( + "unpadded_sequence_lengths not found in generation output" + ) + + # Validate outputs shape and content + assert outputs["output_ids"].shape[0] == len(test_input_data["input_ids"]), ( + "Wrong batch size in output" + ) + assert outputs["generation_lengths"].shape[0] == len( + test_input_data["input_ids"] + ), "Wrong batch size in generation_lengths" + + # Decode and check outputs + generated_sequences = outputs["output_ids"] + generated_texts = tokenizer.batch_decode( + generated_sequences, skip_special_tokens=True + ) + + print(f"Generated texts: {generated_texts}") + + # All texts should have a non-zero length + assert all(len(text) > 0 for text in generated_texts), ( + "Some generated texts are empty" + ) + + +@pytest.mark.sglang +def test_sglang_worker_seed_behavior(cluster, tokenizer): + """ + Test that different workers generate different outputs for identical prompts due to different seeds. + This ensures proper randomization across distributed workers for diverse exploration in RLHF. + + Key: Use gpus_per_server=1 to create 2 independent SGLang servers (each with its own seed), + rather than 1 server with TP=2. 
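+
+    Illustrative expectation (assuming the duplicated prompts are sharded evenly across
+    the two servers): the first and second halves of the batch are served by different
+    servers with different seeds, so their sampled continuations should differ.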
+ """ + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + unique_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + # Create a batch where each prompt appears twice + # When sharded, different workers will get the same prompt + duplicated_prompts = unique_prompts + unique_prompts + + # Tokenize prompts + encodings = tokenizer( + duplicated_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + # Create input data dictionary + duplicated_batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Test with gpus_per_server=1 to create 2 independent servers with different seeds + print("Creating SGLang policy with gpus_per_server=1 (2 independent servers)...") + sglang_config = deepcopy(basic_sglang_test_config) + # Use gpus_per_server=1 to create 2 independent SGLang servers + sglang_config["sglang_cfg"]["gpus_per_server"] = 1 + sglang_config = configure_sglang_config(sglang_config, tokenizer) + + policy = SGLangGeneration(cluster, sglang_config) + policy.finish_generation() + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["dtensor_cfg"]["tensor_parallel_size"] = 1 # Match gpus_per_server + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + state_dict_info = lm_policy.prepare_refit_info() + policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation(lm_policy, policy, sglang_config["colocated"]["enabled"]) + + try: + # Generate with duplicated prompts + print("Running generation with duplicated prompts...") + outputs = policy.generate(duplicated_batch, greedy=False) + + # Decode the generated sequences + gen_texts = tokenizer.batch_decode( + outputs["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts with duplicated prompts: {gen_texts}") + + # Check if the duplicated prompts generated different texts + # The first half and second half should be different due to different worker seeds + first_half = gen_texts[: len(unique_prompts)] + second_half = gen_texts[len(unique_prompts) :] + + print(f"First worker outputs: {first_half}") + print(f"Second worker outputs: {second_half}") + + # At least one of the pairs should be different due to different seeds + assert first_half != second_half, ( + "Different workers should generate different outputs for identical prompts due to different seeds" + ) + + finally: + # Clean up resources + if "policy" in locals() and hasattr(policy, "shutdown"): + policy.shutdown() + if "lm_policy" in locals() and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + + # Force garbage collection + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +def test_sglang_policy_tensor_parallel(cluster, tokenizer): + """Test SGLang policy with tensor parallelism > 1 (gpus_per_server=2).""" + # Configure with gpus_per_server=2 for tensor parallelism + tp_config = deepcopy(basic_sglang_test_config) + tp_config = configure_sglang_config(tp_config, tokenizer) + tp_config["sglang_cfg"]["gpus_per_server"] = 2 # TP=2 + + sglang_policy = None + try: + sglang_policy = SGLangGeneration(cluster, tp_config) + + # Create simple test input + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=10, + 
truncation=True, + return_tensors="pt", + padding_side="right", + ) + + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + # Test generation with tensor parallelism + outputs = sglang_policy.generate(test_input_data) + + sglang_policy.finish_generation() + sglang_policy.prepare_for_generation() + + # Test generation again after cache reset + outputs = sglang_policy.generate(test_input_data) + + assert "output_ids" in outputs, "output_ids not found in generation output" + assert outputs["output_ids"].shape[0] == 2, "Wrong batch size in output" + + # Decode and check output + generated_text = tokenizer.decode( + outputs["output_ids"][0], skip_special_tokens=True + ) + print(f"Generated text with TP=2: {generated_text}") + assert len(generated_text) > 0, "Generated text is empty" + + finally: + # Clean up resources + if sglang_policy: + sglang_policy.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +def test_sglang_generate_text(cluster, tokenizer): + """Test that SGLang can generate coherent text. + + Note: SGLang doesn't have a generate_text method like vLLM, + so we use generate + tokenizer decode to verify text generation. + """ + # Prepare test data + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=10, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + # Create SGLang config with gpus_per_server=2 (using tensor parallelism) + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config["sglang_cfg"]["gpus_per_server"] = 2 + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + # Ensure correct model + assert sglang_config["model_name"] == "Qwen/Qwen3-0.6B", ( + "Model name should be Qwen/Qwen3-0.6B to get expected output" + ) + + sglang_generation = None + try: + # Create SGLang generation + sglang_generation = SGLangGeneration(cluster, sglang_config) + + # Generate with greedy decoding for deterministic output + output = sglang_generation.generate(test_input_data, greedy=True) + + # Decode generated text + generated_texts = tokenizer.batch_decode( + output["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts: {generated_texts}") + + # Verify we got non-empty text for each prompt + for i, text in enumerate(generated_texts): + assert len(text) > len(test_prompts[i]), ( + f"Generated text should be longer than input prompt: {text}" + ) + # Verify the generated text starts with or contains the prompt + print(f"Prompt: {test_prompts[i]} -> Generated: {text}") + + finally: + # Clean up + if sglang_generation: + sglang_generation.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +def _wait_for_sglang_http_server_spinup(base_url: str): + """Wait for the SGLang HTTP server to be ready.""" + import time + + import requests + + max_wait = 60 # 60 seconds max wait + start = time.time() + while time.time() - start < max_wait: + try: + response = requests.get(f"{base_url}/health_generate", timeout=5) + if response.status_code == 200: + return + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + pass + time.sleep(1) + raise TimeoutError(f"SGLang server at {base_url} did not start within 
{max_wait}s") + + +@pytest.mark.sglang +def test_sglang_http_server(cluster, tokenizer): + """Test that SGLang HTTP server works with direct API calls. + + SGLang exposes a /generate endpoint that accepts input_ids and sampling_params. + This test verifies we can make direct HTTP requests to the SGLang server. + """ + import requests + + # Create SGLang config + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + # Ensure correct model for reproducible output + assert sglang_config["model_name"] == "Qwen/Qwen3-0.6B", ( + "Model name should be Qwen/Qwen3-0.6B to get expected output" + ) + + sglang_generation = None + try: + # Create SGLang generation (this starts the servers) + sglang_generation = SGLangGeneration(cluster, sglang_config) + + # Get server URLs + base_urls = sglang_generation.get_sglang_server_urls() + print(f"SGLang server URLs: {base_urls}") + assert len(base_urls) >= 1, "Should have at least one SGLang server" + + # Wait for server to be ready + _wait_for_sglang_http_server_spinup(base_urls[0]) + + # Prepare input - tokenize "count to 5" + test_prompt = "count to 5" + input_ids = tokenizer.encode(test_prompt, add_special_tokens=True) + + # Build request payload for SGLang /generate endpoint + payload = { + "input_ids": input_ids, + "sampling_params": { + "temperature": 0.0, # Greedy for determinism + "top_p": 1.0, + "max_new_tokens": 5, + }, + "return_logprob": True, + } + + # Make request to SGLang server + response = requests.post( + url=f"{base_urls[0]}/generate", + json=payload, + headers={"Content-Type": "application/json"}, + timeout=30, + ) + actual_result = response.json() + print(f"SGLang response: {actual_result}") + + # Verify response structure + assert response.status_code == 200, f"Expected 200, got {response.status_code}" + assert "meta_info" in actual_result, "Response should contain meta_info" + + meta_info = actual_result["meta_info"] + assert "output_token_logprobs" in meta_info, ( + "meta_info should contain output_token_logprobs" + ) + + # Verify we got some generated tokens + output_token_logprobs = meta_info["output_token_logprobs"] + assert len(output_token_logprobs) > 0, ( + "Should have generated at least one token" + ) + + # Each entry should be [logprob, token_id] + first_token_info = output_token_logprobs[0] + assert len(first_token_info) >= 2, ( + "Each token info should have logprob and token_id" + ) + + logprob = first_token_info[0] + token_id = first_token_info[1] + assert isinstance(logprob, float), "Logprob should be a float" + assert isinstance(token_id, int), "Token ID should be an int" + + print(f"First generated token: id={token_id}, logprob={logprob}") + + # Decode the generated tokens to verify text output + generated_token_ids = [item[1] for item in output_token_logprobs] + generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True) + print(f"Generated text: {generated_text}") + + finally: + # Clean up + if sglang_generation: + sglang_generation.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_non_divisible_batch_handling(policy): + """Test that SGLang generation handles non divisible input batches correctly.""" + empty_batch = BatchedDataDict( + { + "input_ids": torch.zeros((1, 1), dtype=torch.long), + "input_lengths": torch.ones(1, dtype=torch.long), + } + ) + + outputs = policy.generate(empty_batch) + + required_keys = [ + "output_ids", + "logprobs", + 
"generation_lengths", + "unpadded_sequence_lengths", + ] + assert all(key in outputs for key in required_keys), ( + "Missing required output fields" + ) + assert all(outputs[key].shape[0] == 1 for key in required_keys), ( + "Output tensors should have batch dimension of 1" + ) + + +# ============================================================================= +# Policy Integration Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(300) +def test_sglang_generation_with_hf_training_colocated(cluster, tokenizer): + """Test that DTensor policy can work together with colocated SGLang policy.""" + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["train_global_batch_size"] = 4 + dtensor_config["dtensor_cfg"]["_v2"] = ( + True # Use DTensorPolicyWorkerV2 for stream_weights_via_http + ) + + sglang_policy = None + lm_policy = None + + try: + print("Creating SGLang policy...") + sglang_policy = SGLangGeneration(cluster, sglang_config) + sglang_policy.finish_generation() + + print("Creating DTensor policy...") + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + print("Preparing refit info...") + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation( + lm_policy, sglang_policy, sglang_config["colocated"]["enabled"] + ) + + # Test generation + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + outputs = sglang_policy.generate(test_input_data, greedy=True) + assert "output_ids" in outputs, "output_ids not found in generation output" + + generated_texts = tokenizer.batch_decode( + outputs["output_ids"], skip_special_tokens=True + ) + print(f"Generated texts: {generated_texts}") + + finally: + if sglang_policy: + sglang_policy.shutdown() + if lm_policy and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + + +@pytest.mark.skip(reason="Non-colocated mode not implemented for SGLang") +@pytest.mark.timeout(300) +@pytest.mark.sglang +def test_sglang_generation_with_hf_training_non_colocated( + policy_cluster_separate, tokenizer +): + """Test that DTensor policy can work together with non-colocated SGLang policy.""" + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + generation_cluster_separate = get_generation_cluster_separate(2) + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + sglang_config["colocated"]["enabled"] = False + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["generation"]["colocated"]["enabled"] = False + dtensor_config["train_global_batch_size"] = 4 + dtensor_config["dtensor_cfg"]["_v2"] = ( + True # Use DTensorPolicyWorkerV2 for stream_weights_via_http + ) + + sglang_policy = None + lm_policy = None + + try: + print("Creating SGLang 
policy...") + sglang_policy = SGLangGeneration(generation_cluster_separate, sglang_config) + sglang_policy.finish_generation() + + print("Creating DTensor policy...") + lm_policy = Policy(policy_cluster_separate, dtensor_config, tokenizer) + + # Initialize collective communication + ip, port = policy_cluster_separate.get_master_address_and_port() + train_world_size = policy_cluster_separate.world_size() + inference_world_size = generation_cluster_separate.world_size() + world_size = train_world_size + inference_world_size + + futures_train = lm_policy.init_collective( + ip, port, world_size=world_size, train_world_size=train_world_size + ) + futures_inference = sglang_policy.init_collective( + ip, port, world_size=world_size, train_world_size=train_world_size + ) + ray.get(futures_train + futures_inference) + + # Prepare refit info + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation(lm_policy, sglang_policy, False) + + # Test generation + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + outputs = sglang_policy.generate(test_input_data, greedy=True) + assert "output_ids" in outputs, "output_ids not found in generation output" + + finally: + if sglang_policy: + sglang_policy.shutdown() + if lm_policy and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + try: + generation_cluster_separate.shutdown() + except Exception as e: + print(f"Error during generation_cluster_separate shutdown: {e}") + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_weight_update_and_prefix_cache_reset(cluster, tokenizer): + """Test that the SGLang prefix cache is correctly reset when weights change.""" + from nemo_rl.models.policy.lm_policy import Policy + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + dtensor_config = basic_dtensor_test_config + + sglang_policy = None + lm_policy = None + + try: + print("Creating DTensor policy...") + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + print("Creating SGLang policy...") + sglang_policy = SGLangGeneration(cluster, sglang_config) + + print("Preparing refit info...") + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + # Prepare input data + text = "Answer the question. What is 2+2?" 
+ test_prompt = [text, text] + encodings = tokenizer( + test_prompt, + padding=True, + return_tensors="pt", + padding_side="right", + ) + input_ids = encodings["input_ids"] + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + test_input_data = BatchedDataDict( + {"input_ids": input_ids, "input_lengths": input_lengths} + ) + + print("Running Generation 1 (Initial)...") + sglang_policy.prepare_for_generation() + outputs1 = sglang_policy.generate(test_input_data, greedy=True) + logprob1 = outputs1["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 1): {logprob1}") + + print("Adding noise to weights in HF policy...") + ray.get( + [ + worker._add_noise_to_weights.remote() + for worker in lm_policy.worker_group.workers + ] + ) + + print("Updating SGLang weights from DTensor policy via HTTP...") + # Get SGLang server URL to GPU UUID mapping + sglang_url_to_gpu_uuids = sglang_policy.get_sglang_url_to_gpu_uuids() + print(f"SGLang URL to GPU UUIDs: {sglang_url_to_gpu_uuids}") + + # Stream weights via HTTP (CUDA IPC) + ray.get(lm_policy.stream_weights_via_http(sglang_url_to_gpu_uuids)) + + print("Running Generation 2 (Weights Updated)...") + outputs2 = sglang_policy.generate(test_input_data, greedy=True) + logprob2 = outputs2["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 2): {logprob2}") + assert logprob2 != logprob1, "Logprobs should be different after weight update." + + print("Resetting SGLang prefix cache...") + sglang_policy.finish_generation() + sglang_policy.prepare_for_generation() + + print("Running Generation 3 (Cache Reset)...") + outputs3 = sglang_policy.generate(test_input_data, greedy=True) + logprob3 = outputs3["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 3): {logprob3}") + + print("Prefix cache reset verified successfully.") + + finally: + print("Cleaning up resources...") + if sglang_policy: + sglang_policy.shutdown() + if lm_policy: + lm_policy.shutdown() + gc.collect() + torch.cuda.empty_cache() diff --git a/uv.lock b/uv.lock index f98bc2e21f..c03963443a 100644 --- a/uv.lock +++ b/uv.lock @@ -32,10 +32,10 @@ constraints = [ { name = "urllib3", specifier = ">=2.6.3" }, ] overrides = [ - { name = "nvidia-modelopt", extras = ["torch"], specifier = ">=0.39.0" }, { name = "opencv-python-headless", specifier = ">=4.11.0" }, { name = "timm", specifier = "<=1.0.22" }, { name = "transformer-engine", extras = ["pytorch"], specifier = "==2.8.0" }, + { name = "transformers", specifier = ">=4.57.1" }, ] [[manifest.dependency-metadata]] @@ -45,7 +45,7 @@ requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] name = "deep-ep" -version = "1.2.1+bfded34" +version = "1.1.0+e3908bf" requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] @@ -64,7 +64,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[manifest.dependency-metadata]] name = "nv-grouped-gemm" -version = "1.1.4.post7" +version = "1.1.4.post6" requires-dist = ["setuptools", "wheel", "torch", "numpy"] [[package]] @@ -87,8 +87,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", 
source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/66/be171836d86dc5b8698b3a9bf4b9eb10cb53369729939f88bf650167588b/accelerate-1.10.0.tar.gz", hash = "sha256:8270568fda9036b5cccdc09703fef47872abccd56eb5f6d53b54ea5fb7581496", size = 392261, upload-time = "2025-08-07T10:54:51.664Z" } wheels = [ @@ -309,25 +309,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] -[[package]] -name = "anthropic" -version = "0.71.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "distro" }, - { name = "docstring-parser" }, - { name = "httpx" }, - { name = "jiter" }, - { name = "pydantic" }, - { name = "sniffio" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/82/4f/70682b068d897841f43223df82d96ec1d617435a8b759c4a2d901a50158b/anthropic-0.71.0.tar.gz", hash = "sha256:eb8e6fa86d049061b3ef26eb4cbae0174ebbff21affa6de7b3098da857d8de6a", size = 489102, upload-time = "2025-10-16T15:54:40.08Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/77/073e8ac488f335aec7001952825275582fb8f433737e90f24eeef9d878f6/anthropic-0.71.0-py3-none-any.whl", hash = "sha256:85c5015fcdbdc728390f11b17642a65a4365d03b12b799b18b6cc57e71fdb327", size = 355035, upload-time = "2025-10-16T15:54:38.238Z" }, -] - [[package]] name = "antlr4-python3-runtime" version = "4.9.3" @@ -776,8 +757,8 @@ source = { git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8#82 dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -983,18 +964,18 @@ wheels = [ [[package]] name = "compressed-tensors" -version = "0.12.2" +version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "loguru" }, + { name = "frozendict" }, { name = "pydantic" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "transformers" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/a2/79/4c5c1cd14266f8cf2650bdb940f986ce7fcaeb56aad8cfa9e9afedf14e2f/compressed_tensors-0.12.2.tar.gz", hash = "sha256:5bb40856dd17f128ab73557ecc73799f80db4dd82fab6de875f1e6899b9ea0c4", size = 190409, upload-time = "2025-10-07T14:30:59.302Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/99/3fdabfc95609d6efdf02fa7f1ed0245524cb1209d3d4a17109d3205d2eed/compressed_tensors-0.11.0.tar.gz", hash = "sha256:95ddf19699f775df6494dd864e5f52e8a24f8015496520190c1a22c6cfc44b1f", size = 187566, upload-time = "2025-08-19T18:59:31.854Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/c0/1695b87d369e6652ec0d650912e02eca2151c5e9c29244f94d2afccfe970/compressed_tensors-0.12.2-py3-none-any.whl", hash = "sha256:e554ea761710ca2b0c0ea49276a4ef8e08658624f1591e6a7368817106b48fbe", size = 183049, upload-time = "2025-10-07T14:30:56.523Z" }, + { url = "https://files.pythonhosted.org/packages/d2/81/e3073017a8f5c75169e79108eda209e6089e3f96c9f197d307cbda7df71c/compressed_tensors-0.11.0-py3-none-any.whl", hash = "sha256:e1cbc46e1ae032b7ceea915fe18c8d2de5a54d3a50a607969b6bdfe703b6cb83", size = 179951, upload-time = "2025-08-19T18:59:29.308Z" }, ] [[package]] @@ -1243,10 +1224,10 @@ version = "25.3.2" source = { git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab#87a86aba72cfd2f0d8abecaf81c13c4528ea07d8" } dependencies = [ { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "triton", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, ] [[package]] @@ -1336,13 +1317,13 @@ wheels = [ [[package]] name = "deep-ep" -version = "1.2.1+bfded34" -source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480#bfded34800dfec415b71503f8205181de90b2480" } +version = "1.1.0+e3908bf" +source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef#e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" } dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -1352,8 +1333,8 @@ source = { git = 
"https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1 dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -1379,15 +1360,15 @@ wheels = [ [[package]] name = "depyf" -version = "0.20.0" +version = "0.19.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "astor" }, { name = "dill" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/88/35/83fb0178212279aa0af031031905804c6de5618435d229f41ed21bb9ad2c/depyf-0.20.0.tar.gz", hash = "sha256:fb7683bd72c44f67b56029df2c47721e9a02ffa4d7b19095f1c54c4ebf797a98", size = 6168761, upload-time = "2025-10-13T12:33:38.589Z" } +sdist = { url = "https://files.pythonhosted.org/packages/19/38/69157d711be575f1b9cf3177b64ef4ade44373fc02839f183fdd98ec2dd6/depyf-0.19.0.tar.gz", hash = "sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44", size = 6171405, upload-time = "2025-04-20T08:07:41.224Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/65/4df6936130b56e1429114e663e7c1576cf845f3aef1b2dd200c0a5d19dba/depyf-0.20.0-py3-none-any.whl", hash = "sha256:d31effad4261cebecb58955d832e448ace88f432328f95f82fd99c30fd9308d4", size = 39381, upload-time = "2025-10-13T12:33:33.647Z" }, + { url = "https://files.pythonhosted.org/packages/28/4d/1192acbcdc5e843f5e5d51f6e8788f2b60a9fe0b578ac385ded67a0b0b26/depyf-0.19.0-py3-none-any.whl", hash = "sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5", size = 39034, upload-time = "2025-04-20T08:07:37.036Z" }, ] [[package]] @@ -1489,15 +1470,6 @@ version = "0.6.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a2/55/8f8cab2afd404cf578136ef2cc5dfb50baa1761b68c9da1fb1e4eed343c9/docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491", size = 25901, upload-time = "2014-06-16T11:18:57.406Z" } -[[package]] -name = "docstring-parser" -version = "0.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, -] - [[package]] name = "docutils" version = "0.21.2" @@ -1544,8 +1516,8 @@ version = "0.1.0" source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0#d5363b4a418128cd8111983b191c4b8869a9766b" } dependencies = [ { name = "absl-py" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = 
"sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "typing-extensions" }, ] @@ -1672,8 +1644,8 @@ version = "0.3.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "einops" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/67/c6/10a1149b07e6bab45b2cb2d07f6b827716c2baf5f3404161753f25c6389b/fla_core-0.3.2.tar.gz", hash = "sha256:d38db16bc4e1c6fa8c04df442f246da1e6926a209426bc6ef703d41bfbc37c92", size = 296725, upload-time = "2025-09-10T07:43:40.155Z" } wheels = [ @@ -1689,8 +1661,8 @@ dependencies = [ { name = "ninja" }, { name = "psutil" }, { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e8/6d/7066d160bdffa2f9da29a8c3957f266b17a03ca0b3bdc8fdae86d9881fe7/flash_attn-2.8.1.tar.gz", hash = "sha256:0ff003899fcb244f357905b04f622d5c9736887126dd6675f8f4bc52954e3923", size = 8166563, upload-time = "2025-07-10T05:16:39.729Z" } @@ -1725,8 +1697,8 @@ dependencies = [ { name = "packaging" }, { name = "requests" }, { name = "tabulate" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d8/04/e357eaa50238e12c49e66fcf47f83e066e741ef19a117c136782b32eafbb/flashinfer_python-0.5.2.tar.gz", hash = "sha256:99d097a28be1e98c7f85e4a767e9e9a4794374f9318c27db14d21e367149063f", size = 4632657, upload-time = "2025-11-07T02:53:27.261Z" } @@ -1805,6 +1777,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/0f/64/9d606e66d498917cd7a2ff24f558010d42d6fd4576d9dd57f0bd98333f5a/fonttools-4.59.1-py3-none-any.whl", hash = "sha256:647db657073672a8330608970a984d51573557f328030566521bc03415535042", size = 1130094, upload-time = "2025-08-14T16:28:12.048Z" }, ] +[[package]] +name = "frozendict" +version = "2.4.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/b2/2a3d1374b7780999d3184e171e25439a8358c47b481f68be883c14086b4c/frozendict-2.4.7.tar.gz", hash = "sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd", size = 317082, upload-time = "2025-11-11T22:40:14.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/74/f94141b38a51a553efef7f510fc213894161ae49b88bffd037f8d2a7cb2f/frozendict-2.4.7-py3-none-any.whl", hash = "sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550", size = 16264, upload-time = "2025-11-11T22:40:12.836Z" }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -2441,6 +2422,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635, upload-time = "2024-01-06T23:01:20.829Z" }, ] +[[package]] +name = "ipython" +version = "9.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/12/51/a703c030f4928646d390b4971af4938a1b10c9dfce694f0d99a0bb073cb2/ipython-9.8.0.tar.gz", hash = "sha256:8e4ce129a627eb9dd221c41b1d2cdaed4ef7c9da8c17c63f6f578fe231141f83", size = 4424940, upload-time = "2025-12-03T10:18:24.353Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/df/8ee1c5dd1e3308b5d5b2f2dfea323bb2f3827da8d654abb6642051199049/ipython-9.8.0-py3-none-any.whl", hash = "sha256:ebe6d1d58d7d988fbf23ff8ff6d8e1622cfdb194daf4b7b73b792c4ec3b85385", size = 621374, upload-time = "2025-12-03T10:18:22.335Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + [[package]] name = "itsdangerous" version = "2.2.0" @@ -2450,6 +2464,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = 
"sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, ] +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -2692,9 +2718,9 @@ name = "liger-kernel" version = "0.6.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, { name = "triton", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or sys_platform == 'win32'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/31/23/be0b4dcac42d77f99406c906567cde22a7a3d71b3f3ffdfda2ac6153ec36/liger_kernel-0.6.2.tar.gz", hash = "sha256:5c5bcffffa769bc26ae838f5a4954170dd5cacde036abb1b383039f39fa5fd69", size = 3679495, upload-time = "2025-08-22T00:15:28.456Z" } wheels = [ @@ -2703,15 +2729,15 @@ wheels = [ [[package]] name = "llguidance" -version = "1.3.0" +version = "0.7.30" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/38/d1ef3ae08d8d857e5e0690c5b1e07bf7eb4a1cae5881d87215826dc6cadb/llguidance-0.7.30.tar.gz", hash = "sha256:e93bf75f2b6e48afb86a5cee23038746975e1654672bf5ba0ae75f7d4d4a2248", size = 1055528, upload-time = "2025-06-23T00:23:49.247Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" }, - { url = 
"https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" }, - { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" }, - { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" }, - { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" }, + { url = "https://files.pythonhosted.org/packages/b3/e1/694c89986fcae7777184fc8b22baa0976eba15a6847221763f6ad211fc1f/llguidance-0.7.30-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c80af02c118d2b0526bcecaab389af2ed094537a069b0fc724cd2a2f2ba3990f", size = 3327974, upload-time = "2025-06-23T00:23:47.556Z" }, + { url = "https://files.pythonhosted.org/packages/fd/77/ab7a548ae189dc23900fdd37803c115c2339b1223af9e8eb1f4329b5935a/llguidance-0.7.30-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:00a256d532911d2cf5ba4ef63e182944e767dd2402f38d63002016bc37755958", size = 3210709, upload-time = "2025-06-23T00:23:45.872Z" }, + { url = "https://files.pythonhosted.org/packages/9c/5b/6a166564b14f9f805f0ea01ec233a84f55789cb7eeffe1d6224ccd0e6cdd/llguidance-0.7.30-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8741c867e4bc7e42f7cdc68350c076b4edd0ca10ecefbde75f15a9f6bc25d0", size = 14867038, upload-time = "2025-06-23T00:23:39.571Z" }, + { url = "https://files.pythonhosted.org/packages/af/80/5a40b9689f17612434b820854cba9b8cabd5142072c491b5280fe5f7a35e/llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9edc409b9decd6cffba5f5bf3b4fbd7541f95daa8cbc9510cbf96c6ab1ffc153", size = 15004926, upload-time = "2025-06-23T00:23:43.965Z" }, + { url = "https://files.pythonhosted.org/packages/99/47/58e49a118b514855b245f8a962c6aaf9a5cc95a0f61eac7e230e691c7b7e/llguidance-0.7.30-cp39-abi3-win_amd64.whl", hash = "sha256:05234ecceea7c9c6ff13b9739112043173a3bcb88cae860249b20335a07b3075", size = 2796878, upload-time = "2025-06-23T00:23:51Z" }, ] [[package]] @@ -2747,19 +2773,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/ef/11292bb0b85cf4c93447cab5a29f64576ed14d3ab4280e35ddd23486594a/lm_format_enforcer-0.11.3-py3-none-any.whl", hash = "sha256:cf586350875def1ae7a8fba84fcbbfc8371424b6c9d05c1fcba70aa233fbf06f", size = 45418, upload-time = "2025-08-24T19:37:46.325Z" }, ] -[[package]] -name = "loguru" -version = "0.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "win32-setctime", marker = "sys_platform == 'win32'" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, -] - [[package]] name = "lxml" version = "6.0.0" @@ -2820,8 +2833,8 @@ dependencies = [ { name = "causal-conv1d" }, { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -2949,6 +2962,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/60/3601f8ce6d76a7c81c7f25a0e15fde0d6b66226dd187aa6d2838e6374161/matplotlib-3.10.5-cp314-cp314t-win_arm64.whl", hash = "sha256:2efaf97d72629e74252e0b5e3c46813e9eeaa94e011ecf8084a971a31a97f40b", size = 8153849, upload-time = "2025-07-31T18:09:19.673Z" }, ] +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + [[package]] name = "mdit-py-plugins" version = "0.5.0" @@ -3037,7 +3062,7 @@ dependencies = [ { name = "multi-storage-client" }, { name = "numpy" }, { name = "nv-grouped-gemm" }, - { name = "nvidia-modelopt" }, + { name = "nvidia-modelopt", marker = "sys_platform != 'darwin'" }, { name = "nvidia-resiliency-ext" }, { name = "nvtx" }, { name = "onnxscript" }, @@ -3046,8 +3071,8 @@ dependencies = [ { name = "setuptools" }, { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, { name = "tensorstore", version = "0.1.76", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = 
"2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "transformer-engine", extra = ["pytorch"] }, { name = "wget" }, @@ -3065,7 +3090,7 @@ requires-dist = [ { name = "megatron-energon", extras = ["av-decode"], specifier = "~=6.0" }, { name = "multi-storage-client", specifier = "~=0.27" }, { name = "numpy", specifier = "<2.0.0" }, - { name = "nv-grouped-gemm", git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7" }, + { name = "nv-grouped-gemm", specifier = "~=1.1" }, { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin'", specifier = ">=0.33.0a0,<0.34.0" }, { name = "nvidia-resiliency-ext", specifier = ">=0.4.0a0,<0.5.0" }, { name = "nvtx", specifier = "~=0.2" }, @@ -3093,8 +3118,8 @@ dependencies = [ { name = "pillow" }, { name = "pyyaml" }, { name = "s3fs" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "webdataset" }, ] @@ -3120,8 +3145,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a0/be/06ada3d765ebca304e2d87873d6cf00807b43155ed57058abcd813d13a5d/megatron_fsdp-0.1.0rc1.tar.gz", hash = "sha256:4852a1c62bb95b5fc9567165ee7119f2e68bc75d6103af06bd1e6d392a50021f", size = 71600, upload-time = "2025-09-02T21:29:10.757Z" } wheels = [ @@ -3148,6 +3173,10 @@ wheels = [ ] [package.optional-dependencies] +audio = [ + { name = "soundfile" }, + { name = "soxr" }, +] image = [ { name = "opencv-python-headless" }, ] @@ -3338,21 +3367,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/59/8e4dee2893a56fc68a27eec7ec7ed9559c7ea01099313a9b8196373bf3cf/mlx_metal-0.28.0-py3-none-macosx_15_0_arm64.whl", hash = "sha256:214ece3781d44f57eb9686561594b28915ec5568df4a5a73da59c66880b204ed", size = 33167706, upload-time = "2025-08-07T07:53:03.852Z" }, ] -[[package]] -name = "model-hosting-container-standards" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "fastapi" }, - { name = "httpx" }, - { name = "jmespath" }, - { name = "pydantic" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/d0/eaba9ff13f7a534bf2c0f28e4e32dee58583dc3a31fe3eebb3b93ed13675/model_hosting_container_standards-0.1.4.tar.gz", hash = "sha256:86838d16e4d05bc6fdafdf83dc292a9d34124b63584764ad6cd67b05d09cda62", size = 63332, 
upload-time = "2025-11-10T17:58:37.321Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/fc/d6034069e52003ed86f72e436b65f16084fa4d08c6b8220bc0fc85e33eab/model_hosting_container_standards-0.1.4-py3-none-any.whl", hash = "sha256:ede565ba750e812eef028804c84b8244a96fb733fcaec9a1e552568df809d841", size = 86597, upload-time = "2025-11-10T17:58:35.843Z" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -3561,8 +3575,8 @@ dependencies = [ { name = "opencv-python-headless" }, { name = "pybind11" }, { name = "pyyaml" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchao" }, { name = "torchdata" }, { name = "transformers" }, @@ -3608,8 +3622,8 @@ vlm = [ [package.dev-dependencies] build = [ { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] dev = [ { name = "cut-cross-entropy" }, @@ -3749,6 +3763,7 @@ dependencies = [ { name = "accelerate" }, { name = "blobfile" }, { name = "colored" }, + { name = "coverage" }, { name = "datasets" }, { name = "debugpy" }, { name = "hydra-core" }, @@ -3773,26 +3788,23 @@ dependencies = [ { name = "sympy" }, { name = "tensorboard" }, { name = "tiktoken" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchdata" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = 
"0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "transformers" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "wandb" }, ] [package.optional-dependencies] automodel = [ { name = "causal-conv1d" }, - { name = "deep-ep" }, { name = "flash-attn" }, { name = "mamba-ssm" }, { name = "nemo-automodel" }, - { name = "nv-grouped-gemm" }, - { name = "transformer-engine", extra = ["pytorch"] }, { name = "vllm" }, ] mcore = [ @@ -3805,6 +3817,26 @@ mcore = [ nemo-gym = [ { name = "nemo-gym" }, ] +sglang = [ + { name = "compressed-tensors" }, + { name = "einops" }, + { name = "interegular" }, + { name = "msgspec" }, + { name = "openai" }, + { name = "openai-harmony" }, + { name = "orjson" }, + { name = "partial-json-parser" }, + { name = "pybase64" }, + { name = "python-multipart" }, + { name = "requests" }, + { name = "sentencepiece" }, + { name = "sgl-kernel" }, + { name = "sglang" }, + { name = "torch-memory-saver" }, + { name = "torchao" }, + { name = "uvloop" }, + { name = "xgrammar" }, +] vllm = [ { name = "causal-conv1d" }, { name = "cuda-python" }, @@ -3824,8 +3856,8 @@ build = [ { name = "psutil" }, { name = "pybind11" }, { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] dev = [ { name = "pre-commit" }, @@ -3862,16 +3894,19 @@ requires-dist = [ { name = "causal-conv1d", marker = "extra == 'automodel'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, { name = "causal-conv1d", marker = "extra == 'vllm'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, { name = "colored", specifier = "==2.2.3" }, + { name = "compressed-tensors", marker = "extra == 'sglang'" }, + { name = "coverage", specifier = ">=7.10.4" }, { name = "cuda-python", marker = "extra == 'vllm'" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "debugpy" }, - { name = "deep-ep", marker = "extra == 'automodel'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, - { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, + { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" }, { name = 
"deep-gemm", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" }, + { name = "einops", marker = "extra == 'sglang'" }, { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.8.1" }, { name = "hydra-core" }, + { name = "interegular", marker = "extra == 'sglang'" }, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "math-verify" }, @@ -3879,43 +3914,56 @@ requires-dist = [ { name = "megatron-bridge", marker = "extra == 'mcore'", editable = "3rdparty/Megatron-Bridge-workspace" }, { name = "megatron-core", marker = "extra == 'mcore'", editable = "3rdparty/Megatron-LM-workspace" }, { name = "mlflow", specifier = ">=3.5.0,<3.6.0" }, + { name = "msgspec", marker = "extra == 'sglang'" }, { name = "nemo-automodel", marker = "extra == 'automodel'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "nemo-gym", marker = "extra == 'nemo-gym'", editable = "3rdparty/Gym-workspace" }, { name = "ninja" }, { name = "num2words", specifier = ">=0.5.14" }, { name = "num2words", marker = "extra == 'vllm'", specifier = ">=0.5.14" }, { name = "numpy" }, - { name = "nv-grouped-gemm", marker = "extra == 'automodel'", git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7" }, { name = "nvidia-ml-py" }, { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "nvtx" }, { name = "omegaconf" }, + { name = "openai", marker = "extra == 'sglang'" }, + { name = "openai-harmony", marker = "extra == 'sglang'" }, + { name = "orjson", marker = "extra == 'sglang'" }, + { name = "partial-json-parser", marker = "extra == 'sglang'" }, { name = "pillow", specifier = ">=11.3.0" }, { name = "pip" }, { name = "plotly" }, + { name = "pybase64", marker = "extra == 'sglang'" }, + { name = "python-multipart", marker = "extra == 'sglang'" }, { name = "pyzmq" }, { name = "ray", extras = ["default"], specifier = "==2.49.2" }, + { name = "requests", marker = "extra == 'sglang'" }, { name = "rich" }, + { name = "sentencepiece", marker = "extra == 'sglang'" }, { name = "setuptools" }, + { name = "sgl-kernel", marker = "extra == 'sglang'", specifier = "==0.3.17.post1" }, + { name = "sglang", marker = "extra == 'sglang'", specifier = ">=0.4.1" }, { name = "swanlab" }, { name = "sympy", specifier = ">=1.14.0" }, { name = "tensorboard" }, { name = "tiktoken" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.9.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.8.0", index = "https://pypi.org/simple" }, + { name = "torch-memory-saver", marker = "extra == 'sglang'" }, + { name = "torchao", marker = "extra == 'sglang'" }, { name = "torchdata" }, { name = 
"torchvision", marker = "sys_platform != 'darwin'", specifier = ">=0.22.0", index = "https://download.pytorch.org/whl/cu129" }, { name = "torchvision", marker = "sys_platform == 'darwin'", specifier = ">=0.22.0", index = "https://pypi.org/simple" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'automodel'", specifier = "==2.8.0" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'mcore'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "==4.57.1" }, + { name = "transformers", specifier = ">=4.55.4" }, { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')", index = "https://download.pytorch.org/whl/cu129" }, - { name = "vllm", marker = "extra == 'automodel'", specifier = "==0.11.2" }, - { name = "vllm", marker = "extra == 'mcore'", specifier = "==0.11.2" }, - { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.11.2" }, + { name = "uvloop", marker = "extra == 'sglang'" }, + { name = "vllm", marker = "extra == 'automodel'", specifier = "==0.11.0" }, + { name = "vllm", marker = "extra == 'mcore'", specifier = "==0.11.0" }, + { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.11.0" }, { name = "wandb" }, + { name = "xgrammar", marker = "extra == 'sglang'" }, ] -provides-extras = ["automodel", "vllm", "mcore", "nemo-gym"] +provides-extras = ["automodel", "vllm", "sglang", "mcore", "nemo-gym"] [package.metadata.requires-dev] build = [ @@ -3925,8 +3973,8 @@ build = [ { name = "psutil" }, { name = "pybind11" }, { name = "setuptools" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.9.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.8.0", index = "https://pypi.org/simple" }, ] dev = [ { name = "pre-commit", specifier = ">=4.2.0" }, @@ -4053,21 +4101,20 @@ wheels = [ [[package]] name = "nv-grouped-gemm" version = "1.1.4.post7" -source = { git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7#6dfaf60e6112166b8b82e9210b51c7f557956f0a" } +source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "absl-py" }, { name = "numpy" }, - { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "wheel" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/63/36/13d0a1e1af31c3b2a297c15b6e7da532b13361730b32d11d9698854bdbe3/nv_grouped_gemm-1.1.4.post7.tar.gz", hash = "sha256:bc9f7906c9b0bd7fefea5a776acbc277577c65b103181340fd26ca2b8460c6a5", size = 26520, upload-time = "2025-12-16T19:42:33.176Z" } [[package]] name = "nvidia-cublas-cu12" version = "12.9.1.4" source = { registry = "https://pypi.org/simple" } wheels = 
[ - { url = "https://files.pythonhosted.org/packages/82/6c/90d3f532f608a03a13c1d6c16c266ffa3828e8011b1549d3b61db2ad59f5/nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7a950dae01add3b415a5a5cdc4ec818fb5858263e9cca59004bb99fdbbd3a5d6", size = 575006342, upload-time = "2025-06-05T20:04:16.902Z" }, { url = "https://files.pythonhosted.org/packages/77/3c/aa88abe01f3be3d1f8f787d1d33dc83e76fec05945f9a28fbb41cfb99cd5/nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:453611eb21a7c1f2c2156ed9f3a45b691deda0440ec550860290dc901af5b4c2", size = 581242350, upload-time = "2025-06-05T20:04:51.979Z" }, ] @@ -4076,7 +4123,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/78/351b5c8cdbd9a6b4fb0d6ee73fb176dcdc1b6b6ad47c2ffff5ae8ca4a1f7/nvidia_cuda_cupti_cu12-12.9.79-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:791853b030602c6a11d08b5578edfb957cadea06e9d3b26adbf8d036135a4afe", size = 10077166, upload-time = "2025-06-05T20:01:01.385Z" }, { url = "https://files.pythonhosted.org/packages/c1/2e/b84e32197e33f39907b455b83395a017e697c07a449a2b15fd07fc1c9981/nvidia_cuda_cupti_cu12-12.9.79-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:096bcf334f13e1984ba36685ad4c1d6347db214de03dbb6eebb237b41d9d934f", size = 10814997, upload-time = "2025-06-05T20:01:10.168Z" }, ] @@ -4086,7 +4132,6 @@ version = "12.9.86" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/b8/85/e4af82cc9202023862090bfca4ea827d533329e925c758f0cde964cb54b7/nvidia_cuda_nvrtc_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:210cf05005a447e29214e9ce50851e83fc5f4358df8b453155d5e1918094dcb4", size = 89568129, upload-time = "2025-06-05T20:02:41.973Z" }, - { url = "https://files.pythonhosted.org/packages/64/eb/c2295044b8f3b3b08860e2f6a912b702fc92568a167259df5dddb78f325e/nvidia_cuda_nvrtc_cu12-12.9.86-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:096d4de6bda726415dfaf3198d4f5c522b8e70139c97feef5cd2ca6d4cd9cead", size = 44528905, upload-time = "2025-06-05T20:02:29.754Z" }, ] [[package]] @@ -4094,7 +4139,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bc/e0/0279bd94539fda525e0c8538db29b72a5a8495b0c12173113471d28bce78/nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83469a846206f2a733db0c42e223589ab62fd2fabac4432d2f8802de4bded0a4", size = 3515012, upload-time = "2025-06-05T20:00:35.519Z" }, { url = "https://files.pythonhosted.org/packages/bc/46/a92db19b8309581092a3add7e6fceb4c301a3fd233969856a8cbf042cd3c/nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25bba2dfb01d48a9b59ca474a1ac43c6ebf7011f1b0b8cc44f54eb6ac48a96c3", size = 3493179, upload-time = "2025-06-05T20:00:53.735Z" }, ] @@ -4103,10 +4147,9 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = 
"https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ] @@ -4128,10 +4171,9 @@ name = "nvidia-cufft-cu12" version = "11.4.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/2b/76445b0af890da61b501fde30650a1a4bd910607261b209cccb5235d3daa/nvidia_cufft_cu12-11.4.1.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1a28c9b12260a1aa7a8fd12f5ebd82d027963d635ba82ff39a1acfa7c4c0fbcf", size = 200822453, upload-time = "2025-06-05T20:05:27.889Z" }, { url = "https://files.pythonhosted.org/packages/95/f4/61e6996dd20481ee834f57a8e9dca28b1869366a135e0d42e2aa8493bdd4/nvidia_cufft_cu12-11.4.1.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c67884f2a7d276b4b80eb56a79322a95df592ae5e765cf1243693365ccab4e28", size = 200877592, upload-time = "2025-06-05T20:05:45.862Z" }, ] @@ -4141,7 +4183,6 @@ version = "1.14.1.1" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/28/b960e06d705a440c030edd84e16888ee14c743390bdb2a6368e92ffe8ef8/nvidia_cufile_cu12-1.14.1.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9552e2231792e94b1ff17bc99e958cc0e6bbbaa4a9d91fa2dbeed97716628fe6", size = 1210714, upload-time = "2025-06-05T20:06:11.898Z" }, - { url = "https://files.pythonhosted.org/packages/b9/d2/110af3a1f77999d5eebf6ffae5d2305ab839e53c76eec3696640cc25b35d/nvidia_cufile_cu12-1.14.1.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8dea77590761e02cb6dd955a57cb6414c58aa3cb1b7adbf9919869a11509cf65", size = 1135994, upload-time = "2025-06-05T20:06:03.952Z" }, ] [[package]] @@ -4149,7 +4190,6 @@ name = "nvidia-curand-cu12" version = "10.3.10.19" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/14/1c/2a45afc614d99558d4a773fa740d8bb5471c8398eeed925fc0fcba020173/nvidia_curand_cu12-10.3.10.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:de663377feb1697e1d30ed587b07d5721fdd6d2015c738d7528a6002a6134d37", size = 68292066, upload-time = "2025-05-01T19:39:13.595Z" }, { url = "https://files.pythonhosted.org/packages/31/44/193a0e171750ca9f8320626e8a1f2381e4077a65e69e2fb9708bd479e34a/nvidia_curand_cu12-10.3.10.19-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:49b274db4780d421bd2ccd362e1415c13887c53c214f0d4b761752b8f9f6aa1e", size = 68295626, upload-time = "2025-05-01T19:39:38.885Z" }, ] @@ -4158,12 +4198,11 @@ name = "nvidia-cusolver-cu12" version = "11.7.5.82" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, - { name = 
"nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/03/99/686ff9bf3a82a531c62b1a5c614476e8dfa24a9d89067aeedf3592ee4538/nvidia_cusolver_cu12-11.7.5.82-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:62efa83e4ace59a4c734d052bb72158e888aa7b770e1a5f601682f16fe5b4fd2", size = 337869834, upload-time = "2025-06-05T20:06:53.125Z" }, { url = "https://files.pythonhosted.org/packages/33/40/79b0c64d44d6c166c0964ec1d803d067f4a145cca23e23925fd351d0e642/nvidia_cusolver_cu12-11.7.5.82-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:15da72d1340d29b5b3cf3fd100e3cd53421dde36002eda6ed93811af63c40d88", size = 338117415, upload-time = "2025-06-05T20:07:16.809Z" }, ] @@ -4172,10 +4211,9 @@ name = "nvidia-cusparse-cu12" version = "12.5.10.65" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/6f/8710fbd17cdd1d0fc3fea7d36d5b65ce1933611c31e1861da330206b253a/nvidia_cusparse_cu12-12.5.10.65-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:221c73e7482dd93eda44e65ce567c031c07e2f93f6fa0ecd3ba876a195023e83", size = 366359408, upload-time = "2025-06-05T20:07:42.501Z" }, { url = "https://files.pythonhosted.org/packages/12/46/b0fd4b04f86577921feb97d8e2cf028afe04f614d17fb5013de9282c9216/nvidia_cusparse_cu12-12.5.10.65-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:73060ce019ac064a057267c585bf1fd5a353734151f87472ff02b2c5c9984e78", size = 366465088, upload-time = "2025-06-05T20:08:20.413Z" }, ] @@ -4184,7 +4222,6 @@ name = "nvidia-cusparselt-cu12" version = "0.7.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ] @@ -4215,35 +4252,46 @@ wheels = [ [[package]] name = "nvidia-modelopt" -version = "0.40.0" +version = "0.33.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ninja" }, - { name = "numpy" }, - { name = "nvidia-ml-py" }, - { name = "packaging" }, - { name = "pulp" }, - { name = "pydantic" }, - { name = "regex" }, - { name = "rich" }, - { name = "safetensors" }, - { name = "scipy" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = 
"sys_platform != 'darwin'" }, - { name = "torchprofile" }, - { name = "tqdm" }, + { name = "ninja", marker = "sys_platform != 'darwin'" }, + { name = "numpy", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-ml-py", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-modelopt-core", marker = "sys_platform != 'darwin'" }, + { name = "packaging", marker = "sys_platform != 'darwin'" }, + { name = "pulp", marker = "sys_platform != 'darwin'" }, + { name = "pydantic", marker = "sys_platform != 'darwin'" }, + { name = "regex", marker = "sys_platform != 'darwin'" }, + { name = "rich", marker = "sys_platform != 'darwin'" }, + { name = "safetensors", marker = "sys_platform != 'darwin'" }, + { name = "scipy", marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchprofile", marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "tqdm", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/cb/4af39357792a96f334c7877ea0380c9337aec210ff4794a7dd95beb7c349/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6c51091683a117cd40fdb96a0ec28579f2276f6b627db7ccddc370df544e1dd7", size = 751683, upload-time = "2025-08-12T18:37:48.832Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b1/fc2f468d140ef58e90fac584759d0cc449db9bc4f64668cdff750ef38fef/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ef78a98901890f265596ec413dffac177d4a1865201d89a14f29f4fa0cf8e710", size = 751683, upload-time = "2025-08-12T18:36:59.964Z" }, ] + +[[package]] +name = "nvidia-modelopt-core" +version = "0.33.1" +source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/4a/4b4c339637fdbd54bc98b92c87c8b22f5efee05ca9e31e40a8d49ee66187/nvidia_modelopt-0.40.0-py3-none-any.whl", hash = "sha256:0315f53aef014b902866e427038db5803e3c6787a8e1f09c3650031550885051", size = 901421, upload-time = "2025-12-12T10:35:28.506Z" }, + { url = "https://files.pythonhosted.org/packages/9b/b5/ba79b1c52b634b24e45dca409f133f947217a5c7ec5c256266e4ec5fa3eb/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1ddd9279d8312f8e972b302692a26e6180f1c9fd277232f5925a5589f42b1b76", size = 1338081, upload-time = "2025-08-12T18:40:36.156Z" }, + { url = "https://files.pythonhosted.org/packages/13/40/4427583475dfd8eb1b8c7522d75d4d059f0512ff03dcc62d6986a22ab918/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:69d5ace564f2b056c916117be2023f2b7fc01cd1501073915e6b2ced2b8a5394", size = 1363366, upload-time = "2025-08-12T18:39:28.854Z" }, ] [[package]] name = "nvidia-nccl-cu12" -version = "2.27.5" +version = "2.27.3" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", 
hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, - { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, + { url = "https://files.pythonhosted.org/packages/5c/5b/4e4fff7bad39adf89f735f2bc87248c81db71205b62bcc0d5ca5b606b3c3/nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039", size = 322364134, upload-time = "2025-06-03T21:58:04.013Z" }, ] [[package]] @@ -4252,7 +4300,6 @@ version = "12.9.86" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/46/0c/c75bbfb967457a0b7670b8ad267bfc4fffdf341c074e0a80db06c24ccfd4/nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:e3f1171dbdc83c5932a45f0f4c99180a70de9bd2718c1ab77d14104f6d7147f9", size = 39748338, upload-time = "2025-06-05T20:10:25.613Z" }, - { url = "https://files.pythonhosted.org/packages/97/bc/2dcba8e70cf3115b400fef54f213bcd6715a3195eba000f8330f11e40c45/nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:994a05ef08ef4b0b299829cde613a424382aff7efb08a7172c1fa616cc3af2ca", size = 39514880, upload-time = "2025-06-05T20:10:04.89Z" }, ] [[package]] @@ -4270,7 +4317,6 @@ version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/86/ed/bb230dce7741f2778ba2ae3e8778fdb8bc58eee9fd95f07bf7b2d18e8081/nvidia_nvtx_cu12-12.9.79-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fec150986817f2b4e7eed72ed059f2dcb9ba3856b9a96134e448eac946a6952f", size = 85504, upload-time = "2025-06-05T20:03:10.21Z" }, - { url = "https://files.pythonhosted.org/packages/c4/e4/82155e4aaedb41621087ba219c95e99c5e417f37a7649b4fb6ec32dcb14d/nvidia_nvtx_cu12-12.9.79-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d1f258e752294acdb4f61c3d31fee87bd0f60e459f1e2f624376369b524cd15d", size = 86120, upload-time = "2025-06-05T20:02:51.838Z" }, ] [[package]] @@ -4284,8 +4330,8 @@ dependencies = [ { name = "psutil" }, { name = "pynvml" }, { name = "pyyaml" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/70/05/38d491962273c7905708762279f440520eb79f3c00b67a023497215ad023/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_aarch64.whl", hash = "sha256:b3bd5f01535574b16d0f38bca6e39afe3806c4a2896eee1b321cd944e00025a7", size = 444570, upload-time = "2025-07-17T03:50:58.877Z" }, @@ -4402,11 +4448,11 @@ dependencies = [ { name = "regex" }, { name = "safetensors" }, { name = 
"timm" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/46/fb8be250fa7fcfc56fbeb41583645e18d868268f67fbbbeb8ed62a8ff18a/open_clip_torch-3.2.0.tar.gz", hash = "sha256:62b7743012ccc40fb7c64819fa762fba0a13dd74585ac733babe58c2974c2506", size = 1502853, upload-time = "2025-09-21T17:32:08.289Z" } @@ -4679,6 +4725,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, ] +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, +] + [[package]] name = "partial-json-parser" version = "0.2.1.1.post6" @@ -4709,8 +4764,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = 
"2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -4737,6 +4792,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/17/b7cb1a10ebb0a9a4c9fbcd96a28b43d44e08a90f620bab07e644a658d2f1/perceptron-0.1.4-py3-none-any.whl", hash = "sha256:f490a6df6c15167e91e1a528601cae98ce99a30991cf792f9ef83ebc15d335c4", size = 57421, upload-time = "2025-11-12T20:00:26.395Z" }, ] +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -4907,6 +4974,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/72/0824c18f3bc75810f55dacc2dd933f6ec829771180245ae3cc976195dec0/prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9", size = 19296, upload-time = "2025-03-19T19:35:04.323Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.2" @@ -5005,6 +5084,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" }, ] +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = 
"sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + [[package]] name = "pulp" version = "3.2.2" @@ -5014,6 +5102,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/8d/a6a9d58c929a869f7f1b99b3d37b3f14ef63e2826eef581416338d686c3f/pulp-3.2.2-py3-none-any.whl", hash = "sha256:d3ca5ff11a28b3e7b2508a992d7e51f3533471d89305f0560b5fe3b6cc821043", size = 16385354, upload-time = "2025-07-29T11:42:01.829Z" }, ] +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + [[package]] name = "py-cpuinfo" version = "9.0.0" @@ -6302,6 +6399,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0d/6d/b4752b044bf94cb802d88a888dc7d288baaf77d7910b7dedda74b5ceea0c/setuptools-79.0.1-py3-none-any.whl", hash = "sha256:e147c0549f27767ba362f9da434eab9c5dc0045d5304feb602a0af001089fc51", size = 1256281, upload-time = "2025-04-23T22:20:56.768Z" }, ] +[[package]] +name = "sgl-kernel" +version = "0.3.17.post1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/a2/d2b36e0b8a7b5d88117d8d96c4eb612fe3677069316d444479ff78c73547/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:330057ad2d239e9363ee9abd85ed445ee1795161c60b7357f9792103121039cc", size = 341776329, upload-time = "2025-11-15T15:39:54.528Z" }, + { url = "https://files.pythonhosted.org/packages/10/8f/6286c74887c42ee4e888a6c36170ff394185e581fbecce2f1bf5c174b96e/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:c864e6d6eebcd91e59a71ba781739761a21774f0cb862578381f54f504f93b4a", size = 511995347, upload-time = "2025-11-15T15:41:45.029Z" }, +] + +[[package]] +name = "sglang" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "ipython" }, + { name = "numpy" }, + { name = "requests" }, + { name = "setproctitle" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/f0/954c401fe1bc80135c245f477cb117d7bb301f7b2eebcf38dcf211c03ac1/sglang-0.5.2.tar.gz", hash = "sha256:0c8a9ad02278d12eba2f30928e0464a646d03b2e2f32efcf6c681bbd795df793", size = 1627791, upload-time = "2025-09-11T23:09:48.602Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/2b/44c336e0be9a9a23e56b6fcfed3b6f03dfc8a4181ef2cc82129aa9811fa8/sglang-0.5.2-py3-none-any.whl", hash = "sha256:83aae146f3913ed0802bb1ea356facff47efe0e7d18041a3f143de9ef6e44b2c", size = 2184239, upload-time = "2025-09-11T23:09:46.458Z" }, +] + [[package]] name = "shellingham" version = "1.5.4" @@ -6639,6 +6762,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 
44415, upload-time = "2024-12-10T12:05:27.824Z" }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + [[package]] name = "standard-aifc" version = "3.13.0" @@ -6931,11 +7068,11 @@ dependencies = [ { name = "huggingface-hub" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/94/f6/4d7a8c261341fa6ad281920618739f2a650f41043afcedb570f24e99a776/timm-1.0.16.tar.gz", hash = "sha256:a3b8130dd2cb8dc3b9f5e3d09ab6d677a6315a8695fd5264eb6d52a4a46c1044", size = 2339999, upload-time = "2025-06-26T17:09:44.208Z" } wheels = [ @@ -6978,7 +7115,7 @@ wheels = [ [[package]] name = "torch" -version = "2.9.0" +version = "2.8.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -6994,16 +7131,14 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = 
"sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/66/e8/fc414d8656250ee46120b44836ffbb3266343db424b3e18ca79ebbf69d4f/torch-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e4e5b5cba837a2a8d1a497ba9a58dae46fa392593eaa13b871c42f71847503a5", size = 74830362, upload-time = "2025-10-15T15:46:48.983Z" }, - { url = "https://files.pythonhosted.org/packages/ff/c3/a91f96ec74347fa5fd24453fa514bc61c61ecc79196fa760b012a1873d96/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:f8877779cf56d1ce431a7636703bdb13307f5960bb1af49716d8b179225e0e6a", size = 74480732, upload-time = "2025-10-15T15:47:38.002Z" }, - { url = "https://files.pythonhosted.org/packages/5c/73/9f70af34b334a7e0ef496ceec96b7ec767bd778ea35385ce6f77557534d1/torch-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7e614fae699838038d888729f82b687c03413c5989ce2a9481f9a7e7a396e0bb", size = 74433037, upload-time = "2025-10-15T15:47:41.894Z" }, - { url = "https://files.pythonhosted.org/packages/83/36/74f8c051f785500396e42f93542422422dfd874a174f21f8d955d36e5d64/torch-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:71d9309aee457bbe0b164bce2111cd911c4ed4e847e65d5077dbbcd3aba6befc", size = 74823353, upload-time = "2025-10-15T15:49:16.59Z" }, + { url = "https://files.pythonhosted.org/packages/be/66/5c9a321b325aaecb92d4d1855421e3a055abd77903b7dab6575ca07796db/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:619c2869db3ada2c0105487ba21b5008defcc472d23f8b80ed91ac4a380283b0", size = 73630478, upload-time = "2025-08-06T14:53:57.144Z" }, + { url = "https://files.pythonhosted.org/packages/de/69/8b7b13bba430f5e21d77708b616f767683629fc4f8037564a177d20f90ed/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:1a62a1ec4b0498930e2543535cf70b1bef8c777713de7ceb84cd79115f553767", size = 73915128, upload-time = "2025-08-06T14:54:34.769Z" }, + { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, ] [[package]] name = "torch" -version = "2.9.0+cu129" +version = "2.8.0+cu129" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", @@ -7022,42 +7157,44 @@ dependencies = [ { name = "fsspec", marker = "sys_platform != 'darwin'" }, { name = "jinja2", marker = "sys_platform != 'darwin'" }, { name = "networkx", marker = "sys_platform != 'darwin'" }, - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = 
"sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "sys_platform != 'darwin'" }, { name = "sympy", marker = "sys_platform != 'darwin'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "typing-extensions", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = 
"https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-win_amd64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:692fe6e513b667f789a543fa9b1baba58e77a46d5c8629764ca0c00a56823e1f" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:02c7258e917f3043c978b53acf6f02b818db0d0d85db0e58ae578af333b9b4e2" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-win_amd64.whl", hash = "sha256:2bc729898e422b9f3da54349eed98f2f0b5dd415434508ee2ab2a13fb021815d" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ad2d64316635e7ab06f6c973a252526d59a92a2045825c102f876914a72304d0" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:563740167be2189b71530b503f0c8a8d7a8267dd49d4de6f9c5f1d23fbe237df" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-win_amd64.whl", hash = "sha256:2cef066f9759ff4d7868a8c3695aa60d9a878598acb3685bb1ef2fdac29dcd68" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2982bf34249cbb38f1090e71ad7097a214a21023ccdc0413961986ab7d0396e6" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6344260959ebcfa6dae458e1c4365195bcfdf00f4f1f1ad438cbaf50756829ed" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-win_amd64.whl", hash = "sha256:9c0cd89e54ce44ce3208c5cf4163773b9cda0067e4b48cfcac56a4e04af52040" }, +] + +[[package]] +name = "torch-memory-saver" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/6c/21dfda5d31afb71f52cedff52370acbb8290485b3f0fee6816a15a3d08f1/torch_memory_saver-0.0.9.tar.gz", hash = "sha256:3bbf76391fb16870b1b0df279fc281c8a05ef8f8809400b309b0a8240e8ee5ba", size = 14220, upload-time = "2025-10-18T02:10:18.163Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/35/b22df9e730d8444d62445a594421992781c7fad271325d41656d8a32d103/torch_memory_saver-0.0.9-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:0cf26332993649f8ea1b95d7307dfba3a95ee6cee53de84a3e561fb21752b584", size = 488722, upload-time = "2025-10-18T02:10:16.825Z" }, ] [[package]] @@ -7071,33 +7208,25 @@ wheels = [ [[package]] name = "torchaudio" -version = "2.9.0" +version = "2.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = 
"https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/63/3c0ede3aa3d19a8a6698ddd107fa88660549360b51bf8ce2717cd498d800/torchaudio-2.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab4cbcccfd873b0fb41fcb39c9869e59ef84bb95b093f6f58e2d05172a7500d2", size = 809116, upload-time = "2025-10-15T15:52:00.911Z" }, - { url = "https://files.pythonhosted.org/packages/be/d5/25e58745defe9d05893d3cba5c0e1a76aeaac503ac5ec4d9f83c871df71c/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7f93388b6e536c14d6015b6f75277a8b45efc532f61b35adc1ed06c98a86003e", size = 476020, upload-time = "2025-10-15T15:51:59.967Z" }, - { url = "https://files.pythonhosted.org/packages/f0/9c/58b8b49dfba2ae85e41ca86b0c52de45bbbea01987490de219c99c523a58/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:508318a2130b40ad51378f90caf8727a4bd3ac2b296f2b90c900b44e6068a940", size = 2059901, upload-time = "2025-10-15T15:51:54.634Z" }, - { url = "https://files.pythonhosted.org/packages/d7/eb/58b05f75d12f69ccc460893a20c999da082e063082120ed06e05cca3a053/torchaudio-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:82117e3a605f2959dc09b4cd8a11178d6e92727d5f85e5d4f9fe47502f84ee96", size = 665350, upload-time = "2025-10-15T15:52:08.384Z" }, - { url = "https://files.pythonhosted.org/packages/6c/66/974371d4e4042d186931b72365817d9d3a509f2bc570888a48612448c060/torchaudio-2.9.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5549c25db4c2da306e179e9aa99980e7f5b1826a8d2d7de08125f3943a5620b2", size = 809149, upload-time = "2025-10-15T15:52:16.133Z" }, - { url = "https://files.pythonhosted.org/packages/09/61/8f7b875a2d879666f2f121e458817703e5499988a86105d2a25afecb9987/torchaudio-2.9.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:1eb0d1dac8cefbc4a54afb21aac72a1c25a91f73e9c3bd85f6684930a4a1be5d", size = 475699, upload-time = "2025-10-15T15:52:06.349Z" }, - { url = "https://files.pythonhosted.org/packages/26/db/10ba200f90b76f7b859f46b5ba30cdded69f71bcb0fe3c59bb215532cd2b/torchaudio-2.9.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:266d304dd4ed738a10148b020e3d066e81272ee851f6f92193fe549df96af868", size = 2060349, upload-time = "2025-10-15T15:52:09.329Z" }, - { url = "https://files.pythonhosted.org/packages/be/53/5f9adbea55e48f91532ee4f041283900939ee5cb6bc1395587214e67a629/torchaudio-2.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:7d3926129389d934aa048bd6c6f68fbf3ef26828ebbbbeac99794ea00e90dc1c", size = 665310, upload-time = "2025-10-15T15:52:05.101Z" }, - { url = "https://files.pythonhosted.org/packages/e3/41/88b989aab1e11134d858350196fcf3afd4c2a6821d74efb3c1b9ab23b8cf/torchaudio-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:967d664477fb91dffad82ef64ea3695801c0cc35304baec71be875b569440872", size = 813491, upload-time = "2025-10-15T15:52:10.346Z" }, - { url = "https://files.pythonhosted.org/packages/1a/c1/8d0481fc921cb72d6cadbacd338fa71db0052e8fdb1bf33127c694bbf257/torchaudio-2.9.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:276871d6f5fed5268a87c5da303a13ca2e06b9d29a4c44663b960f0a2e2f46d7", size = 477749, upload-time = "2025-10-15T15:52:04.189Z" }, - { url = "https://files.pythonhosted.org/packages/cf/d3/d085cd76413b9f3f792e61933235d982caf5cdbdf60f0e4fdae71879becc/torchaudio-2.9.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3d5657d929d6ca07b08cfa005988f2ea8caacf9af42f20bc7eff10f88812ce30", size = 2062165, upload-time = "2025-10-15T15:52:12.784Z" }, - { url = 
"https://files.pythonhosted.org/packages/f2/41/d9876f5b19b4b2f98a6131d1a98ee6d5d8f707c01311bbba7cc3bb02f4bf/torchaudio-2.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3fe9cac0c2ee713e07f8c88d09528d55e0fa74987b0122e27911dfb720f39054", size = 669260, upload-time = "2025-10-15T15:52:13.8Z" }, - { url = "https://files.pythonhosted.org/packages/97/ad/db50c49d73d1904152bbaaaa281e03a41ec519dd6a9df48cc69ea5cd48b9/torchaudio-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3fa41447a21103fcde930b4ad2bd2634565a0becff1a5425535b4f0116c0d5df", size = 810532, upload-time = "2025-10-15T15:52:17.197Z" }, - { url = "https://files.pythonhosted.org/packages/a8/00/aa8ed83a169a87af72d6cdc17e0350f418b3cba3bd7397b0cca873274789/torchaudio-2.9.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:69f46f21bd67e90ade33a7d0f0cf98270cd61b98f5f8249d3893be0a16b3e31f", size = 475864, upload-time = "2025-10-15T15:52:11.446Z" }, - { url = "https://files.pythonhosted.org/packages/4b/bb/7ca64ed0556afa08d3a7a47c887ee9b1c4f3eebd193baf47505b6fac479c/torchaudio-2.9.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:631b0f43564a25e27e615b217454c334f52162679f39ae10b9fa7562ed587dfc", size = 2060360, upload-time = "2025-10-15T15:52:14.992Z" }, - { url = "https://files.pythonhosted.org/packages/63/13/4407b79ddedc9ea95d88fa54c3758df21f0117683fceba4bacd98ceaa772/torchaudio-2.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:ed6df9f14431e13498b984dc87df1aabb2156b9ce0ce7268ce4a61650197310a", size = 665048, upload-time = "2025-10-15T15:52:19.116Z" }, - { url = "https://files.pythonhosted.org/packages/7d/1a/d3cd6b67b5c68ff4211be923978d1d7c10ea2f44f826d4cd15b775f52c11/torchaudio-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:93358d8f2f24969ba3f368f4eec33295df830af54836c7fd3336740228f9af16", size = 813499, upload-time = "2025-10-15T15:52:20.412Z" }, - { url = "https://files.pythonhosted.org/packages/ab/65/a35a182519b40dcd2cedaf5fdcac6f724ae2451c534dfcece6ff5f85f983/torchaudio-2.9.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:742143d9d62769bc4b9a2977ca4f4720e0a5e922bdc5df585c155e0a1f545461", size = 477752, upload-time = "2025-10-15T15:52:18.14Z" }, - { url = "https://files.pythonhosted.org/packages/6f/1c/30272b71ae08817eaca00bb856ebef25dd44041329579903c1915b57f0c9/torchaudio-2.9.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0a234634e1142fb2652c49e935a98b4d9656fd0af9e4aa14b1b05a80c3cf8e78", size = 2062173, upload-time = "2025-10-15T15:52:22.724Z" }, - { url = "https://files.pythonhosted.org/packages/b9/d6/d007f6bc55a16a86e64e9bba295b90485011cc6a113d8f56b503b4f34a7d/torchaudio-2.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:cbf5d6da8fd2ed545c78218b39fd6aacaa4dd5e265c5f85b248a2fac223f0bd6", size = 669272, upload-time = "2025-10-15T15:52:21.696Z" }, + { url = "https://files.pythonhosted.org/packages/ac/cc/c2e2a3eb6ee956f73c68541e439916f8146170ea9cc61e72adea5c995312/torchaudio-2.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ddef94bf181e6447cbb05f38beaca8f6c5bb8d2b9ddced1aa3452025b9fc70d3", size = 1856736, upload-time = "2025-08-06T14:58:36.3Z" }, + { url = "https://files.pythonhosted.org/packages/c7/0d/24dad878784f1edd62862f27173781669f0c71eb46368636787d1e364188/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:862e2e40bf09d865e5df080a84c1a39bbcef40e43140f4b1737eb3a389d3b38f", size = 1692930, upload-time = "2025-08-06T14:58:41.312Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/a6/84d80f34472503e9eb82245d7df501c59602d75d7360e717fb9b84f91c5e/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:93a8583f280fe83ba021aa713319381ea71362cc87b67ee38e97a43cb2254aee", size = 4014607, upload-time = "2025-08-06T14:58:47.234Z" }, + { url = "https://files.pythonhosted.org/packages/43/ab/96ad33afa320738a7cfb4b51ba97e2f3cfb1e04ae3115d5057655103ba4f/torchaudio-2.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:4b82cacd1b8ccd543b1149d8cab257a40dfda8119023d2e3a96c66349c84bffb", size = 2499890, upload-time = "2025-08-06T14:58:55.066Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ea/2a68259c4dbb5fe44ebfdcfa40b115010d8c677221a7ef0f5577f3c4f5f1/torchaudio-2.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f851d32e94ca05e470f0c60e25726ec1e0eb71cb2ca5a0206b7fd03272ccc3c8", size = 1857045, upload-time = "2025-08-06T14:58:51.984Z" }, + { url = "https://files.pythonhosted.org/packages/0d/a3/1c79a8ef29fe403b83bdfc033db852bc2a888b80c406325e5c6fb37a7f2d/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:09535a9b727c0793cd07c1ace99f3f353626281bcc3e30c2f2314e3ebc9d3f96", size = 1692755, upload-time = "2025-08-06T14:58:50.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/df/61941198e9ac6bcebfdd57e1836e4f3c23409308e3d8d7458f0198a6a366/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2a85b124494736241884372fe1c6dd8c15e9bc1931bd325838c5c00238c7378", size = 4013897, upload-time = "2025-08-06T14:59:01.66Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ab/7175d35a4bbc4a465a9f1388571842f16eb6dec5069d7ea9c8c2d7b5b401/torchaudio-2.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:c1b5139c840367a7855a062a06688a416619f6fd2ca46d9b9299b49a7d133dfd", size = 2500085, upload-time = "2025-08-06T14:58:44.95Z" }, + { url = "https://files.pythonhosted.org/packages/34/1a/69b9f8349d9d57953d5e7e445075cbf74000173fb5f5d5d9e9d59415fc63/torchaudio-2.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:68df9c9068984edff8065c2b6656725e6114fe89281b0cf122c7505305fc98a4", size = 1935600, upload-time = "2025-08-06T14:58:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/71/76/40fec21b65bccfdc5c8cdb9d511033ab07a7ad4b05f0a5b07f85c68279fc/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1951f10ed092f2dda57634f6a3950ef21c9d9352551aa84a9fccd51bbda18095", size = 1704199, upload-time = "2025-08-06T14:58:43.594Z" }, + { url = "https://files.pythonhosted.org/packages/8e/53/95c3363413c2f2009f805144160b093a385f641224465fbcd717449c71fb/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4f7d97494698d98854129349b12061e8c3398d33bd84c929fa9aed5fd1389f73", size = 4020596, upload-time = "2025-08-06T14:59:03.031Z" }, + { url = "https://files.pythonhosted.org/packages/52/27/7fc2d7435af044ffbe0b9b8e98d99eac096d43f128a5cde23c04825d5dcf/torchaudio-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d4a715d09ac28c920d031ee1e60ecbc91e8a5079ad8c61c0277e658436c821a6", size = 2549553, upload-time = "2025-08-06T14:59:00.019Z" }, ] [[package]] @@ -7115,8 +7244,8 @@ version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = 
"torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "urllib3" }, ] wheels = [ @@ -7128,12 +7257,10 @@ name = "torchprofile" version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "numpy", marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/36/574c0c46e818533b78b3c09505211162918188325ab4165ef11a3f295755/torchprofile-0.0.4.tar.gz", hash = "sha256:96b6da17d752a06b02977e078aea95614893b31d4117dd5dcd081f30ce65611b", size = 4557, upload-time = "2021-06-22T04:58:03.592Z" } wheels = [ @@ -7142,7 +7269,7 @@ wheels = [ [[package]] name = "torchvision" -version = "0.24.0" +version = "0.23.0" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", @@ -7151,19 +7278,17 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = 
"https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:630f602db2c594c9cbc89b964d5fb4873adf4193805df65339b24cd3f4cf57f7" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:20f7e25a24f91d93d09398b80929dec805c4ee2f5527fad8eecd6e43dc5fd5d0" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cb70cc000e6a398270044c3406a89ee8ab6157a4e81b5d40c5904e1d0e22e2f8" }, ] [[package]] name = "torchvision" -version = "0.24.0" +version = "0.23.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -7172,19 +7297,17 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "sys_platform == 'darwin'" }, { name = "pillow", marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" }, - { url = "https://files.pythonhosted.org/packages/4f/b5/b2008e4b77a8d6aada828dd0f6a438d8f94befa23fdd2d62fa0ac6e60113/torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84d79cfc6457310107ce4d712de7a3d388b24484bc9aeded4a76d8f8e3a2813d", size = 1891722, upload-time = "2025-10-15T15:51:28.854Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/3dd10830b047eeb46ae6b465474258d7b4fbb7d8872dca69bd42449f5c82/torchvision-0.24.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ab956a6e588623353e0f20d4b03eb1656cb4a3c75ca4dd8b4e32e01bc43271a", size = 2028355, upload-time = "2025-10-15T15:51:22.384Z" }, - { url = "https://files.pythonhosted.org/packages/1b/24/790a39645cc8c71bf442d54a76da9bda5caeb2a44c5f7e02498649cd99d4/torchvision-0.24.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4bdfc85a5ed706421555f32cdc5e3ddb6d40bf65ef03a274ce3c176393e2904b", size = 2028335, upload-time = "2025-10-15T15:51:26.252Z" }, - { url = "https://files.pythonhosted.org/packages/08/f7/261d1353c611820541ecd43046b89da3f1ae998dc786e4288b890a009883/torchvision-0.24.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:68120e7e03c31900e499a10bb7fdd63cfd67f0054c9fa108e7e27f9cd372f315", size = 2028359, upload-time = "2025-10-15T15:51:32.119Z" }, + { url = "https://files.pythonhosted.org/packages/df/1d/0ea0b34bde92a86d42620f29baa6dcbb5c2fc85990316df5cb8f7abb8ea2/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440", size = 1856885, upload-time = "2025-08-06T14:58:06.503Z" }, + { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, + { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, ] [[package]] name = "torchvision" -version = "0.24.0+cu129" +version = "0.23.0+cu129" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", @@ -7199,14 +7322,15 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "pillow", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6226be1b8399ef655a11965ea4975250f7823fc9b200b35deb9eeac350c667a9" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp312-cp312-win_amd64.whl", hash = "sha256:57cf57ada9a5407755e170a4ab3842337b83862c93f9483decaf0b6b4d69fa09" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:04316e24ddd1cee3b301208811a9d7c4cfca5f566ea367f33bda059d8f0e012e" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313-win_amd64.whl", hash = "sha256:a486a0cee466807a17749d0b916d52088343453dc911baa20f0f459b2fa43c9a" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c718f6d2c0e61feed39763925eea3e1f42979f6b21e61276f487409168d9e352" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313t-win_amd64.whl", hash = "sha256:8218c1f614972abb4710afde96d0f70b174b235f390e165e6fd4cdd5cee6d93d" }, ] [[package]] @@ -7221,6 +7345,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash 
= "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + [[package]] name = "transformer-engine" version = "2.8.0" @@ -7259,8 +7392,8 @@ dependencies = [ { name = "einops" }, { name = "onnx" }, { name = "onnxscript" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/38/63/1e3953244ed4f318f87889309a56cdd664759f007967eb850ee415a5584d/transformer_engine_torch-2.8.0.tar.gz", hash = "sha256:ce09f1bd9b8e532a5c347b9e9b3a3a771722095daddca673ae82ccce8e68d759", size = 209805, upload-time = "2025-10-07T04:54:11.134Z" } @@ -7285,6 +7418,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, ] +[[package]] +name = "triton" +version = "3.4.0" +source = { registry = "https://download.pytorch.org/whl/cu129" } +resolution-markers = [ + "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "setuptools", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = 
"https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, +] + [[package]] name = "triton" version = "3.4.0" @@ -7301,29 +7456,6 @@ dependencies = [ { name = "setuptools", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] -[[package]] -name = "triton" -version = "3.5.0" -source = { registry = "https://download.pytorch.org/whl/cu129" } -resolution-markers = [ - "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, -] - [[package]] name = "trove-classifiers" version = "2025.8.6.13" @@ -7477,11 +7609,10 @@ wheels = [ [[package]] name = "vllm" -version = "0.11.2" +version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, - { name = "anthropic" }, { name = "blake3" }, { name = "cachetools" }, { name = "cbor2" }, @@ -7492,13 +7623,11 @@ dependencies = [ { name = "einops" }, { name = "fastapi", extra = ["standard"] }, { name = "filelock" }, - { name = "flashinfer-python" }, { name = "gguf" }, { name = "lark" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, { name = "lm-format-enforcer" }, - { name = "mistral-common", extra = ["image"] }, - { name = "model-hosting-container-standards" }, + { name = "mistral-common", extra = ["audio", "image"] }, { name = "msgspec" }, { name = "ninja" }, { name = "numba" }, @@ -7529,23 +7658,23 @@ dependencies = [ { name = "six" }, { name = "tiktoken" }, { name = "tokenizers" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name 
= "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchaudio" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "tqdm" }, { name = "transformers" }, { name = "typing-extensions" }, { name = "watchfiles" }, { name = "xformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/40/15/bc50794c5c6a48f075d72fde8035647d38072ad81031168d27ca631f9395/vllm-0.11.2.tar.gz", hash = "sha256:496d15bb64ca0fe73adbc57a93b29f4671fa12404c09e0ba02f777bfe60af671", size = 17287801, upload-time = "2025-11-20T08:31:35.084Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/5a/36d2351206f4d8d871b10780f874d03957985e08298d430cc837723e07af/vllm-0.11.0.tar.gz", hash = "sha256:f435a64c24e9c4178d657a76f8edd8548ddc444012f7d06a9f79ac3a6392bfae", size = 10822208, upload-time = "2025-10-04T01:39:57.798Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/75/5d/d6af7818e41957a5d35f1b0ecd0186ac80e322f228dc390dcbc4aafce58d/vllm-0.11.2-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:ea473bd4fde06940fe3f681a00476060652f62b3279ef11aaffac5768856cfe8", size = 370306629, upload-time = "2025-11-20T08:30:43.713Z" }, - { url = "https://files.pythonhosted.org/packages/24/7c/f27896162b88c360d569fd632cf0525d5ce89cba8e555532d80dc3ee0a12/vllm-0.11.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:a084f5ca768d22bf55810948cbb50825a35015e07593ab6c9c42fcbe18bdd5cc", size = 368543904, upload-time = "2025-11-20T08:31:15.933Z" }, + { url = "https://files.pythonhosted.org/packages/47/33/d19e0763c34392ec956534536fa837c060495bfff31ed83452135ea7608d/vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:3861c75ff2b12e24f6d179ff5c084d791b42ded8675d76c8706697c79f68cd62", size = 438217982, upload-time = "2025-10-04T01:39:32.382Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/bf/973444bb959fc7acbbeb3d226bd4d135dcd49b6af174b29aab1b50e2d710/vllm-0.11.0-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:52369c9ee949944354bdc7afc88ded2d1ed02b098bf90db06cf80098a19787b7", size = 401003969, upload-time = "2025-10-04T01:39:50.251Z" }, ] [[package]] @@ -7737,24 +7866,6 @@ version = "3.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857, upload-time = "2015-10-22T15:26:37.51Z" } -[[package]] -name = "wheel" -version = "0.45.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8a/98/2d9906746cdc6a6ef809ae6338005b3f21bb568bea3165cfc6a243fdc25c/wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729", size = 107545, upload-time = "2024-11-23T00:18:23.513Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" }, -] - -[[package]] -name = "win32-setctime" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, -] - [[package]] name = "wrapt" version = "1.17.3" @@ -7845,15 +7956,15 @@ wheels = [ [[package]] name = "xformers" -version = "0.0.33.post1" +version = "0.0.32.post1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6f/c1/cd0d6b89da38d8aa174e8eabf29530f8871daf53b886ec6b680ef9d3e71f/xformers-0.0.33.post1.tar.gz", hash = "sha256:e555258249b514ba117b3403523fe0bd7d3e92e930575f0e0dbf5f7db5b42677", size = 14784437, upload-time = "2025-11-13T20:16:14.793Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/33/3b9c4d3d5b2da453d27de891df4ad653ac5795324961aa3a5c15b0353fe6/xformers-0.0.32.post1.tar.gz", hash = "sha256:1de84a45c497c8d92326986508d81f4b0a8c6be4d3d62a29b8ad6048a6ab51e1", size = 12106196, upload-time = "2025-08-14T18:07:45.486Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/39/94/3ad80d1070ddfb280c20a67dfbc094a93579a02910ef41f20631a9b566fe/xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a8d72c6272453450eede2ed9aaa14448e6525569e14217573057ded146090db3", size = 122884756, upload-time = "2025-11-13T20:16:04.002Z" }, + { url = "https://files.pythonhosted.org/packages/6b/df/6817346f1a77278315d5fe1fc9f239ba3282ba36e8ab3256babd448dde62/xformers-0.0.32.post1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5f245b5555188da112070d8fefb6b7ae1ae47422856521d66c837e9d2352fbe4", size = 117199943, upload-time = "2025-08-14T18:07:34.78Z" }, ] [[package]] @@ -7865,10 +7976,10 @@ dependencies = [ { name = "ninja" }, { name = "numpy" }, { name = "pydantic" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "transformers" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f2/a9/dc3c63cf7f082d183711e46ef34d10d8a135c2319dc581905d79449f52ea/xgrammar-0.1.25.tar.gz", hash = "sha256:70ce16b27e8082f20808ed759b0733304316facc421656f0f30cfce514b5b77a", size = 2297187, upload-time = "2025-09-21T05:58:58.942Z" }