diff --git a/examples/configs/grpo_math_1B_sglang.yaml b/examples/configs/grpo_math_1B_sglang.yaml new file mode 100644 index 0000000000..17b30f3ef5 --- /dev/null +++ b/examples/configs/grpo_math_1B_sglang.yaml @@ -0,0 +1,25 @@ +defaults: grpo_math_1B.yaml + +grpo: + val_batch_size: 128 + +policy: + generation: + backend: "sglang" + sglang_cfg: + # SGLang specific configuration + model_path: ${policy.model_name} + gpus_per_server: 1 + dtype: ${policy.precision} + context_length: 512 # Maximum context length + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +logger: + wandb_enabled: true diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml new file mode 100644 index 0000000000..8428b1cd96 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.yaml @@ -0,0 +1,48 @@ +defaults: ../../grpo_math_1B.yaml + +grpo: + max_num_steps: 450 + +checkpointing: + checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + +policy: + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct + tokenizer: + name: Qwen/Qwen2.5-Math-1.5B-Instruct + dynamic_batching: + enabled: true + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + backend: "sglang" + max_new_tokens: 512 + sglang_cfg: + model_path: ${policy.model_name} + gpus_per_server: 8 + dtype: ${policy.precision} + context_length: 512 + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +data: + max_input_seq_length: 512 + +logger: + log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang + +cluster: + gpus_per_node: 8 + diff --git a/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml b/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml new file mode 100644 index 0000000000..30c6f5f76c --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen3-0.6b-1n8g-sglang.yaml @@ -0,0 +1,49 @@ +defaults: ../../grpo_math_1B.yaml + +grpo: + max_num_steps: 500 + val_batch_size: 128 + +checkpointing: + checkpoint_dir: results/grpo-qwen3-0.6b-1n8g-sglang + +policy: + model_name: Qwen/Qwen3-0.6B + tokenizer: + name: Qwen/Qwen3-0.6B + dynamic_batching: + enabled: true + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + backend: "sglang" + max_new_tokens: 512 + sglang_cfg: + model_path: ${policy.model_name} + gpus_per_server: 8 + dtype: ${policy.precision} + context_length: 512 + allow_auto_truncate: true + enable_memory_saver: false + dp_size: 1 + pp_size: 1 + ep_size: 1 + max_running_requests: null + mem_fraction_static: 0.7 + skip_server_warmup: true + +data: + max_input_seq_length: 512 + +logger: + log_dir: logs/grpo-qwen3-0.6b-1n8g-sglang + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-0.6b-1n8g-sglang + +cluster: + gpus_per_node: 8 + diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 8ab62d00fb..3651521d9f 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -61,6 +61,7 @@ 
run_multi_turn_rollout, ) from nemo_rl.models.generation.interfaces import GenerationInterface +from nemo_rl.models.generation.sglang import SGLangConfig, SGLangGeneration from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.interfaces import ColocatablePolicyInterface @@ -482,9 +483,77 @@ def init_vllm(): pg.finish_generation() return pg, time.perf_counter() - t0 - # Handle backend-specific setup + def init_sglang(): + """Initialize SGLang generation workers.""" + t0 = time.perf_counter() + pg = SGLangGeneration(cluster=inference_cluster, config=generation_config) + pg.finish_generation() + return pg, time.perf_counter() - t0 + + def initialize_generation_with_policy( + init_generation_fn, + generation_name: str, + init_time_key: str, + colocated_inference: bool, + worker_init_timing_metrics: dict, + ): + """Generic function to initialize a generation engine (vLLM or SGLang) along with policy. + + Args: + init_generation_fn: Function that initializes the generation engine (init_vllm or init_sglang) + generation_name: Name of the generation engine ("vLLM" or "SGLang") + init_time_key: Key name for storing initialization time in metrics ("vllm_init_time_s" or "sglang_init_time_s") + colocated_inference: Whether inference is colocated with training + worker_init_timing_metrics: Dictionary to store timing metrics + + Returns: + Tuple of (policy_generation, policy) + """ + # Determine if parallel initialization is possible (non-colocated mode) + use_parallel_init = not colocated_inference + + if use_parallel_init: + # Parallel initialization: Generation engine and Policy can initialize simultaneously + print( + " ⚡ Using parallel worker initialization (non-colocated mode)", + flush=True, + ) + + # Execute both initializations in parallel + parallel_start_time = time.perf_counter() + with ThreadPoolExecutor(max_workers=2) as executor: + generation_future = executor.submit(init_generation_fn) + policy_future = executor.submit(init_policy) + policy_generation, generation_time = generation_future.result() + policy, policy_time = policy_future.result() + parallel_wall_time = time.perf_counter() - parallel_start_time + + # Store timing metrics + worker_init_timing_metrics[init_time_key] = generation_time + worker_init_timing_metrics["policy_init_time_s"] = policy_time + worker_init_timing_metrics["parallel_wall_time_s"] = parallel_wall_time + worker_init_timing_metrics["parallel_init_enabled"] = True + + else: + # Sequential initialization: colocated mode (GPU memory requires generation engine first) + print( + " ⚙️ Using sequential worker initialization (colocated mode)", + flush=True, + ) + + # Initialize generation engine first (clean GPU memory), then policy + policy_generation, generation_time = init_generation_fn() + worker_init_timing_metrics[init_time_key] = generation_time + + policy, policy_time = init_policy() + worker_init_timing_metrics["policy_init_time_s"] = policy_time + worker_init_timing_metrics["parallel_init_enabled"] = 0.0 + + return policy_generation, policy + + # Handle generation-specific setup if backend == "megatron": - # Megatron backend: policy_generation is None, only initialize policy + # Megatron generation: policy_generation is None, only initialize policy policy_generation = None print( f" ✓ Using {backend} backend for generation with {policy_config['model_name']}", @@ -495,7 +564,7 @@ def init_vllm(): worker_init_timing_metrics["policy_init_time_s"] = policy_time elif backend 
== "vllm": - # vLLM backend: setup config, then decide parallel vs sequential init + # vLLM generation: setup config, then initialize with policy generation_config = cast(VllmConfig, generation_config) if generation_config["vllm_cfg"]["precision"] == "fp8": assert loss_config["use_importance_sampling_correction"] is True, ( @@ -523,48 +592,36 @@ def init_vllm(): "hf_config_overrides", {} ) - # Determine if parallel initialization is possible (non-colocated mode) - use_parallel_init = not colocated_inference - - if use_parallel_init: - # Parallel initialization: vLLM and Policy can initialize simultaneously - print( - " ⚡ Using parallel worker initialization (non-colocated mode)", - flush=True, - ) - - # Execute both initializations in parallel - parallel_start_time = time.perf_counter() - with ThreadPoolExecutor(max_workers=2) as executor: - vllm_future = executor.submit(init_vllm) - policy_future = executor.submit(init_policy) - policy_generation, vllm_time = vllm_future.result() - policy, policy_time = policy_future.result() - parallel_wall_time = time.perf_counter() - parallel_start_time + policy_generation, policy = initialize_generation_with_policy( + init_generation_fn=init_vllm, + generation_name="vLLM", + init_time_key="vllm_init_time_s", + colocated_inference=colocated_inference, + worker_init_timing_metrics=worker_init_timing_metrics, + ) - # Store timing metrics - worker_init_timing_metrics["vllm_init_time_s"] = vllm_time - worker_init_timing_metrics["policy_init_time_s"] = policy_time - worker_init_timing_metrics["parallel_wall_time_s"] = parallel_wall_time - worker_init_timing_metrics["parallel_init_enabled"] = True + print( + f" ✓ Using vLLM backend for generation with {policy_config['model_name']}", + flush=True, + ) - else: - # Sequential initialization: colocated mode (GPU memory requires vLLM first) - print( - " ⚙️ Using sequential worker initialization (colocated mode)", - flush=True, - ) + elif backend == "sglang": + generation_config = cast(SGLangConfig, generation_config) - # Initialize vLLM first (clean GPU memory), then policy - policy_generation, vllm_time = init_vllm() - worker_init_timing_metrics["vllm_init_time_s"] = vllm_time + # Set model_path if not already set + if "model_path" not in generation_config["sglang_cfg"]: + generation_config["sglang_cfg"]["model_path"] = policy_config["model_name"] - policy, policy_time = init_policy() - worker_init_timing_metrics["policy_init_time_s"] = policy_time - worker_init_timing_metrics["parallel_init_enabled"] = 0.0 + policy_generation, policy = initialize_generation_with_policy( + init_generation_fn=init_sglang, + generation_name="SGLang", + init_time_key="sglang_init_time_s", + colocated_inference=colocated_inference, + worker_init_timing_metrics=worker_init_timing_metrics, + ) print( - f" ✓ Using vLLM backend for generation with {policy_config['model_name']}", + f" ✓ Using SGLang backend for generation with {policy_config['model_name']}", flush=True, ) @@ -945,16 +1002,37 @@ def refit_policy_generation( policy.get_free_memory_bytes() * float(memory_ratio) ) - futures_train = policy.stream_weights_via_ipc_zmq( - buffer_size_bytes=buffer_size_bytes, kv_scales=kv_scales - ) - futures_inference = policy_generation.update_weights_via_ipc_zmq() - # wait for all futures to complete - ray.get(futures_train) - results = ray.get(futures_inference) - update_success = all(result for result in results if result is not None) + if isinstance(policy_generation, SGLangGeneration): + sglang_url_to_gpu_uuids = ( + 
policy_generation.get_sglang_url_to_gpu_uuids() + ) + # Stream weights via HTTP + flush_success = policy_generation.invalidate_kv_cache() + if not flush_success: + print("SGLang KV cache invalidation failed before weight update. ") + futures_train = policy.stream_weights_via_http( + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + ) + # Wait for all workers to complete + ray.get(futures_train) + update_success = True + else: + # Original ZMQ IPC path for vLLM + futures_train = policy.stream_weights_via_ipc_zmq( + buffer_size_bytes=buffer_size_bytes + ) + futures_inference = policy_generation.update_weights_via_ipc_zmq() + # wait for all futures to complete + ray.get(futures_train) + results = ray.get(futures_inference) + update_success = all(result for result in results if result is not None) else: # update weights through nccl + # SGLang haven't implemented non-colocated inference mode. + if isinstance(policy_generation, SGLangGeneration): + raise NotImplementedError( + "SGLang haven't implemented non-colocated inference mode. " + ) futures_train = policy.broadcast_weights_for_collective(kv_scales=kv_scales) futures_inference = policy_generation.update_weights_from_collective() # wait for all futures to complete @@ -1148,11 +1226,9 @@ def grpo_train( dynamic_sampling_num_gen_batches += 1 with timer.time("generation"): - # Clear vLLM logger metrics for each generation step - if policy_generation is not None and hasattr( - policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics for each generation step + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Use NeMo-Gym rollouts if enabled. We cascade NeMo-Gym first since NeMo-Gym requires async rollouts. 
if _should_use_nemo_gym(master_config): generation_config = master_config["policy"]["generation"] @@ -1202,16 +1278,12 @@ def grpo_train( greedy=False, ) policy_generation.finish_generation() - # Collect vLLM logger metrics for performance reporting after each generation step - # inflight batch sizes and num pending samples are collected from each vLLM worker - if policy_generation is not None and hasattr( - policy_generation, "get_vllm_logger_metrics" - ): - vllm_logger_metrics = ( - policy_generation.get_vllm_logger_metrics() + # Collect generation logger metrics for performance reporting after each generation step + # inflight batch sizes and num pending samples are collected from each worker + if policy_generation is not None: + generation_logger_metrics = ( + policy_generation.get_logger_metrics() ) - else: - vllm_logger_metrics = {} repeated_batch = scale_rewards( repeated_batch, master_config["grpo"]["reward_scaling"] @@ -1460,7 +1532,7 @@ def grpo_train( metrics[k] = np.sum(v).item() metrics.update(rollout_metrics) - metrics["vllm_logger_metrics"] = vllm_logger_metrics + metrics["generation_logger_metrics"] = generation_logger_metrics total_valid_tokens += metrics["global_valid_toks"] ## Checkpointing @@ -1583,7 +1655,7 @@ def grpo_train( "enable_vllm_metrics_logger", False ) and master_config.get("logger", {}).get("wandb_enabled", False): log_generation_metrics_to_wandb( - vllm_logger_metrics, + generation_logger_metrics, total_steps + 1, master_config["policy"]["generation"]["vllm_cfg"][ "vllm_metrics_logger_interval" @@ -2051,12 +2123,9 @@ def async_grpo_train( trajectory_collector.resume.remote() print("✅ All setup complete, starting buffer wait...") - - # Clear vLLM logger metrics after at start of training - if policy_generation is not None and hasattr( - policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics at start of training + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Wait for initial buffer fill print( @@ -2296,23 +2365,19 @@ def async_grpo_train( train_results = policy.train(train_data, loss_fn) print("🔄 Synchronizing policy weights to trajectory collector…") - vllm_logger_metrics = None + generation_logger_metrics = None if NEED_REFIT: # Measure pending-generation wait as exposed_generation time print("🔄 Coordinating with trajectory collector before refit...") with timer.time("exposed_generation"): ray.get(trajectory_collector.prepare_for_refit.remote()) - # Collect vLLM logger metrics for performance reporting - # inflight batch sizes and num pending samples are collected from each vLLM worker - if policy_generation is not None and hasattr( - policy_generation, "get_vllm_logger_metrics" - ): - vllm_logger_metrics = ( - policy_generation.get_vllm_logger_metrics() + # Collect generation logger metrics for performance reporting + # inflight batch sizes and num pending samples are collected from each worker + if policy_generation is not None: + generation_logger_metrics = ( + policy_generation.get_logger_metrics() ) - else: - vllm_logger_metrics = {} # Only the actual refit/weight transfer should be counted as weight_sync print("🔄 Performing policy generation refit...") @@ -2327,11 +2392,9 @@ def async_grpo_train( trajectory_collector.set_weight_version.remote(weight_version) trajectory_collector.resume_after_refit.remote() - # Clear vLLM logger metrics after each refit (weight sync), starting a new logging cycle - if policy_generation is not None and hasattr( - 
policy_generation, "clear_vllm_logger_metrics" - ): - policy_generation.clear_vllm_logger_metrics() + # Clear logger metrics after each refit (weight sync), starting a new logging cycle + if policy_generation is not None: + policy_generation.clear_logger_metrics() # Validation val_metrics, validation_timings = None, None @@ -2424,8 +2487,8 @@ def async_grpo_train( else: metrics[k] = np.sum(v).item() metrics.update(rollout_metrics) - if vllm_logger_metrics is not None: - metrics["vllm_logger_metrics"] = vllm_logger_metrics + if generation_logger_metrics is not None: + metrics["generation_logger_metrics"] = generation_logger_metrics total_valid_tokens += metrics["global_valid_toks"] # Checkpointing (same as sync version) @@ -2532,7 +2595,7 @@ def async_grpo_train( "enable_vllm_metrics_logger", False ) and master_config.get("logger", {}).get("wandb_enabled", False): log_generation_metrics_to_wandb( - vllm_logger_metrics, + generation_logger_metrics, step + 1, master_config["policy"]["generation"]["vllm_cfg"][ "vllm_metrics_logger_interval" diff --git a/nemo_rl/algorithms/utils.py b/nemo_rl/algorithms/utils.py index 17c69e479a..428252e1f2 100644 --- a/nemo_rl/algorithms/utils.py +++ b/nemo_rl/algorithms/utils.py @@ -521,46 +521,47 @@ def visualize_per_worker_timeline( "generation" ].get("vllm_cfg", {}).get("async_engine", False) if is_vllm_metrics_logger_enabled: - vllm_logger_metrics = metrics["vllm_logger_metrics"] - # vllm_logger_me trics: dict[str (metric_name), dict[int (dp_idx), list[int] (metric_values)]] + vllm_logger_metrics = metrics.get("generation_logger_metrics", {}) + # vllm_logger_metrics: dict[str (metric_name), dict[int (dp_idx), list[int] (metric_values)]] # metric_name: "inflight_batch_sizes" or "num_pending_samples" - assert "inflight_batch_sizes" in vllm_logger_metrics, ( - "inflight_batch_sizes not found in vllm_logger_metrics" - ) - assert "num_pending_samples" in vllm_logger_metrics, ( - "num_pending_samples not found in vllm_logger_metrics" - ) - assert isinstance(vllm_logger_metrics["inflight_batch_sizes"], dict), ( - "inflight_batch_sizes must be a dictionary" - ) - assert isinstance(vllm_logger_metrics["num_pending_samples"], dict), ( - "num_pending_samples must be a dictionary" - ) - - vllm_metrics_logger_interval = master_config["policy"]["generation"][ - "vllm_cfg" - ]["vllm_metrics_logger_interval"] - print(" • vLLM Logger Metrics:") - # Visualize the inflight batch sizes timeline - if len(vllm_logger_metrics["inflight_batch_sizes"].values()) > 0: - visualize_per_worker_timeline( - vllm_logger_metrics["inflight_batch_sizes"], - "Inflight Batch Sizes", - vllm_metrics_logger_interval, + if vllm_logger_metrics: + assert "inflight_batch_sizes" in vllm_logger_metrics, ( + "inflight_batch_sizes not found in vllm_logger_metrics" ) - if len(vllm_logger_metrics["num_pending_samples"].values()) > 0: - max_num_pending_samples = max( - (max(v) if v else 0) - for v in vllm_logger_metrics["num_pending_samples"].values() + assert "num_pending_samples" in vllm_logger_metrics, ( + "num_pending_samples not found in vllm_logger_metrics" ) - # If there is at least one pending sample, visualize the timeline - if max_num_pending_samples > 0: + assert isinstance(vllm_logger_metrics["inflight_batch_sizes"], dict), ( + "inflight_batch_sizes must be a dictionary" + ) + assert isinstance(vllm_logger_metrics["num_pending_samples"], dict), ( + "num_pending_samples must be a dictionary" + ) + + vllm_metrics_logger_interval = master_config["policy"]["generation"][ + "vllm_cfg" + 
]["vllm_metrics_logger_interval"] + print(" • vLLM Logger Metrics:") + # Visualize the inflight batch sizes timeline + if len(vllm_logger_metrics["inflight_batch_sizes"].values()) > 0: visualize_per_worker_timeline( - vllm_logger_metrics["num_pending_samples"], - "Num Pending Samples", - None, + vllm_logger_metrics["inflight_batch_sizes"], + "Inflight Batch Sizes", + vllm_metrics_logger_interval, ) + if len(vllm_logger_metrics["num_pending_samples"].values()) > 0: + max_num_pending_samples = max( + (max(v) if v else 0) + for v in vllm_logger_metrics["num_pending_samples"].values() + ) + # If there is at least one pending sample, visualize the timeline + if max_num_pending_samples > 0: + visualize_per_worker_timeline( + vllm_logger_metrics["num_pending_samples"], + "Num Pending Samples", + None, + ) # ===================================================== # Throughputs diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 4190062ec6..cdda4a625f 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -20,6 +20,9 @@ VLLM_EXECUTABLE = ( PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.VLLM ) +SGLANG_EXECUTABLE = ( + PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.SGLANG +) MCORE_EXECUTABLE = ( PY_EXECUTABLES.SYSTEM if USE_SYSTEM_EXECUTABLE else PY_EXECUTABLES.MCORE ) @@ -27,10 +30,11 @@ ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE, "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE, + "nemo_rl.models.generation.sglang.sglang_worker.SGLangGenerationWorker": SGLANG_EXECUTABLE, # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE, - "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": PY_EXECUTABLES.AUTOMODEL, + "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": SGLANG_EXECUTABLE, "nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.vlm_environment.VLMEnvironment": PY_EXECUTABLES.SYSTEM, diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index 3021b760e4..3f472e6d61 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -58,6 +58,9 @@ class PY_EXECUTABLES: # Use NeMo-Gym dependencies NEMO_GYM = f"uv run --locked --extra nemo_gym --directory {git_root}" + # Use NeMo-RL direct dependencies and SGLang. + SGLANG = f"uv run --locked --extra automodel --extra sglang --directory {git_root}" + @ray.remote # pragma: no cover def _get_node_ip_and_free_port() -> tuple[str, int]: diff --git a/nemo_rl/models/generation/interfaces.py b/nemo_rl/models/generation/interfaces.py index d134027bdf..80f4ced95e 100644 --- a/nemo_rl/models/generation/interfaces.py +++ b/nemo_rl/models/generation/interfaces.py @@ -257,3 +257,22 @@ def update_weights_from_collective(self) -> list[ray.ObjectRef]: # (e.g., vLLM prefix/KV caches) after weight updates. 
def invalidate_kv_cache(self) -> bool: return False + + def clear_logger_metrics(self) -> None: + """Clear logger metrics for performance reporting. + + This is an optional method that backends can implement to clear + telemetry metrics. Default implementation does nothing. + """ + pass + + def get_logger_metrics(self) -> dict[str, Any]: + """Get logger metrics for performance reporting. + + This is an optional method that backends can implement to collect + telemetry metrics. Default implementation returns empty dict. + + Returns: + Dictionary of metrics. Format may vary by backend. + """ + return {} diff --git a/nemo_rl/models/generation/sglang/__init__.py b/nemo_rl/models/generation/sglang/__init__.py new file mode 100644 index 0000000000..76deb56ebd --- /dev/null +++ b/nemo_rl/models/generation/sglang/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nemo_rl.models.generation.sglang.config import SGLangConfig +from nemo_rl.models.generation.sglang.sglang_generation import SGLangGeneration + +__all__ = [ + "SGLangConfig", + "SGLangGeneration", + "SGLangGenerationWorker", +] + + +def __getattr__(name: str): + if name == "SGLangGenerationWorker": + from nemo_rl.models.generation.sglang.sglang_worker import ( + SGLangGenerationWorker, + ) + + return SGLangGenerationWorker + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/nemo_rl/models/generation/sglang/config.py b/nemo_rl/models/generation/sglang/config.py new file mode 100644 index 0000000000..9e1ea45253 --- /dev/null +++ b/nemo_rl/models/generation/sglang/config.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, NotRequired, TypedDict + +from nemo_rl.models.generation.interfaces import GenerationConfig + + +class SglangSpecificArgs(TypedDict): + """SGLang-specific configuration arguments. + + Most fields below map directly to SGLang's ServerArgs (see: + https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py). 
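+
+    A minimal, illustrative sglang_cfg (values taken from the example recipes
+    added in this change; adjust for your model and hardware):
+
+        sglang_cfg:
+          model_path: Qwen/Qwen2.5-Math-1.5B-Instruct
+          gpus_per_server: 8
+          dtype: ${policy.precision}
+          context_length: 512
+          mem_fraction_static: 0.7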
+ """ + + model_path: NotRequired[str] + gpus_per_server: NotRequired[int] + random_seed: NotRequired[int] + skip_tokenizer_init: NotRequired[bool] + disable_cuda_graph: NotRequired[bool] + disable_radix_cache: NotRequired[bool] + disable_cuda_graph_padding: NotRequired[bool] + enable_nccl_nvls: NotRequired[bool] + disable_outlines_disk_cache: NotRequired[bool] + disable_custom_all_reduce: NotRequired[bool] + disable_overlap_schedule: NotRequired[bool] + enable_mixed_chunk: NotRequired[bool] + enable_dp_attention: NotRequired[bool] + enable_ep_moe: NotRequired[bool] + enable_torch_compile: NotRequired[bool] + torch_compile_max_bs: NotRequired[int] + cuda_graph_max_bs: NotRequired[int | None] + cuda_graph_bs: NotRequired[list[int] | None] + torchao_config: NotRequired[str] + enable_nan_detection: NotRequired[bool] + enable_p2p_check: NotRequired[bool] + triton_attention_reduce_in_fp32: NotRequired[bool] + triton_attention_num_kv_splits: NotRequired[int] + num_continuous_decode_steps: NotRequired[int] + enable_memory_saver: NotRequired[bool] + allow_auto_truncate: NotRequired[bool] + attention_backend: NotRequired[str | None] + enable_multimodal: NotRequired[bool] + sampling_backend: NotRequired[str | None] + context_length: NotRequired[int | None] + mem_fraction_static: NotRequired[float | None] + max_running_requests: NotRequired[int | None] + chunked_prefill_size: NotRequired[int | None] + max_prefill_tokens: NotRequired[int] + schedule_policy: NotRequired[str] + schedule_conservativeness: NotRequired[float] + cpu_offload_gb: NotRequired[int] + dtype: NotRequired[str] + kv_cache_dtype: NotRequired[str] + dp_size: NotRequired[int] # only used for dp attention + pp_size: NotRequired[int] # pipeline parallel size + ep_size: NotRequired[int] + # lora + enable_lora: NotRequired[bool | None] + max_lora_rank: NotRequired[int | None] + lora_target_modules: NotRequired[list[str] | None] + lora_paths: NotRequired[list[str] | None] + max_loaded_loras: NotRequired[int] + max_loras_per_batch: NotRequired[int] + lora_backend: NotRequired[str] + # logging + log_level: NotRequired[str] + log_level_http: NotRequired[str | None] + log_requests: NotRequired[bool] + log_requests_level: NotRequired[int] + show_time_cost: NotRequired[bool] + enable_metrics: NotRequired[bool] # Exports Prometheus-like metrics + # The interval (in decoding iterations) to log throughput + # and update prometheus metrics + decode_log_interval: NotRequired[int] + # Extra loader arguments + enable_multithread_load: NotRequired[bool] + enable_fast_load: NotRequired[bool] + # Server warmup + skip_server_warmup: NotRequired[bool] + + +class SGLangConfig(GenerationConfig): + """Configuration for SGLang runtime.""" + + sglang_cfg: SglangSpecificArgs + sglang_kwargs: NotRequired[dict[str, Any]] diff --git a/nemo_rl/models/generation/sglang/sglang_generation.py b/nemo_rl/models/generation/sglang/sglang_generation.py new file mode 100644 index 0000000000..85122779ee --- /dev/null +++ b/nemo_rl/models/generation/sglang/sglang_generation.py @@ -0,0 +1,384 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import ( + Any, + Optional, + Union, +) + +import numpy as np +import ray + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict, SlicedDataDict +from nemo_rl.distributed.named_sharding import NamedSharding +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.distributed.worker_groups import RayWorkerBuilder, RayWorkerGroup +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationInterface, + GenerationOutputSpec, +) +from nemo_rl.models.generation.sglang.config import SGLangConfig + +# Global thresholds for top_k and top_p validation. +# While top-k/p are not supported, these values allow for token filtering while the logprobs should be compatible. +# See https://github.com/NVIDIA-NeMo/RL/issues/69 and https://github.com/NVIDIA-NeMo/RL/issues/237 for more details. +TOP_K_THRESHOLD = 8000 # Allow top_k >= 8000 (effectively no filtering) +TOP_P_THRESHOLD = 0.99 # Allow top_p >= 0.99 (close to 1.0) + +logger = logging.getLogger(__name__) + + +class SGLangGeneration(GenerationInterface): + def __init__( + self, + cluster: RayVirtualCluster, + config: SGLangConfig, + name_prefix: str = "sglang_policy", + workers_per_node: Optional[Union[int, list[int]]] = None, + ): + """Initialize a SGLang policy with distributed workers. + + SGLang server manages TP/PP internally, but we still need to: + 1. Manage data parallel distribution across multiple servers + 2. Assign GPU bundles to each server + + Each server will see logical GPUs 0-N (via CUDA_VISIBLE_DEVICES set by Ray), + so we just need to tell SGLang how many GPUs to use (tp_size). + """ + # Store config + self.cfg = config + self.sglang_cfg = config["sglang_cfg"] + + gpus_per_server = self.sglang_cfg.get("gpus_per_server", None) + if gpus_per_server is None: + raise ValueError("gpus_per_server must be set in SGLangConfig.sglang_cfg.") + + # Calculate number of servers based on available resources + total_gpus = cluster.world_size() + num_servers = total_gpus // gpus_per_server + + if num_servers == 0: + raise ValueError( + f"Not enough GPUs. Need at least {gpus_per_server} GPUs per server, " + f"but only have {total_gpus} GPUs total." + ) + + if total_gpus % gpus_per_server != 0: + logger.warning( + f"[WARNING] Total GPUs ({total_gpus}) is not divisible by GPUs per server ({gpus_per_server}). " + f"Will use {num_servers} servers, leaving {total_gpus % gpus_per_server} GPUs unused." + ) + + self.dp_size = num_servers + self.gpus_per_server = gpus_per_server + + # Create sharding annotations + # Even though SGLang manages TP internally, we include it in the layout to support + # RayWorkerGroup's worker management (which creates one worker per GPU bundle). + # The TP dimension becomes a "free axis" in run_all_workers_sharded_data, ensuring + # only the primary workers (TP rank 0) are called. 
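+        # Illustrative example: with 8 total GPUs and gpus_per_server=4 this gives
+        # num_servers=2 and a 2x4 layout [[0, 1, 2, 3], [4, 5, 6, 7]] over the
+        # ("data_parallel", "tensor_parallel") axes.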
+ total_workers = num_servers * gpus_per_server + self.sharding_annotations = NamedSharding( + layout=np.arange(total_workers).reshape(num_servers, gpus_per_server), + names=["data_parallel", "tensor_parallel"], + ) + + # Initialize placement groups + # For SGLang, we use PACK strategy to keep bundles together + # colocated is always at top level, not in sglang_cfg + strategy = None if self.cfg["colocated"]["enabled"] else "PACK" + cluster._init_placement_groups( + strategy=strategy, + use_unified_pg=False, # SGLang servers don't need cross-node model parallelism + ) + + # Create worker builder for SGLangGenerationWorker + worker_cls = ( + "nemo_rl.models.generation.sglang.sglang_worker.SGLangGenerationWorker" + ) + worker_builder = RayWorkerBuilder(worker_cls, config) + + env_vars = {} + global_cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if global_cvd: + # Explicitly pass CUDA_VISIBLE_DEVICES to workers via env_vars + # This ensures all workers see the same global value, even though + env_vars["CUDA_VISIBLE_DEVICES"] = global_cvd + + # Allocate bundles for each server + # Each server gets consecutive bundles + bundle_indices_list = self._allocate_bundles_for_servers( + cluster, num_servers, gpus_per_server + ) + + # Create worker group with explicit bundle allocation + self.worker_group = RayWorkerGroup( + cluster, + worker_builder, + name_prefix=name_prefix, + bundle_indices_list=bundle_indices_list, + sharding_annotations=self.sharding_annotations, + env_vars=env_vars, + ) + + # Verify data parallel size matches + assert self.dp_size == self.worker_group.dp_size, ( + f"Data parallel size mismatch. Expected {self.dp_size}, got {self.worker_group.dp_size}" + ) + + # Used to track the round-robin selection of worker groups for generate_async + self.current_generate_dp_shard_idx = 0 + + def _allocate_bundles_for_servers( + self, + cluster: RayVirtualCluster, + num_servers: int, + gpus_per_server: int, + ) -> list[tuple[int, list[int]]]: + """Allocate GPU bundles to each SGLang server. + + Each server gets consecutive bundles within the same placement group (node). + Ray will automatically set CUDA_VISIBLE_DEVICES so each server sees logical GPUs 0, 1, 2, ..., gpus_per_server-1. 
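+
+        For example (illustrative): with two 8-GPU placement groups and
+        gpus_per_server=4, this returns
+        [(0, [0, 1, 2, 3]), (0, [4, 5, 6, 7]), (1, [0, 1, 2, 3]), (1, [4, 5, 6, 7])].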
+ + Args: + cluster: The Ray virtual cluster + num_servers: Total number of SGLang servers to create + gpus_per_server: Number of GPUs each server needs + + Returns: + List of (node_idx, [bundle_indices]) tuples for each server + """ + placement_groups = cluster.get_placement_groups() + + if not placement_groups: + raise ValueError("No placement groups available in the cluster") + + bundle_indices_list = [] + + # Each server's bundles must be within the same placement group (node) + server_idx = 0 + for pg_idx, pg in enumerate(placement_groups): + if pg.bundle_count == 0: + continue + + # Calculate how many servers can fit in this placement group + num_servers_in_pg = pg.bundle_count // gpus_per_server + + # Allocate servers within this placement group + for local_server_idx in range(num_servers_in_pg): + if server_idx >= num_servers: + break + + # Calculate which bundles this server gets (consecutive within the PG) + start_bundle = local_server_idx * gpus_per_server + server_bundles = list( + range(start_bundle, start_bundle + gpus_per_server) + ) + + # Each server gets a tuple of (node_idx, [local_bundle_indices]) + bundle_indices_list.append((pg_idx, server_bundles)) + server_idx += 1 + + if server_idx >= num_servers: + break + + if len(bundle_indices_list) < num_servers: + total_available = sum( + pg.bundle_count // gpus_per_server + for pg in placement_groups + if pg.bundle_count > 0 + ) + raise ValueError( + f"Not enough bundles to allocate all {num_servers} servers. " + f"Only {total_available} servers can be allocated " + f"(each server needs {gpus_per_server} GPUs)." + ) + + return bundle_indices_list + + def init_collective( + self, ip: str, port: int, world_size: int, *, train_world_size: int + ) -> list[ray.ObjectRef]: + """Initialize the collective communication. + + TODO: if weight updates via NCCL are needed in the future. 
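+
+        Currently a no-op that returns an empty list of futures: this backend streams
+        weights over HTTP during refit (see refit_policy_generation) instead of using
+        NCCL collectives.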
+ """ + return [] + + def generate( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data using SGLang.""" + assert isinstance(data, BatchedDataDict), ( + f"data must be a BatchedDataDict, got type: {type(data)}" + ) + assert "input_ids" in data and "input_lengths" in data, ( + "input_ids and input_lengths are required in data for SGLang generation" + ) + + # Shard the data across the data parallel servers + dp_size = self.sharding_annotations.get_axis_size("data_parallel") + sharded_data: list[SlicedDataDict] = data.shard_by_batch_size( + dp_size, allow_uneven_shards=True + ) + future_bundle = self.worker_group.run_all_workers_sharded_data( + "generate", + data=sharded_data, + in_sharded_axes=["data_parallel"], + replicate_on_axes=None, + output_is_replicated=None, + common_kwargs={"greedy": greedy}, + ) + + # Get results from the workers + results = self.worker_group.get_all_worker_results(future_bundle) + + # Combine results from all servers + combined: BatchedDataDict[GenerationOutputSpec] = BatchedDataDict.from_batches( + results, pad_value_dict={"output_ids": self.cfg["_pad_token_id"]} + ) + + # Verify the output has all required fields + required_keys = [ + "output_ids", + "generation_lengths", + "unpadded_sequence_lengths", + "logprobs", + ] + missing_keys = [key for key in required_keys if key not in combined] + if missing_keys: + raise ValueError( + f"Missing required keys for GenerationOutputSpec: {missing_keys}" + ) + + return combined + + def prepare_refit_info(self, state_dict_info: dict[str, Any]) -> None: + pass + + def update_weights_via_ipc_zmq(self) -> list[ray.ObjectRef]: + return [] + + def update_weights_from_collective(self) -> list[ray.ObjectRef]: + return [] + + def get_sglang_server_urls(self) -> list[str]: + """Get base URLs of all SGLang servers. + + Returns: + List of base URLs (e.g., ["http://localhost:30000", "http://localhost:30001"]) + """ + if not self.worker_group or not self.worker_group.workers: + raise RuntimeError("Worker group is not initialized") + + # Get base URLs from all workers (only primary workers, TP rank 0) + # Use run_rank_0_only_axes to only get URLs from primary workers + futures = self.worker_group.run_all_workers_single_data( + "get_base_url", + run_rank_0_only_axes=["tensor_parallel"], + ) + urls = ray.get(futures) + # Filter out None values and return unique URLs + return list(set(url for url in urls if url is not None)) + + def get_sglang_url_to_gpu_uuids(self) -> dict[str, list[str]]: + """Get mapping from SGLang server URL to list of GPU UUIDs it uses. 
+ + Returns: + Dict mapping server URL to list of GPU UUIDs + e.g., {"http://localhost:30000": ["GPU-aaa", "GPU-bbb"], ...} + """ + if not self.worker_group or not self.worker_group.workers: + raise RuntimeError("Worker group is not initialized") + + # Get base URLs and GPU UUIDs from all primary workers (TP rank 0) + futures_url = self.worker_group.run_all_workers_single_data( + "get_base_url", + run_rank_0_only_axes=["tensor_parallel"], + ) + futures_uuids = self.worker_group.run_all_workers_single_data( + "get_gpu_uuids", + run_rank_0_only_axes=["tensor_parallel"], + ) + + urls = ray.get(futures_url) + uuids_list = ray.get(futures_uuids) + + # Create mapping + url_to_uuids = {} + for url, uuids in zip(urls, uuids_list): + if url is not None and uuids is not None: + url_to_uuids[url] = uuids + + return url_to_uuids + + def prepare_for_generation(self, *args: Any, **kwargs: Any) -> bool: + """Wake workers up for colocated inference.""" + pass + + def finish_generation(self, *args: Any, **kwargs: Any) -> bool: + """Sleep workers and reset prefix cache.""" + pass + + def shutdown(self) -> bool: + """Shut down all SGLang workers and clean up resources.""" + try: + # Use the worker group's shutdown method with the worker's cleanup method + return self.worker_group.shutdown(cleanup_method="shutdown") + except Exception as e: + logger.error(f"Error during SGLang policy shutdown: {e}") + return False + + def __del__(self) -> None: + """Shuts down the worker groups when the object is deleted or is garbage collected. + + This is an extra safety net in case the user forgets to call shutdown() and the pointer to + the object is lost due to leaving a function scope. It's always recommended that the + user calls shutdown(). + """ + self.shutdown() + + def invalidate_kv_cache(self) -> bool: + """Invalidate KV cache before weight updates (Megatron-style). + + This flushes the cache before weight updates to clear stale cache. + Only primary workers (TP rank 0, model owners) will flush their cache. + + Returns: + bool: True if all caches were flushed successfully, False otherwise + """ + try: + futures = self.worker_group.run_all_workers_single_data( + "invalidate_kv_cache", + run_rank_0_only_axes=["tensor_parallel"], + ) + results = ray.get(futures) + results = [r for r in results if r is not None] + success = all(result for result in results) if results else True + if success: + logger.info( + "[sglang refit] All SGLang server caches flushed successfully" + ) + else: + logger.warning( + "[sglang refit] WARNING - Some SGLang server caches failed to flush" + ) + return success + except Exception as e: + logger.error(f"[sglang refit] Error flushing SGLang caches: {e}") + return False diff --git a/nemo_rl/models/generation/sglang/sglang_worker.py b/nemo_rl/models/generation/sglang/sglang_worker.py new file mode 100644 index 0000000000..6f15cba1fc --- /dev/null +++ b/nemo_rl/models/generation/sglang/sglang_worker.py @@ -0,0 +1,804 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import multiprocessing +import os +import time +from typing import Any, Optional + +import aiohttp +import ray +import requests +import torch + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import _get_free_port_local, _get_node_ip_local +from nemo_rl.distributed.worker_group_utils import get_nsight_config_if_pattern_matches +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationOutputSpec, + verify_right_padding, +) +from nemo_rl.models.generation.sglang.config import SGLangConfig +from nemo_rl.models.generation.sglang.utils import AsyncLoopThread +from nemo_rl.utils.nsys import wrap_with_nvtx_name + +logger = logging.getLogger(__name__) + + +def _require_sglang(): + """Import `sglang` lazily so test collection works without the optional extra.""" + try: + from sglang.srt.entrypoints.http_server import launch_server + from sglang.srt.server_args import ServerArgs + from sglang.srt.utils import kill_process_tree + except ModuleNotFoundError as e: # pragma: no cover + raise ModuleNotFoundError( + "Optional dependency `sglang` is required for the SGLang generation backend.\n" + "Install it via the project extra (e.g. `uv run --extra sglang ...`) to use " + "`SGLangGenerationWorker`." + ) from e + + return launch_server, ServerArgs, kill_process_tree + + +@ray.remote( + runtime_env={**get_nsight_config_if_pattern_matches("sglang_generation_worker")} +) # pragma: no cover +class SGLangGenerationWorker: + def __repr__(self) -> str: + """Customizes the actor's prefix in the Ray logs. + + This makes it easier to identify which worker is producing specific log messages. + """ + return f"{self.__class__.__name__}" + + @staticmethod + def configure_worker( + num_gpus: int | float, bundle_indices: Optional[tuple[int, list[int]]] = None + ) -> tuple[dict[str, Any], dict[str, str], dict[str, Any]]: + """Provides complete worker configuration for SGLang server. + + This method configures the worker based on bundle_indices which tells us + how many GPUs this server should use. + + Args: + num_gpus: Original GPU allocation for this worker based on the placement group + bundle_indices: Tuple of (node_idx, local_bundle_indices) for this server + + Returns: + tuple with complete worker configuration: + - 'resources': Resource allocation (e.g., num_gpus) + - 'env_vars': Environment variables for this worker + - 'init_kwargs': Parameters to pass to __init__ of the worker + """ + # Initialize configuration + resources: dict[str, Any] = {"num_gpus": num_gpus} + init_kwargs: dict[str, Any] = {} + env_vars: dict[str, str] = {} + + local_bundle_indices = None + if bundle_indices is not None: + node_idx = bundle_indices[0] + local_bundle_indices = bundle_indices[1] + init_kwargs["bundle_indices"] = local_bundle_indices + + # Calculate a unique seed from node_idx and bundle_indices + if len(local_bundle_indices) == 1: + seed = node_idx * 1024 + local_bundle_indices[0] + else: + bundle_id = local_bundle_indices[0] // len(local_bundle_indices) + seed = node_idx * 1024 + bundle_id + + init_kwargs["seed"] = seed + + # Check if this worker is part of a parallel group (multiple GPUs per server). + # A worker with local rank =0 owns the server(local_bundle_indices is not None ) + # otherwise it is a placeholder for Ray's resource management (local_bundle_indices is None). 
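+        # Illustrative example: local_bundle_indices=[0, 1, 2, 3] marks the primary
+        # worker that owns a 4-GPU server, while local_bundle_indices=None marks a
+        # placeholder worker that only reserves its GPU bundle for Ray.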
+ is_part_of_parallel_workers = ( + local_bundle_indices is not None and len(local_bundle_indices) > 1 + ) or local_bundle_indices is None + + if is_part_of_parallel_workers: + # For parallel workers, we manage GPU assignment via base_gpu_id + # All workers see the same global CUDA_VISIBLE_DEVICES, but use different + # logical GPU ranges via base_gpu_id + resources["num_gpus"] = 0 + env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1" + init_kwargs["fraction_of_gpus"] = num_gpus + else: + env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1" + + return resources, env_vars, init_kwargs + + def __init__( + self, + config: SGLangConfig, + bundle_indices: Optional[list[int]] = None, + fraction_of_gpus: float = 1.0, + seed: Optional[int] = None, + ): + """Initialize a SGLang worker for distributed inference. + + Args: + config: Configuration dictionary for the policy + bundle_indices: List of local bundle indices for this server. + The length of this list determines tp_size (number of GPUs per server). + Only needed for the first worker in each server group (model owner). + fraction_of_gpus: Fraction of GPUs to use for this worker + seed: Random seed for initialization, if None, then defaults to the config's seed + """ + self.cfg = config + self.is_model_owner = bundle_indices is not None + self.global_rank = int(os.environ.get("RANK", "0")) + self.sglang_cfg = config["sglang_cfg"] + + # Create a dedicated event loop thread for async operations + # there will be issues if we use the event loop in the main thread + self.async_loop_thread = AsyncLoopThread() + + # temp: Maximum concurrent requests per server + # we may remove this limit in the future + self.max_concurrent_requests = config.get("max_concurrent_requests", 999999) + + # Only the primary worker (local_rank=0) in each server group starts the SGLang server + # Secondary workers (local_rank!=0) just returns + if not self.is_model_owner: + return + + # `sglang` is an optional dependency; import only when we actually start a server. 
+ _, ServerArgs, _ = _require_sglang() + + # Determine tp_size from bundle_indices length + tp_size = len(bundle_indices) + + base_gpu_id = bundle_indices[0] if bundle_indices else 0 + + # Get the global CUDA_VISIBLE_DEVICES (all engines see the same global value) + global_cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + logger.info( + f"[SGLang Server] Rank {self.global_rank}: " + f"base_gpu_id={base_gpu_id}, tp_size={tp_size}, " + f"bundle_indices={bundle_indices}, global_cvd={global_cvd}" + ) + + # Get current node IP and a free port for the server + node_ip = _get_node_ip_local() + free_port = _get_free_port_local() + + # Build SGLang server arguments + kwargs = { + "model_path": self.sglang_cfg["model_path"], + "trust_remote_code": True, + "random_seed": seed + if seed is not None + else self.sglang_cfg.get("random_seed", 1), + # Memory settings + "enable_memory_saver": self.sglang_cfg["enable_memory_saver"], + "gpu_id_step": 1, + "base_gpu_id": base_gpu_id, + # Parallel settings + "tp_size": tp_size, + "dp_size": self.sglang_cfg["dp_size"], + "pp_size": self.sglang_cfg["pp_size"], + "ep_size": self.sglang_cfg["ep_size"], + # Always skip warmup to prevent warmup timeout + "skip_server_warmup": self.sglang_cfg.get("skip_server_warmup", True), + # Server network settings - listen on all interfaces, use the free port we found + "host": "0.0.0.0", + "port": free_port, + "torchao_config": "", + } + + for key in [ + "dtype", + "kv_cache_dtype", + "context_length", + "max_running_requests", + "chunked_prefill_size", + "max_prefill_tokens", + "schedule_policy", + "schedule_conservativeness", + "cpu_offload_gb", + "log_level", + "mem_fraction_static", + "allow_auto_truncate", + ]: + if key in self.sglang_cfg: + kwargs[key] = self.sglang_cfg[key] + + server_args = ServerArgs(**kwargs) + # Save server_args and base_url for use in generate() and _make_request() + self.server_args = server_args + self.base_url = f"http://{node_ip}:{free_port}" + + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Starting on {self.base_url}, CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}, base_gpu_id: {base_gpu_id}" + ) + + self.session = None + self.connector = None + + self.server_process = self._launch_server_process(server_args) + + def get_base_url(self) -> str: + """Get the base URL of this SGLang server.""" + return self.base_url + + def invalidate_kv_cache(self) -> bool: + """Invalidate KV cache before weight updates (Megatron-style). + + This flushes the cache before weight updates to clear stale cache. + Uses retry logic to handle cases where there are pending requests. 
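+        Concretely, it polls the server's /flush_cache endpoint, sleeping 1 second
+        between attempts, until the flush succeeds or the attempt budget is exhausted.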
+ + Returns: + bool: True if flush was successful, False otherwise + """ + if not self.is_model_owner: + return True + + url = f"{self.base_url}/flush_cache" + max_attempts = 60 + connection_retry_limit = 5 + + # flush_cache will not return status_code 200 when there are pending requests + for attempt in range(max_attempts): + try: + response = requests.get(url, timeout=10) + if response.status_code == 200: + if attempt > 0: + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Cache flushed successfully " + f"(attempt {attempt + 1})" + ) + return True + except requests.exceptions.ConnectionError: + # Server might not be ready yet - only retry for first few attempts + if attempt >= connection_retry_limit: + logger.warning( + f"[SGLang Worker] Rank {self.global_rank} Connection failed after " + f"{connection_retry_limit} attempts" + ) + return False + except Exception as e: + # For other errors, log and retry (except on last attempt) + if attempt == max_attempts - 1: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Failed to flush cache after " + f"{max_attempts} attempts: {e}" + ) + return False + + time.sleep(1) + + # All attempts exhausted without success + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Timeout: Cache flush failed after " + f"{max_attempts} attempts. Server may have pending requests." + ) + return False + + def get_gpu_uuids(self) -> list[str]: + """Get list of GPU UUIDs used by this SGLang server. + + Returns: + List of GPU UUIDs (e.g., ["GPU-xxxxx", "GPU-yyyyy"]) + """ + from nemo_rl.utils.nvml import get_device_uuid + + # Get all GPU UUIDs used by this server + # SGLang server uses GPUs starting from base_gpu_id with tp_size GPUs + gpu_uuids = [] + for i in range(self.server_args.tp_size): + gpu_id = self.server_args.base_gpu_id + i + uuid = get_device_uuid(gpu_id) + gpu_uuids.append(uuid) + + return gpu_uuids + + def _merge_stop_strings(self, batch_stop_strings): + """Merge stop strings from config and batch. + + Args: + batch_stop_strings: List of stop strings from batch (one per sample) + + Returns: + List of merged stop strings (one per sample) + """ + stop_set: set[str] = set() + + # Add stop strings from config + if self.cfg.get("stop_strings"): + stop_set.update(self.cfg["stop_strings"]) + + # Merge stop strings from batch + merged_stop_strings = [] + for sample_ss in batch_stop_strings: + sample_stop_set = stop_set.copy() + if sample_ss: + if isinstance(sample_ss, str): + sample_stop_set.add(sample_ss) + elif isinstance(sample_ss, list): + sample_stop_set.update(sample_ss) + + merged_stop_strings.append( + list(sample_stop_set) if sample_stop_set else None + ) + + return merged_stop_strings + + def _build_sampling_params( + self, + *, + greedy: bool, + stop_strings, + max_new_tokens: Optional[int] = None, + input_len: Optional[int] = None, + context_length: Optional[int] = None, + sample_index: Optional[int] = None, + ) -> dict[str, Any]: + """Build sampling parameters dictionary for SGLang API. 
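+
+        Example of a returned dict (illustrative, assuming greedy decoding,
+        max_new_tokens=512, top_p left at its default, and no stop_token_ids
+        configured): {"temperature": 0.0, "top_p": 1.0, "max_new_tokens": 512,
+        "top_k": 1}.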
+ + Args: + greedy: Whether to use greedy decoding (temperature=0.0) + stop_strings: Merged stop strings (not used here, handled per sample) + max_new_tokens: Override max_new_tokens from config if provided + input_len: Input length for this sample (used for context_length adjustment) + context_length: Maximum context length (if provided, adjusts max_new_tokens) + sample_index: Sample index (used for warning messages, 0-indexed) + + Returns: + Dictionary of sampling parameters compatible with SGLang API + """ + top_k_cfg = self.cfg.get("top_k") + top_k_val = 1 if greedy else (top_k_cfg if top_k_cfg is not None else -1) + temperature = 0.0 if greedy else self.cfg["temperature"] + + base_max_tokens = ( + max_new_tokens if max_new_tokens is not None else self.cfg["max_new_tokens"] + ) + + # TODO: check if this is needed + final_max_tokens = base_max_tokens + if context_length is not None and input_len is not None: + max_allowed_new_tokens = max(0, context_length - input_len - 1) + if base_max_tokens > max_allowed_new_tokens: + final_max_tokens = max_allowed_new_tokens + if sample_index == 0: + logger.warning( + f"[SGLang Worker] Rank {self.global_rank} Warning: " + f"Sample {sample_index} input length ({input_len}) + max_new_tokens ({base_max_tokens}) " + f"would exceed context_length ({context_length}). " + f"Reducing max_new_tokens to {final_max_tokens} for this sample." + ) + + # Build sampling params dict + sampling_params = { + "temperature": temperature, + "top_p": self.cfg.get("top_p", 1.0), + "max_new_tokens": final_max_tokens, + } + + if top_k_val != -1: + sampling_params["top_k"] = top_k_val + + stop_token_ids = self.cfg.get("stop_token_ids") + if stop_token_ids is not None: + sampling_params["stop_token_ids"] = stop_token_ids + + return sampling_params + + async def _ensure_session(self): + if self.session is None: + # Create connector with connection pool limit + self.connector = aiohttp.TCPConnector(limit=512, limit_per_host=512) + # Create session with timeout + timeout = aiohttp.ClientTimeout(total=300) # 5 minutes timeout + self.session = aiohttp.ClientSession( + connector=self.connector, timeout=timeout + ) + return self.session + + async def _generate_single_sample( + self, + input_ids: list[int], + sampling_params: dict[str, Any], + stop_string: Optional[str] = None, + ) -> tuple[list[int], list[float]]: + """Generate a single sample using SGLang API (async function). + + Args: + input_ids: List of input token IDs (without padding) + sampling_params: Dictionary of sampling parameters (temperature, top_p, max_new_tokens, etc.) 
+ stop_string: Optional stop string for this sample + + Returns: + Tuple of (generated_tokens, logprobs): + - generated_tokens: List of generated token IDs + - logprobs: List of log probabilities for generated tokens + """ + # Prepare payload for SGLang API + # Note: stop should be in sampling_params, not in payload top level + # TODO: double check this + if stop_string is not None: + # stop can be a string or list of strings + sampling_params = sampling_params.copy() # Don't modify the original + sampling_params["stop"] = stop_string + + payload = { + "sampling_params": sampling_params, + "return_logprob": True, + "input_ids": input_ids, + } + + url = f"{self.base_url}/generate" + headers = { + "Content-Type": "application/json; charset=utf-8", + } + + session = await self._ensure_session() + + try: + async with session.post(url, json=payload, headers=headers) as response: + response.raise_for_status() + result = await response.json() + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Request failed for input_len={len(input_ids)}: {e}" + ) + raise + + # Extract generated tokens and logprobs + meta_info = result.get("meta_info", {}) + output_token_logprobs = meta_info.get("output_token_logprobs", []) + + if output_token_logprobs: + new_tokens = [item[1] for item in output_token_logprobs] + new_logprobs = [item[0] for item in output_token_logprobs] + else: + # Fallback: empty if token logprobs not available + new_tokens = [] + new_logprobs = [] + + return new_tokens, new_logprobs + + async def _generate_async(self, tasks): + """Execute generation tasks with concurrency control. + + TEMP: Uses a semaphore to limit the number of concurrent requests per server, preventing server overload. + A router based solution is preffered in the future. + """ + semaphore = asyncio.Semaphore(self.max_concurrent_requests) + + async def wrap(idx, coro): + async with semaphore: + try: + result = await coro + return idx, result + except Exception as e: + raise + + wrapped = [wrap(i, t) for i, t in enumerate(tasks)] + results = [None] * len(tasks) + count = 0 + + for fut in asyncio.as_completed(wrapped): + idx, value = await fut + results[idx] = value + count += 1 + if count % 50 == 0 or count == len(tasks): + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} Completed {count}/{len(tasks)} tasks" + ) + + return results + + def _launch_server_process(self, server_args: Any) -> multiprocessing.Process: + """Launch the SGLang server process and wait for it to be ready.""" + # Ensure `sglang` is importable when we actually start a server. 
+ launch_server, _, kill_process_tree = _require_sglang() + p = multiprocessing.Process(target=launch_server, args=(server_args,)) + p.start() + + # Wait for server to be ready by checking health endpoint + # Use the base_url we stored earlier + headers = { + "Content-Type": "application/json; charset=utf-8", + } + + max_wait_time = 300 # 5 minutes timeout + start_time = time.time() + with requests.Session() as session: + while True: + if time.time() - start_time > max_wait_time: + kill_process_tree(p.pid) + raise TimeoutError( + f"[SGLang Server] Rank {self.global_rank} Server failed to start within {max_wait_time}s" + ) + try: + response = session.get( + f"{self.base_url}/health_generate", headers=headers, timeout=10 + ) + if response.status_code == 200: + logger.info( + f"[SGLang Server] Rank {self.global_rank} Server is ready at {self.base_url}" + ) + break + except requests.RequestException: + pass + + if not p.is_alive(): + raise RuntimeError( + f"[SGLang Server] Rank {self.global_rank} Server process terminated unexpectedly." + ) + + time.sleep(2) + return p + + @wrap_with_nvtx_name("sglang_genertion_worker/generate") + def generate( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data using SGLang generation. + + Args: + data: BatchedDataDict containing input_ids and input_lengths tensors + greedy: Whether to use greedy decoding instead of sampling + + Returns: + BatchedDataDict conforming to GenerationOutputSpec: + - output_ids: input + generated token IDs with proper padding + - logprobs: Log probabilities for tokens + - generation_lengths: Lengths of each response + - unpadded_sequence_lengths: Lengths of each input + generated sequence + """ + # Handle empty input case + if len(data["input_ids"]) == 0: + return BatchedDataDict[GenerationOutputSpec]( + { + "output_ids": torch.zeros((0, 0), dtype=torch.long), + "logprobs": torch.zeros((0, 0), dtype=torch.float), + "generation_lengths": torch.zeros(0, dtype=torch.long), + "unpadded_sequence_lengths": torch.zeros(0, dtype=torch.long), + } + ) + + input_ids = data["input_ids"] + input_lengths = data["input_lengths"] + batch_stop_strings = data.get("stop_strings", [None] * len(input_lengths)) + stop_strings = self._merge_stop_strings(batch_stop_strings) + batch_size = len(input_lengths) + pad_token_id = self.cfg["_pad_token_id"] + + # Verify inputs have correct padding + verify_right_padding(data, pad_value=pad_token_id) + + # Original input length with padding + padded_input_length = input_ids.size(1) + + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} batch_size: {batch_size}, padded_input_length: {padded_input_length}" + ) + + if batch_size == 0: + raise ValueError("Empty batch received") + + context_length = self.sglang_cfg.get("context_length", None) + + # Create async tasks for all samples + tasks = [] + for i in range(batch_size): + input_len = input_lengths[i].item() + + # Truncate input if it exceeds context_length + if context_length is not None and input_len >= context_length: + input_len = context_length - 1 + + valid_input_ids = input_ids[i, :input_len].tolist() + + # Build sampling params for this sample (with context_length adjustment) + sample_sampling_params = self._build_sampling_params( + greedy=greedy, + stop_strings=stop_strings, + max_new_tokens=None, + input_len=input_len, + context_length=context_length, + sample_index=i, + ) + + tasks.append( + self._generate_single_sample( + input_ids=valid_input_ids, + 
sampling_params=sample_sampling_params, + stop_string=stop_strings[i], + ) + ) + + # Execute all requests concurrently using the dedicated event loop thread + try: + all_results = self.async_loop_thread.run(self._generate_async(tasks)) + except Exception as e: + raise + + total_generated_tokens = sum(len(tokens) for tokens, _ in all_results) + avg_generation_length = ( + total_generated_tokens / batch_size if batch_size > 0 else 0 + ) + + # Process results + output_ids_list = [] + logprobs_list = [] + generation_lengths_list = [] + unpadded_sequence_lengths_list = [] + max_length = 0 + + # First pass: calculate max_length + for i, (new_tokens, new_logprobs) in enumerate(all_results): + input_len = input_lengths[i].item() + generation_length = len(new_tokens) + unpadded_length = input_len + generation_length + max_length = max(max_length, unpadded_length) + + total_length = max(max_length, padded_input_length) + + for i, (new_tokens, new_logprobs) in enumerate(all_results): + input_len = input_lengths[i].item() + generation_length = len(new_tokens) + unpadded_length = input_len + generation_length + + full_output = torch.full( + (total_length,), pad_token_id, dtype=input_ids.dtype + ) + full_output[:input_len] = input_ids[i][:input_len] + + # Add generated tokens after the original input + if new_tokens: + full_output[input_len : input_len + len(new_tokens)] = torch.tensor( + new_tokens, dtype=input_ids.dtype + ) + + # Construct logprobs: zeros for input tokens, actual logprobs for generated tokens + full_logprobs = torch.zeros(total_length, dtype=torch.float32) + if new_logprobs: + for idx, logprob in enumerate(new_logprobs): + position = input_len + idx + full_logprobs[position] = logprob + + output_ids_list.append(full_output) + logprobs_list.append(full_logprobs) + generation_lengths_list.append(generation_length) + unpadded_sequence_lengths_list.append(unpadded_length) + + # Stack into tensors + output_ids = torch.stack(output_ids_list) + logprobs = torch.stack(logprobs_list) + generation_lengths = torch.tensor(generation_lengths_list, dtype=torch.long) + unpadded_sequence_lengths = torch.tensor( + unpadded_sequence_lengths_list, dtype=torch.long + ) + logger.debug( + f"[SGLang Worker] Rank {self.global_rank} Generated {total_generated_tokens} tokens across {batch_size} samples (avg: {avg_generation_length:.1f} tokens/sample)" + ) + return BatchedDataDict[GenerationOutputSpec]( + { + "output_ids": output_ids, + "generation_lengths": generation_lengths, + "unpadded_sequence_lengths": unpadded_sequence_lengths, + "logprobs": logprobs, + } + ) + + def sleep(self): + # TODO + pass + + def wake_up(self, **kwargs): + # TODO + pass + + def shutdown(self) -> bool: + """Shutdown the SGLang server process and cleanup async resources. + + Returns: + bool: True if shutdown was successful, False otherwise + """ + if not self.is_model_owner: + if hasattr(self, "async_loop_thread"): + try: + self.async_loop_thread.shutdown() + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Async loop thread shut down." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error shutting down async loop thread: {e}" + ) + return True + + try: + # Only model owners started a server process; they require sglang for shutdown. 
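+            # Shutdown order: close the aiohttp session, shut down the async loop thread, then terminate the server process tree.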
+ _, _, kill_process_tree = _require_sglang() + if hasattr(self, "session") and self.session is not None: + try: + + async def close_session(): + await self.session.close() + if self.connector is not None: + await self.connector.close() + + self.async_loop_thread.run(close_session()) + logger.info( + f"[SGLang Worker] Rank {self.global_rank} aiohttp session closed." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error closing aiohttp session: {e}" + ) + + # Shutdown async loop thread after session cleanup + if hasattr(self, "async_loop_thread"): + try: + self.async_loop_thread.shutdown() + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Async loop thread shut down." + ) + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error shutting down async loop thread: {e}" + ) + + if not hasattr(self, "server_process") or self.server_process is None: + return True + + logger.info( + f"[SGLang Worker] Rank {self.global_rank} Shutting down server at {self.base_url}..." + ) + + if self.server_process.is_alive(): + kill_process_tree(self.server_process.pid) + + # Wait for the process to terminate + self.server_process.join(timeout=5.0) + + if self.server_process.is_alive(): + return False + return True + + except Exception as e: + logger.error( + f"[SGLang Worker] Rank {self.global_rank} Error during shutdown: {e}" + ) + return False + + def _make_request(self, endpoint: str, payload: Optional[dict] = None): + """Make a POST request to the specified endpoint with the given payload. + + Args: + endpoint: The API endpoint to call + payload: The JSON payload to send (default: empty dict) + + Returns: + The JSON response from the server + """ + # Use the stored base_url instead of constructing from server_args + url = f"{self.base_url}/{endpoint}" + headers = { + "Content-Type": "application/json; charset=utf-8", + } + response = requests.post(url, json=payload or {}, headers=headers, timeout=60) + response.raise_for_status() + return response.json() diff --git a/nemo_rl/models/generation/sglang/utils.py b/nemo_rl/models/generation/sglang/utils.py new file mode 100644 index 0000000000..7460302b5a --- /dev/null +++ b/nemo_rl/models/generation/sglang/utils.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import threading + + +class AsyncLoopThread: + """A background event loop thread for running async operations in Ray actors. + + This class creates a dedicated thread with its own event loop, allowing + synchronous Ray actor methods to execute async coroutines without blocking + the main actor thread. This is necessary because run_coroutine_threadsafe + requires the event loop to be in a different thread. 
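+
+    Example (illustrative usage; `some_coroutine` is a placeholder):
+        loop_thread = AsyncLoopThread()
+        result = loop_thread.run(some_coroutine())  # blocks until the coroutine finishes
+        loop_thread.shutdown()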
+ """ + + def __init__(self): + self.loop = asyncio.new_event_loop() + self._ready = threading.Event() + self._thread = threading.Thread(target=self._start_loop, daemon=True) + self._thread.start() + if not self._ready.wait(timeout=5.0): + raise RuntimeError("Event loop thread failed to start within 5 seconds") + + def _start_loop(self): + """Run the event loop in the background thread.""" + asyncio.set_event_loop(self.loop) + self._ready.set() + self.loop.run_forever() + + def run(self, coro): + """Schedule a coroutine onto the loop and block until it's done. + + Args: + coro: The coroutine to execute + + Returns: + The result of the coroutine + """ + if not self.loop.is_running(): + raise RuntimeError("Event loop is not running") + future = asyncio.run_coroutine_threadsafe(coro, self.loop) + result = future.result() + return result + + def shutdown(self): + """Shutdown the event loop and wait for the thread to finish.""" + if self.loop.is_running(): + self.loop.call_soon_threadsafe(self.loop.stop) + self._thread.join(timeout=2.0) + if not self.loop.is_closed(): + self.loop.close() diff --git a/nemo_rl/models/generation/vllm/vllm_generation.py b/nemo_rl/models/generation/vllm/vllm_generation.py index 93540ebe82..1366ce28c5 100644 --- a/nemo_rl/models/generation/vllm/vllm_generation.py +++ b/nemo_rl/models/generation/vllm/vllm_generation.py @@ -876,6 +876,14 @@ def clear_vllm_logger_metrics(self) -> None: ) ray.get(futures) + def clear_logger_metrics(self) -> None: + """Clear logger metrics for performance reporting.""" + self.clear_vllm_logger_metrics() + + def get_logger_metrics(self) -> dict[str, Any]: + """Get logger metrics for performance reporting.""" + return self.get_vllm_logger_metrics() + def __del__(self) -> None: """Shuts down the worker groups when the object is deleted or is garbage collected. diff --git a/nemo_rl/models/policy/interfaces.py b/nemo_rl/models/policy/interfaces.py index 144b0c517d..6e64c6289b 100644 --- a/nemo_rl/models/policy/interfaces.py +++ b/nemo_rl/models/policy/interfaces.py @@ -182,6 +182,18 @@ def stream_weights_via_ipc_zmq( ) -> list[ray.ObjectRef]: pass + def stream_weights_via_http( + self, sglang_url_to_gpu_uuids: dict[str, list[str]] + ) -> list[ray.ObjectRef]: + """Stream model weights to SGLang servers via HTTP API. + + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + raise NotImplementedError( + "stream_weights_via_http is not implemented for this policy worker" + ) + @abstractmethod def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 144683c95c..1f908824fe 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -768,6 +768,20 @@ def stream_weights_via_ipc_zmq( ) return futures + def stream_weights_via_http( + self, sglang_url_to_gpu_uuids: dict[str, list[str]] + ) -> list[ray.ObjectRef]: + """Send the weights to SGLang servers via HTTP API. 
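+
+        Dispatches the call to every policy worker and returns the resulting Ray futures.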
+ + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + futures = self.worker_group.run_all_workers_single_data( + "stream_weights_via_http", + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + ) + return futures + def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None ) -> list[ray.ObjectRef]: diff --git a/nemo_rl/models/policy/utils.py b/nemo_rl/models/policy/utils.py index 7ad33708a2..ad79f1a1d8 100644 --- a/nemo_rl/models/policy/utils.py +++ b/nemo_rl/models/policy/utils.py @@ -16,9 +16,11 @@ import os import traceback from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, cast +import requests import torch +import torch.distributed as dist import zmq from torch.multiprocessing.reductions import rebuild_cuda_tensor from transformers import ( @@ -473,3 +475,268 @@ def rebuild_cuda_tensor_from_ipc( list_args = list(args) list_args[6] = device_id return func(*list_args) + + +def stream_weights_via_http_impl( + params_generator, + sglang_url_to_gpu_uuids: dict[str, list[str]], + rank: int, + worker_name: str, + current_device_uuid: str, +) -> None: + """Stream weights to SGLang servers via HTTP API (update_weights_from_tensor). + + Flow: Each rank creates IPC handler → gather handlers in rank order → send list → SGLang matches by tp_rank index + + Key points: + - Each rank creates handler on its own GPU + - Handlers are gathered in rank order: [rank0_handler, rank1_handler, ...] + - List index = rank = GPU ID + - SGLang automatically matches: handler = serialized_handlers[tp_rank] + + Args: + params_generator: Generator yielding (name, tensor) pairs + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + rank: Worker rank for logging + worker_name: Name of the worker for logging + current_device_uuid: UUID of the current training worker's GPU + """ + from sglang.srt.utils import MultiprocessingSerializer # type: ignore[import-error] + + try: + from sglang.srt.utils.patch_torch import ( + monkey_patch_torch_reductions, # type: ignore[import-error] + ) + except ImportError: + from sglang.srt.patch_torch import ( + monkey_patch_torch_reductions, # type: ignore[import-error] + ) + print("[sglang refit details] entering stream_weights_via_http_impl") + + monkey_patch_torch_reductions() + + target_urls = [ + url + for url, uuids in sglang_url_to_gpu_uuids.items() + if current_device_uuid in uuids + ] + + if not target_urls: + raise RuntimeError( + f"{worker_name} (rank {rank}): No matching SGLang server found for GPU UUID {current_device_uuid}. " + f"Available servers: {list(sglang_url_to_gpu_uuids.keys())}" + ) + + if len(target_urls) > 1: + print( + f"[WARNING] {worker_name} (rank {rank}): GPU UUID {current_device_uuid} matches multiple SGLang servers: {target_urls}. 
" + f"Using the first one: {target_urls[0]}" + ) + target_urls = [target_urls[0]] + + base_url = target_urls[0] + url = f"{base_url}/update_weights_from_tensor" + sglang_gpu_uuids = sglang_url_to_gpu_uuids[base_url] + + ipc_gather_group, ipc_gather_src, matching_ranks = _setup_ipc_gather_group( + rank, current_device_uuid, sglang_gpu_uuids, sglang_url_to_gpu_uuids + ) + print( + f"[sglang refit] {worker_name} (rank {rank}): ipc_gather_group={ipc_gather_group}, ipc_gather_src={ipc_gather_src}, matching_ranks={matching_ranks}" + ) + tensor_count = 0 + + try: + tensor_list = list(params_generator) + total_tensors = len(tensor_list) + + if rank == ipc_gather_src: + print( + f"[sglang refit details] {worker_name}: Starting weight update - " + f"Total parameters to update: {total_tensors}", + flush=True, + ) + + for idx, (name, tensor) in enumerate(tensor_list): + torch.cuda.current_stream().synchronize() + tensor = tensor.contiguous().cuda() + + named_tensors = [(name, tensor)] + serialized_handler = MultiprocessingSerializer.serialize( + named_tensors, output_str=True + ) + # output_str=True ensures the return type is str + serialized_handler_str = cast(str, serialized_handler) + + gathered_handlers = _gather_ipc_handlers( + serialized_handler_str, + ipc_gather_group, + ipc_gather_src, + rank, + matching_ranks, + ) + + if rank == ipc_gather_src and gathered_handlers is not None: + _send_tensor_to_sglang( + url, + name, + gathered_handlers, + tensor.shape, + str(tensor.dtype), + flush_cache=False, + ) + tensor_count += 1 + + del tensor, serialized_handler + if rank == ipc_gather_src: + del gathered_handlers + torch.cuda.empty_cache() + + if rank == ipc_gather_src: + print( + f"[sglang refit details] {worker_name}: Weight update completed - " + f"Successfully updated {tensor_count}/{total_tensors} parameters to SGLang server: {base_url}", + flush=True, + ) + if tensor_count != total_tensors: + print( + f"[sglang refit details] {worker_name}: WARNING - Expected {total_tensors} tensors, " + f"but only sent {tensor_count}", + flush=True, + ) + + except Exception as e: + print( + f"{worker_name} (rank {rank}): Error during HTTP weight streaming: {e}.\n" + f"{traceback.format_exc()}" + ) + raise + + finally: + gc.collect() + torch.cuda.empty_cache() + + +def _setup_ipc_gather_group( + rank: int, + current_device_uuid: str, + sglang_gpu_uuids: list[str], + sglang_url_to_gpu_uuids: dict[str, list[str]], +) -> tuple[Optional[dist.ProcessGroup], Optional[int], Optional[list[int]]]: + """Setup gather configuration for IPC handlers. 
+ + Returns: + Tuple of (gather_group, gather_src_rank, matching_ranks) + - gather_group: None (use default FSDP group) + - gather_src_rank: The rank that will collect and send to SGLang server + - matching_ranks: List of ranks that belong to the same SGLang server + """ + if not dist.is_initialized(): + return None, None, None + + world_size = dist.get_world_size() + my_rank = dist.get_rank() + + all_ranks_uuids = [None] * world_size + dist.all_gather_object(all_ranks_uuids, current_device_uuid) + + matching_ranks = [ + r for r, uuid in enumerate(all_ranks_uuids) if uuid in sglang_gpu_uuids + ] + + if len(matching_ranks) == 0: + return None, None, None + + matching_ranks = sorted(matching_ranks) + gather_src = matching_ranks[0] + + return None, gather_src, matching_ranks + + +def _gather_ipc_handlers( + serialized_handler: str, + gather_group: Optional[dist.ProcessGroup], + gather_src: Optional[int], + rank: int, + matching_ranks: Optional[list[int]] = None, +) -> Optional[list[str]]: + """Gather IPC handlers from all ranks in the default FSDP group, then filter by server. + + Args: + serialized_handler: Serialized IPC handler from this rank + gather_group: Process group (None means use default FSDP group) + gather_src: Rank that will collect and filter handlers + rank: Current rank + matching_ranks: List of ranks that belong to the same SGLang server + + Returns: + List of serialized handlers in rank order (only on gather_src rank), None otherwise + The list contains handlers from matching_ranks only, in rank order + """ + if gather_src is None: + return None + + if not dist.is_initialized(): + return None + + world_size = dist.get_world_size() + + all_handlers: list[Optional[str]] = [None for _ in range(world_size)] + dist.all_gather_object(all_handlers, serialized_handler) + all_handlers_str = cast(list[str], all_handlers) + + if rank == gather_src and matching_ranks is not None: + filtered_handlers: list[str] = [all_handlers_str[r] for r in matching_ranks] + return filtered_handlers + else: + return None + + +def _send_tensor_to_sglang( + url: str, + tensor_name: str, + gathered_handlers: list[str], + shape: torch.Size, + dtype: str, + flush_cache: bool = False, +) -> None: + """Send gathered IPC handlers to SGLang server via HTTP. + + Key: gathered_handlers are in rank order [rank0, rank1, ...] 
+ SGLang will automatically match: handler = serialized_handlers[tp_rank] + + Args: + url: SGLang server URL + tensor_name: Name of the tensor + gathered_handlers: List of serialized IPC handlers in rank order + shape: Tensor shape + dtype: Tensor dtype + flush_cache: Whether to flush cache after this tensor (for last tensor) + """ + payload = { + "serialized_named_tensors": gathered_handlers, + "flush_cache": flush_cache, + } + + try: + response = requests.post( + url, + json=payload, + headers={"Content-Type": "application/json"}, + timeout=120, + ) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + error_msg = f"Failed to send tensor '{tensor_name}' to {url}: {e}" + try: + error_detail = response.text + error_msg += f"\nResponse status: {response.status_code}" + error_msg += f"\nResponse body: {error_detail[:500]}" + except: + pass + print(f"[sglang refit] {error_msg}", flush=True) + raise RuntimeError(error_msg) from e + except Exception as e: + raise RuntimeError( + f"Failed to send tensor '{tensor_name}' to {url}: {e}" + ) from e diff --git a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py index 785568cc76..76613dfb8a 100644 --- a/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py @@ -1771,6 +1771,53 @@ def dtensor_params_generator(): worker_name=str(self), ) + @torch.no_grad() + @wrap_with_nvtx_name("dtensor_policy_worker_v2/stream_weights_via_http") + def stream_weights_via_http( + self, + sglang_url_to_gpu_uuids: dict[str, list[str]], + ) -> None: + """Stream model weights to SGLang servers via HTTP API. + + Args: + sglang_url_to_gpu_uuids: Dict mapping SGLang server URL to list of GPU UUIDs it uses + """ + # Manually move model to cuda for cpu offload case + if self.cpu_offload: + self.model = self.move_to_cuda(self.model) + + from nemo_rl.models.policy.utils import stream_weights_via_http_impl + + # Get current GPU UUID + current_device_uuid = self.report_device_id() + + def dtensor_params_generator(): + """Generator that yields (name, tensor) pairs, converting DTensors to local tensors.""" + state_dict_items = sorted( + self.model.state_dict().items(), key=lambda x: x[0] + ) + for name, tensor in state_dict_items: + if isinstance(tensor, DTensor): + # Convert DTensor to full tensor for streaming + full_tensor = tensor.full_tensor() + # Convert to target dtype + yield ( + name, + full_tensor.to(self.dtype, non_blocking=True).contiguous(), + ) + else: + # Convert to target dtype + yield name, tensor.to(self.dtype, non_blocking=True).contiguous() + + # Use the HTTP implementation + stream_weights_via_http_impl( + params_generator=dtensor_params_generator(), + sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids, + rank=self.rank, + worker_name=str(self), + current_device_uuid=current_device_uuid, + ) + @torch.no_grad() def broadcast_weights_for_collective( self, kv_scales: Optional[dict[str, float]] = None diff --git a/pyproject.toml b/pyproject.toml index 19916dbf6c..80c3286e7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,11 @@ dependencies = [ "setuptools", "pip", # Required for frozen environments; uv venv --seed may not reliably install pip "ninja", # for flash-attn parallel build - "torch==2.9.0", + "torch==2.8.0", "triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", "colored==2.2.3", "ray[default]==2.49.2", - "transformers==4.57.1", + 
"transformers>=4.55.4", "wandb", "numpy", "datasets>=4.0.0", @@ -49,6 +49,7 @@ dependencies = [ "nvidia-nvshmem-cu12; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # for deep_ep build "swanlab", "pyzmq", + "coverage>=7.10.4", ] [project.optional-dependencies] @@ -58,13 +59,10 @@ automodel = [ # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 - "vllm==0.11.2", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved + "vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved "flash-attn==2.8.1", "mamba-ssm", "causal-conv1d", - "nv-grouped-gemm", - "transformer-engine[pytorch]==2.8.0", - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", ] vllm = [ "cuda-python", @@ -72,8 +70,8 @@ vllm = [ # deep_ep also needs libibverbs-dev # sudo apt-get update # sudo apt-get install libibverbs-dev - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", - "vllm==0.11.2", + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@e3908bf5bd0cc6265bcb225d15cd8c996d4759ef", + "vllm==0.11.0", "num2words>=0.5.14", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved "flash-attn==2.8.1", @@ -82,6 +80,26 @@ vllm = [ # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved "causal-conv1d", ] +sglang = [ + "sglang>=0.4.1", + "pybase64", + "orjson", + "uvloop", + "requests", + "openai", + "partial-json-parser", + "sentencepiece", + "sgl-kernel==0.3.17.post1", + "compressed-tensors", + "msgspec", + "python-multipart", + "torchao", + "xgrammar", + "interegular", + "openai-harmony", + "torch-memory-saver", + "einops", +] mcore = [ # also need cudnn (https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_network) # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb @@ -96,7 +114,7 @@ mcore = [ "megatron-core", "megatron-bridge", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "vllm==0.11.2", + "vllm==0.11.0", # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 @@ -109,7 +127,7 @@ nemo_gym = ["nemo_gym"] # This is a default group so that we install these even with bare `uv sync` build = [ # Build requirement for TE - "torch==2.9.0", + "torch==2.8.0", # Build requirement for TE "setuptools", "packaging", @@ -170,7 +188,6 @@ triton = [ ] causal-conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d", tag = "v1.5.0.post8" } mamba-ssm = { git = "https://github.com/state-spaces/mamba.git", rev = "2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" } -nv-grouped-gemm = { git = "https://github.com/fanshiqing/grouped_gemm", tag = "v1.1.4.post7" } [tool.uv.workspace] members = [ @@ -179,7 +196,7 @@ members = [ "3rdparty/Megatron-Bridge-workspace", 
"3rdparty/Gym-workspace", # Research projects are also added here in order for them to share the global root level uv.lock. - # If we don't do this, the research projects do not see the global uv.lock, and may mistakenly + # If we don't do this, the research projects do not see the global uv.lock, and may mistakenly # install numpy>=2.0 because nemo-rl's core [dependencies] do not pin numpy, but when you inspect # nemo-rl's uv.lock you'll see it's 1.X b/c megatron mandates 1.X in the optional dependencies, so # globally we must choose 1.X otherwise we run into pickle issues from ray. @@ -219,11 +236,12 @@ default-groups = ["dev", "build"] link-mode = "copy" # The TE override is needed because automodel/mbridge we are on is still on 2.5.0 # The opencv-python-headless override is needed because automodel pins it to 4.10.0.84, whereas vllm>=0.11.0 needs >= 4.11.0 +# The transformers override is needed since automodel is still on <=4.55.4 # The timm override is needed because current automodel pins to 1.0.16. This can be removed once we move ToT automodel -# The nvidia-modelopt override is needed because mcore is still on 0.33 override-dependencies = [ "transformer-engine[pytorch]==2.8.0", "opencv-python-headless>=4.11.0", + "transformers>=4.57.1", "timm<=1.0.22", "nvidia-modelopt[torch]>=0.39.0", ] @@ -267,7 +285,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[tool.uv.dependency-metadata]] name = "deep_ep" # This version has to match the version in the commit/rev/tag used -version = "v1.2.1+bfded34" +version = "v1.1.0+e3908bf" requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] @@ -279,7 +297,7 @@ requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] name = "nv-grouped-gemm" # This version has to match the version in the commit/rev/tag used -version = "v1.1.4.post7" +version = "1.1.4.post6" requires-dist = ["setuptools", "wheel", "torch", "numpy"] [tool.black] @@ -303,6 +321,7 @@ markers = [ "hf_gated: marks tests that require HuggingFace token access for gated models", "automodel: marks tests that require the automodel extra", "vllm: marks tests that require the vllm extra", + "sglang: marks tests that require the sglang extra", ] [tool.pyrefly] diff --git a/pyrefly.toml b/pyrefly.toml index 74f0f29ed9..e4476c03ea 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -103,6 +103,8 @@ project-includes = [ "nemo_rl/models/generation/vllm/config.py", "nemo_rl/models/generation/vllm/utils.py", "nemo_rl/models/generation/vllm/vllm_backend.py", + "nemo_rl/models/generation/sglang/__init__.py", + "nemo_rl/models/generation/sglang/config.py", "nemo_rl/models/huggingface/__init__.py", "nemo_rl/models/megatron/__init__.py", "nemo_rl/models/megatron/community_import.py", diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh index ec7527f583..095a01c447 100644 --- a/tests/functional/L1_Functional_Tests_GPU.sh +++ b/tests/functional/L1_Functional_Tests_GPU.sh @@ -31,6 +31,7 @@ time uv run --no-sync bash ./tests/functional/grpo_megatron.sh time uv run --no-sync bash ./tests/functional/grpo_megatron_generation.sh time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh time uv run --no-sync bash ./tests/functional/grpo_non_colocated.sh +time uv run --no-sync bash ./tests/functional/grpo_sglang.sh time uv run --no-sync bash ./tests/functional/dpo.sh time uv run --no-sync bash ./tests/functional/rm.sh time uv run --no-sync bash ./tests/functional/eval.sh diff --git 
a/tests/functional/grpo_sglang.sh b/tests/functional/grpo_sglang.sh new file mode 100755 index 0000000000..8e7d7608bd --- /dev/null +++ b/tests/functional/grpo_sglang.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) +# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT +uv run --extra sglang coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \ + $PROJECT_ROOT/examples/run_grpo_math.py \ + --config $PROJECT_ROOT/examples/configs/grpo_math_1B_sglang.yaml \ + policy.model_name=Qwen/Qwen3-0.6B \ + grpo.num_prompts_per_step=2 \ + grpo.num_generations_per_prompt=4 \ + policy.train_global_batch_size=4 \ + policy.train_micro_batch_size=1 \ + cluster.gpus_per_node=1 \ + policy.generation.sglang_cfg.gpus_per_server=1 \ + grpo.max_num_steps=2 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + $@ \ + 2>&1 | tee $RUN_LOG + +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +uv run tests/check_metrics.py $JSON_METRICS \ + 'max(data["train/token_mult_prob_error"]) < 1.05' + diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh new file mode 100755 index 0000000000..47fd7eb186 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh @@ -0,0 +1,43 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +MAX_STEPS=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +# Using the same metrics thresholds as the vllm version to verify alignment +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["450"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 25' +fi + + diff --git a/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh b/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh new file mode 100755 index 0000000000..69c35eb54c --- /dev/null +++ 
b/tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' \ + 'mean(data["timing/train/total_step_time"], 2) < 30' +fi + diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index ee1fda01b1..24d16d1c62 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -7,6 +7,10 @@ tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.sh tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.sh tests/test_suites/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.sh +# SGLang backend +tests/test_suites/llm/grpo-qwen3-0.6b-1n8g-sglang.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1-sglang.sh + # Dtensor (Qwen/Qwen2.5-7B-Instruct) tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4.v3.sh diff --git a/tests/unit/L0_Unit_Tests_Generation.sh b/tests/unit/L0_Unit_Tests_Generation.sh index e7b7a6e2ca..d30e051c66 100644 --- a/tests/unit/L0_Unit_Tests_Generation.sh +++ b/tests/unit/L0_Unit_Tests_Generation.sh @@ -45,3 +45,11 @@ if [[ $exit_code -eq 5 ]]; then else uv run --extra vllm bash -x ./tests/run_unit.sh unit/models/generation/ --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only fi + +# Check and run sglang tests +exit_code=$(uv run --extra sglang pytest tests/unit/models/generation/ --collect-only --hf-gated --sglang-only -q >/dev/null 2>&1; echo $?) 
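+# pytest exits with code 5 when no tests are collected; in that case there is nothing to run for sglang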
+if [[ $exit_code -eq 5 ]]; then + echo "No sglang tests to run" +else + uv run --extra sglang bash -x ./tests/run_unit.sh unit/models/generation/ --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only +fi diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ab3368185c..ebc5569f86 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -57,6 +57,12 @@ def pytest_addoption(parser): default=False, help="Run ONLY vllm tests", ) + parser.addoption( + "--sglang-only", + action="store_true", + default=False, + help="Run ONLY sglang tests", + ) def pytest_collection_modifyitems(config, items): @@ -65,12 +71,18 @@ def pytest_collection_modifyitems(config, items): run_mcore_only = config.getoption("--mcore-only") run_automodel_only = config.getoption("--automodel-only") run_vllm_only = config.getoption("--vllm-only") + run_sglang_only = config.getoption("--sglang-only") # Check for mutually exclusive options - exclusive_options = [run_mcore_only, run_automodel_only, run_vllm_only] + exclusive_options = [ + run_mcore_only, + run_automodel_only, + run_vllm_only, + run_sglang_only, + ] if sum(exclusive_options) > 1: raise ValueError( - "--mcore-only, --automodel-only, and --vllm-only are mutually exclusive" + "--mcore-only, --automodel-only, --vllm-only, and --sglang-only are mutually exclusive" ) marker_expr = config.getoption("-m", default="") @@ -140,6 +152,24 @@ def pytest_collection_modifyitems(config, items): # Exclude vllm tests by default new_items = [item for item in new_items if not item.get_closest_marker("vllm")] + # Filter by sglang marker + if run_sglang_only: + # Validate that sglang is available + try: + import sglang # noqa: F401 + except ImportError: + raise ImportError( + "Cannot run sglang tests: sglang is not available.\n" + "Please run tests with: uv run --extra sglang --group test pytest ..." + ) + # Include only sglang tests + new_items = [item for item in new_items if item.get_closest_marker("sglang")] + else: + # Exclude sglang tests by default + new_items = [ + item for item in new_items if not item.get_closest_marker("sglang") + ] + # Ensure run_first tests are prioritized new_items.sort(key=lambda item: 0 if item.get_closest_marker("run_first") else 1) diff --git a/tests/unit/models/generation/test_sglang_generation.py b/tests/unit/models/generation/test_sglang_generation.py new file mode 100644 index 0000000000..299bd8e3d6 --- /dev/null +++ b/tests/unit/models/generation/test_sglang_generation.py @@ -0,0 +1,927 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for SGLang generation backend. + +These tests verify that the SGLang generation backend produces sane outputs. +While not true unit tests, they validate the generation quality in unit test runs. 
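+
+These tests carry the `sglang` pytest marker and are expected to run with the sglang
+extra installed, for example:
+    uv run --extra sglang --group test pytest tests/unit/models/generation/ --sglang-only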
+""" + +import gc +from copy import deepcopy + +import pytest +import ray +import torch + +from nemo_rl.algorithms.utils import get_tokenizer +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.generation.sglang import SGLangConfig, SGLangGeneration + +model_name = "Qwen/Qwen3-0.6B" + +# Define basic SGLang test config +basic_sglang_test_config: SGLangConfig = { + "backend": "sglang", + "model_name": model_name, + "model_path": model_name, + "tokenizer": { + "name": model_name, + }, + "dtype": "bfloat16", + "max_new_tokens": 5, # Small number of tokens for testing + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "sglang_cfg": { + "model_path": model_name, + "gpus_per_server": 2, + "dtype": "bfloat16", + "context_length": 1024, + "log_level": "warning", + "skip_server_warmup": True, + "enable_memory_saver": False, + "dp_size": 1, + "pp_size": 1, + "ep_size": 1, + "mem_fraction_static": 0.7, + }, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, + }, + "sglang_kwargs": {}, +} + +# Basic DTensor test config for Policy tests +basic_dtensor_test_config = { + "model_name": model_name, + "tokenizer": { + "name": model_name, + }, + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "offload_optimizer_for_logprob": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": { + "_v2": True, # Use DTensorPolicyWorkerV2 for stream_weights_via_http + "enabled": True, + "cpu_offload": False, + "sequence_parallel": False, + "activation_checkpointing": False, + "tensor_parallel_size": 2, + "context_parallel_size": 1, + "custom_parallel_plan": None, + }, + "dynamic_batching": { + "enabled": True, + "train_mb_tokens": 40, + "logprob_mb_tokens": 40, + "sequence_length_round": 4, + }, + "sequence_packing": { + "enabled": False, + }, + "max_grad_norm": 1.0, + "make_sequence_length_divisible_by": 1, + "generation": deepcopy(basic_sglang_test_config), +} + + +def configure_sglang_config( + config: SGLangConfig, tokenizer, is_eval=True +) -> SGLangConfig: + """Apply specific configurations to SGLang config.""" + config = deepcopy(config) + config["_pad_token_id"] = tokenizer.pad_token_id + if config["stop_token_ids"] is None: + config["stop_token_ids"] = [tokenizer.eos_token_id] + return config + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing with 2 GPUs.""" + virtual_cluster = RayVirtualCluster( + bundle_ct_per_node_list=[2], + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=2, + name="sglang-test-cluster", + ) + yield virtual_cluster + virtual_cluster.shutdown() + + +@pytest.fixture(scope="function") +def tokenizer(): + """Initialize tokenizer for the test model.""" + tokenizer = get_tokenizer(basic_sglang_test_config["tokenizer"]) + return tokenizer + + +@pytest.fixture(scope="function") +def policy(cluster, tokenizer): + """Initialize the SGLang policy.""" + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + p = SGLangGeneration(cluster, sglang_config) + yield p + try: + p.shutdown() + gc.collect() + 
torch.cuda.empty_cache() + except Exception as e: + print(f"Error during policy cleanup: {e}") + + +@pytest.fixture(scope="function") +def test_input_data(tokenizer): + """Create test input data for inference.""" + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + # Tokenize prompts + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + # Calculate input lengths from attention mask + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + # Create input data dictionary + return BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + +@pytest.fixture(scope="function") +def policy_cluster_separate(): + """Create a virtual cluster for the Policy, using 2 GPUs.""" + cluster = RayVirtualCluster( + bundle_ct_per_node_list=[2], + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=2, + name="sglang-test-policy-cluster-separate", + ) + yield cluster + try: + cluster.shutdown() + except Exception as e: + print(f"Error during policy_cluster_separate shutdown: {e}") + + +def get_generation_cluster_separate(num_gpus_per_node: int = 2) -> RayVirtualCluster: + """Create a virtual cluster for the SGLangGeneration policy.""" + return RayVirtualCluster( + bundle_ct_per_node_list=[num_gpus_per_node], + use_gpus=True, + max_colocated_worker_groups=1, + num_gpus_per_node=num_gpus_per_node, + name="sglang-test-generation-cluster-separate", + ) + + +# ============================================================================= +# Basic Configuration Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(120) +def test_sglang_missing_required_config_key(cluster, tokenizer): + """Test that an error is raised when a required config key is missing.""" + # SGLang requires sglang_cfg to be present + incomplete_config = deepcopy(basic_sglang_test_config) + incomplete_config = configure_sglang_config(incomplete_config, tokenizer) + del incomplete_config["sglang_cfg"] + + with pytest.raises((KeyError, ValueError, AssertionError, TypeError)): + SGLangGeneration(cluster, incomplete_config) + + +@pytest.mark.sglang +def test_sglang_top_p_top_k_validation(cluster, tokenizer): + """Test that top_p and top_k values are accepted by SGLang. + + Note: SGLang may have different validation thresholds than vLLM. + This test verifies that reasonable sampling parameters are accepted. 
+ """ + # Test that reasonable top_p and top_k values are accepted + config = deepcopy(basic_sglang_test_config) + config["top_p"] = 0.95 + config["top_k"] = 50 + config = configure_sglang_config(config, tokenizer) + + policy = None + try: + policy = SGLangGeneration(cluster, config) + print("Successfully initialized with top_p=0.95 and top_k=50") + except Exception as e: + pytest.fail(f"Should not raise error with reasonable sampling params: {e}") + finally: + if policy: + policy.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +# ============================================================================= +# Basic Generation Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_policy_generation(policy, test_input_data, tokenizer): + """Test SGLang policy generation capabilities.""" + print("Testing SGLang generation...") + outputs = policy.generate(test_input_data) + + # Validate outputs format + assert "output_ids" in outputs, "output_ids not found in generation output" + assert "logprobs" in outputs, "logprobs not found in generation output" + assert "generation_lengths" in outputs, ( + "generation_lengths not found in generation output" + ) + assert "unpadded_sequence_lengths" in outputs, ( + "unpadded_sequence_lengths not found in generation output" + ) + + # Validate outputs shape and content + assert outputs["output_ids"].shape[0] == len(test_input_data["input_ids"]), ( + "Wrong batch size in output" + ) + assert outputs["generation_lengths"].shape[0] == len( + test_input_data["input_ids"] + ), "Wrong batch size in generation_lengths" + + # Decode and check outputs + generated_sequences = outputs["output_ids"] + generated_texts = tokenizer.batch_decode( + generated_sequences, skip_special_tokens=True + ) + + print(f"Generated texts: {generated_texts}") + + # All texts should have a non-zero length + assert all(len(text) > 0 for text in generated_texts), ( + "Some generated texts are empty" + ) + + +@pytest.mark.sglang +def test_sglang_worker_seed_behavior(cluster, tokenizer): + """ + Test that different workers generate different outputs for identical prompts due to different seeds. + This ensures proper randomization across distributed workers for diverse exploration in RLHF. + + Key: Use gpus_per_server=1 to create 2 independent SGLang servers (each with its own seed), + rather than 1 server with TP=2. 
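+
+    Illustrative expectation (assuming the duplicated prompts are sharded evenly across
+    the two servers): the first and second halves of the batch are served by different
+    servers with different seeds, so their sampled continuations should differ.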
+ """ + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + unique_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + # Create a batch where each prompt appears twice + # When sharded, different workers will get the same prompt + duplicated_prompts = unique_prompts + unique_prompts + + # Tokenize prompts + encodings = tokenizer( + duplicated_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + # Create input data dictionary + duplicated_batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Test with gpus_per_server=1 to create 2 independent servers with different seeds + print("Creating SGLang policy with gpus_per_server=1 (2 independent servers)...") + sglang_config = deepcopy(basic_sglang_test_config) + # Use gpus_per_server=1 to create 2 independent SGLang servers + sglang_config["sglang_cfg"]["gpus_per_server"] = 1 + sglang_config = configure_sglang_config(sglang_config, tokenizer) + + policy = SGLangGeneration(cluster, sglang_config) + policy.finish_generation() + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["dtensor_cfg"]["tensor_parallel_size"] = 1 # Match gpus_per_server + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + state_dict_info = lm_policy.prepare_refit_info() + policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation(lm_policy, policy, sglang_config["colocated"]["enabled"]) + + try: + # Generate with duplicated prompts + print("Running generation with duplicated prompts...") + outputs = policy.generate(duplicated_batch, greedy=False) + + # Decode the generated sequences + gen_texts = tokenizer.batch_decode( + outputs["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts with duplicated prompts: {gen_texts}") + + # Check if the duplicated prompts generated different texts + # The first half and second half should be different due to different worker seeds + first_half = gen_texts[: len(unique_prompts)] + second_half = gen_texts[len(unique_prompts) :] + + print(f"First worker outputs: {first_half}") + print(f"Second worker outputs: {second_half}") + + # At least one of the pairs should be different due to different seeds + assert first_half != second_half, ( + "Different workers should generate different outputs for identical prompts due to different seeds" + ) + + finally: + # Clean up resources + if "policy" in locals() and hasattr(policy, "shutdown"): + policy.shutdown() + if "lm_policy" in locals() and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + + # Force garbage collection + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +def test_sglang_policy_tensor_parallel(cluster, tokenizer): + """Test SGLang policy with tensor parallelism > 1 (gpus_per_server=2).""" + # Configure with gpus_per_server=2 for tensor parallelism + tp_config = deepcopy(basic_sglang_test_config) + tp_config = configure_sglang_config(tp_config, tokenizer) + tp_config["sglang_cfg"]["gpus_per_server"] = 2 # TP=2 + + sglang_policy = None + try: + sglang_policy = SGLangGeneration(cluster, tp_config) + + # Create simple test input + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=10, + 
truncation=True, + return_tensors="pt", + padding_side="right", + ) + + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + # Test generation with tensor parallelism + outputs = sglang_policy.generate(test_input_data) + + sglang_policy.finish_generation() + sglang_policy.prepare_for_generation() + + # Test generation again after cache reset + outputs = sglang_policy.generate(test_input_data) + + assert "output_ids" in outputs, "output_ids not found in generation output" + assert outputs["output_ids"].shape[0] == 2, "Wrong batch size in output" + + # Decode and check output + generated_text = tokenizer.decode( + outputs["output_ids"][0], skip_special_tokens=True + ) + print(f"Generated text with TP=2: {generated_text}") + assert len(generated_text) > 0, "Generated text is empty" + + finally: + # Clean up resources + if sglang_policy: + sglang_policy.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +def test_sglang_generate_text(cluster, tokenizer): + """Test that SGLang can generate coherent text. + + Note: SGLang doesn't have a generate_text method like vLLM, + so we use generate + tokenizer decode to verify text generation. + """ + # Prepare test data + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=10, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + # Create SGLang config with gpus_per_server=2 (using tensor parallelism) + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config["sglang_cfg"]["gpus_per_server"] = 2 + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + # Ensure correct model + assert sglang_config["model_name"] == "Qwen/Qwen3-0.6B", ( + "Model name should be Qwen/Qwen3-0.6B to get expected output" + ) + + sglang_generation = None + try: + # Create SGLang generation + sglang_generation = SGLangGeneration(cluster, sglang_config) + + # Generate with greedy decoding for deterministic output + output = sglang_generation.generate(test_input_data, greedy=True) + + # Decode generated text + generated_texts = tokenizer.batch_decode( + output["output_ids"], skip_special_tokens=True + ) + + print(f"Generated texts: {generated_texts}") + + # Verify we got non-empty text for each prompt + for i, text in enumerate(generated_texts): + assert len(text) > len(test_prompts[i]), ( + f"Generated text should be longer than input prompt: {text}" + ) + # Verify the generated text starts with or contains the prompt + print(f"Prompt: {test_prompts[i]} -> Generated: {text}") + + finally: + # Clean up + if sglang_generation: + sglang_generation.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +def _wait_for_sglang_http_server_spinup(base_url: str): + """Wait for the SGLang HTTP server to be ready.""" + import time + + import requests + + max_wait = 60 # 60 seconds max wait + start = time.time() + while time.time() - start < max_wait: + try: + response = requests.get(f"{base_url}/health_generate", timeout=5) + if response.status_code == 200: + return + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + pass + time.sleep(1) + raise TimeoutError(f"SGLang server at {base_url} did not start within 
{max_wait}s") + + +@pytest.mark.sglang +def test_sglang_http_server(cluster, tokenizer): + """Test that SGLang HTTP server works with direct API calls. + + SGLang exposes a /generate endpoint that accepts input_ids and sampling_params. + This test verifies we can make direct HTTP requests to the SGLang server. + """ + import requests + + # Create SGLang config + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + # Ensure correct model for reproducible output + assert sglang_config["model_name"] == "Qwen/Qwen3-0.6B", ( + "Model name should be Qwen/Qwen3-0.6B to get expected output" + ) + + sglang_generation = None + try: + # Create SGLang generation (this starts the servers) + sglang_generation = SGLangGeneration(cluster, sglang_config) + + # Get server URLs + base_urls = sglang_generation.get_sglang_server_urls() + print(f"SGLang server URLs: {base_urls}") + assert len(base_urls) >= 1, "Should have at least one SGLang server" + + # Wait for server to be ready + _wait_for_sglang_http_server_spinup(base_urls[0]) + + # Prepare input - tokenize "count to 5" + test_prompt = "count to 5" + input_ids = tokenizer.encode(test_prompt, add_special_tokens=True) + + # Build request payload for SGLang /generate endpoint + payload = { + "input_ids": input_ids, + "sampling_params": { + "temperature": 0.0, # Greedy for determinism + "top_p": 1.0, + "max_new_tokens": 5, + }, + "return_logprob": True, + } + + # Make request to SGLang server + response = requests.post( + url=f"{base_urls[0]}/generate", + json=payload, + headers={"Content-Type": "application/json"}, + timeout=30, + ) + actual_result = response.json() + print(f"SGLang response: {actual_result}") + + # Verify response structure + assert response.status_code == 200, f"Expected 200, got {response.status_code}" + assert "meta_info" in actual_result, "Response should contain meta_info" + + meta_info = actual_result["meta_info"] + assert "output_token_logprobs" in meta_info, ( + "meta_info should contain output_token_logprobs" + ) + + # Verify we got some generated tokens + output_token_logprobs = meta_info["output_token_logprobs"] + assert len(output_token_logprobs) > 0, ( + "Should have generated at least one token" + ) + + # Each entry should be [logprob, token_id] + first_token_info = output_token_logprobs[0] + assert len(first_token_info) >= 2, ( + "Each token info should have logprob and token_id" + ) + + logprob = first_token_info[0] + token_id = first_token_info[1] + assert isinstance(logprob, float), "Logprob should be a float" + assert isinstance(token_id, int), "Token ID should be an int" + + print(f"First generated token: id={token_id}, logprob={logprob}") + + # Decode the generated tokens to verify text output + generated_token_ids = [item[1] for item in output_token_logprobs] + generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True) + print(f"Generated text: {generated_text}") + + finally: + # Clean up + if sglang_generation: + sglang_generation.shutdown() + gc.collect() + torch.cuda.empty_cache() + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_non_divisible_batch_handling(policy): + """Test that SGLang generation handles non divisible input batches correctly.""" + empty_batch = BatchedDataDict( + { + "input_ids": torch.zeros((1, 1), dtype=torch.long), + "input_lengths": torch.ones(1, dtype=torch.long), + } + ) + + outputs = policy.generate(empty_batch) + + required_keys = [ + "output_ids", + "logprobs", + 
"generation_lengths", + "unpadded_sequence_lengths", + ] + assert all(key in outputs for key in required_keys), ( + "Missing required output fields" + ) + assert all(outputs[key].shape[0] == 1 for key in required_keys), ( + "Output tensors should have batch dimension of 1" + ) + + +# ============================================================================= +# Policy Integration Tests +# ============================================================================= + + +@pytest.mark.sglang +@pytest.mark.timeout(300) +def test_sglang_generation_with_hf_training_colocated(cluster, tokenizer): + """Test that DTensor policy can work together with colocated SGLang policy.""" + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["train_global_batch_size"] = 4 + dtensor_config["dtensor_cfg"]["_v2"] = ( + True # Use DTensorPolicyWorkerV2 for stream_weights_via_http + ) + + sglang_policy = None + lm_policy = None + + try: + print("Creating SGLang policy...") + sglang_policy = SGLangGeneration(cluster, sglang_config) + sglang_policy.finish_generation() + + print("Creating DTensor policy...") + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + print("Preparing refit info...") + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation( + lm_policy, sglang_policy, sglang_config["colocated"]["enabled"] + ) + + # Test generation + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + outputs = sglang_policy.generate(test_input_data, greedy=True) + assert "output_ids" in outputs, "output_ids not found in generation output" + + generated_texts = tokenizer.batch_decode( + outputs["output_ids"], skip_special_tokens=True + ) + print(f"Generated texts: {generated_texts}") + + finally: + if sglang_policy: + sglang_policy.shutdown() + if lm_policy and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + + +@pytest.mark.skip(reason="Non-colocated mode not implemented for SGLang") +@pytest.mark.timeout(300) +@pytest.mark.sglang +def test_sglang_generation_with_hf_training_non_colocated( + policy_cluster_separate, tokenizer +): + """Test that DTensor policy can work together with non-colocated SGLang policy.""" + from nemo_rl.algorithms.grpo import refit_policy_generation + from nemo_rl.models.policy.lm_policy import Policy + + generation_cluster_separate = get_generation_cluster_separate(2) + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer) + sglang_config["colocated"]["enabled"] = False + + dtensor_config = deepcopy(basic_dtensor_test_config) + dtensor_config["generation"]["colocated"]["enabled"] = False + dtensor_config["train_global_batch_size"] = 4 + dtensor_config["dtensor_cfg"]["_v2"] = ( + True # Use DTensorPolicyWorkerV2 for stream_weights_via_http + ) + + sglang_policy = None + lm_policy = None + + try: + print("Creating SGLang 
policy...") + sglang_policy = SGLangGeneration(generation_cluster_separate, sglang_config) + sglang_policy.finish_generation() + + print("Creating DTensor policy...") + lm_policy = Policy(policy_cluster_separate, dtensor_config, tokenizer) + + # Initialize collective communication + ip, port = policy_cluster_separate.get_master_address_and_port() + train_world_size = policy_cluster_separate.world_size() + inference_world_size = generation_cluster_separate.world_size() + world_size = train_world_size + inference_world_size + + futures_train = lm_policy.init_collective( + ip, port, world_size=world_size, train_world_size=train_world_size + ) + futures_inference = sglang_policy.init_collective( + ip, port, world_size=world_size, train_world_size=train_world_size + ) + ray.get(futures_train + futures_inference) + + # Prepare refit info + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + print("Refitting SGLang policy...") + refit_policy_generation(lm_policy, sglang_policy, False) + + # Test generation + test_prompts = ["Hello, my name is", "The capital of France is"] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=20, + truncation=True, + return_tensors="pt", + padding_side="right", + ) + test_input_data = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": encodings["attention_mask"].sum(dim=1).to(torch.int32), + } + ) + + outputs = sglang_policy.generate(test_input_data, greedy=True) + assert "output_ids" in outputs, "output_ids not found in generation output" + + finally: + if sglang_policy: + sglang_policy.shutdown() + if lm_policy and hasattr(lm_policy, "shutdown"): + lm_policy.shutdown() + try: + generation_cluster_separate.shutdown() + except Exception as e: + print(f"Error during generation_cluster_separate shutdown: {e}") + + +@pytest.mark.sglang +@pytest.mark.timeout(180) +def test_sglang_weight_update_and_prefix_cache_reset(cluster, tokenizer): + """Test that the SGLang prefix cache is correctly reset when weights change.""" + from nemo_rl.models.policy.lm_policy import Policy + + sglang_config = deepcopy(basic_sglang_test_config) + sglang_config = configure_sglang_config(sglang_config, tokenizer, is_eval=True) + + dtensor_config = basic_dtensor_test_config + + sglang_policy = None + lm_policy = None + + try: + print("Creating DTensor policy...") + lm_policy = Policy(cluster, dtensor_config, tokenizer) + + print("Creating SGLang policy...") + sglang_policy = SGLangGeneration(cluster, sglang_config) + + print("Preparing refit info...") + state_dict_info = lm_policy.prepare_refit_info() + sglang_policy.prepare_refit_info(state_dict_info) + + # Prepare input data + text = "Answer the question. What is 2+2?" 
+ test_prompt = [text, text] + encodings = tokenizer( + test_prompt, + padding=True, + return_tensors="pt", + padding_side="right", + ) + input_ids = encodings["input_ids"] + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + test_input_data = BatchedDataDict( + {"input_ids": input_ids, "input_lengths": input_lengths} + ) + + print("Running Generation 1 (Initial)...") + sglang_policy.prepare_for_generation() + outputs1 = sglang_policy.generate(test_input_data, greedy=True) + logprob1 = outputs1["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 1): {logprob1}") + + print("Adding noise to weights in HF policy...") + ray.get( + [ + worker._add_noise_to_weights.remote() + for worker in lm_policy.worker_group.workers + ] + ) + + print("Updating SGLang weights from DTensor policy via HTTP...") + # Get SGLang server URL to GPU UUID mapping + sglang_url_to_gpu_uuids = sglang_policy.get_sglang_url_to_gpu_uuids() + print(f"SGLang URL to GPU UUIDs: {sglang_url_to_gpu_uuids}") + + # Stream weights via HTTP (CUDA IPC) + ray.get(lm_policy.stream_weights_via_http(sglang_url_to_gpu_uuids)) + + print("Running Generation 2 (Weights Updated)...") + outputs2 = sglang_policy.generate(test_input_data, greedy=True) + logprob2 = outputs2["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 2): {logprob2}") + assert logprob2 != logprob1, "Logprobs should be different after weight update." + + print("Resetting SGLang prefix cache...") + sglang_policy.finish_generation() + sglang_policy.prepare_for_generation() + + print("Running Generation 3 (Cache Reset)...") + outputs3 = sglang_policy.generate(test_input_data, greedy=True) + logprob3 = outputs3["logprobs"][0, input_lengths[0]].item() + print(f"Logprob of first generated token (Run 3): {logprob3}") + + print("Prefix cache reset verified successfully.") + + finally: + print("Cleaning up resources...") + if sglang_policy: + sglang_policy.shutdown() + if lm_policy: + lm_policy.shutdown() + gc.collect() + torch.cuda.empty_cache() diff --git a/uv.lock b/uv.lock index f98bc2e21f..c03963443a 100644 --- a/uv.lock +++ b/uv.lock @@ -32,10 +32,10 @@ constraints = [ { name = "urllib3", specifier = ">=2.6.3" }, ] overrides = [ - { name = "nvidia-modelopt", extras = ["torch"], specifier = ">=0.39.0" }, { name = "opencv-python-headless", specifier = ">=4.11.0" }, { name = "timm", specifier = "<=1.0.22" }, { name = "transformer-engine", extras = ["pytorch"], specifier = "==2.8.0" }, + { name = "transformers", specifier = ">=4.57.1" }, ] [[manifest.dependency-metadata]] @@ -45,7 +45,7 @@ requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] name = "deep-ep" -version = "1.2.1+bfded34" +version = "1.1.0+e3908bf" requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] @@ -64,7 +64,7 @@ requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[manifest.dependency-metadata]] name = "nv-grouped-gemm" -version = "1.1.4.post7" +version = "1.1.4.post6" requires-dist = ["setuptools", "wheel", "torch", "numpy"] [[package]] @@ -87,8 +87,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", 
source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/66/be171836d86dc5b8698b3a9bf4b9eb10cb53369729939f88bf650167588b/accelerate-1.10.0.tar.gz", hash = "sha256:8270568fda9036b5cccdc09703fef47872abccd56eb5f6d53b54ea5fb7581496", size = 392261, upload-time = "2025-08-07T10:54:51.664Z" } wheels = [ @@ -309,25 +309,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] -[[package]] -name = "anthropic" -version = "0.71.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "distro" }, - { name = "docstring-parser" }, - { name = "httpx" }, - { name = "jiter" }, - { name = "pydantic" }, - { name = "sniffio" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/82/4f/70682b068d897841f43223df82d96ec1d617435a8b759c4a2d901a50158b/anthropic-0.71.0.tar.gz", hash = "sha256:eb8e6fa86d049061b3ef26eb4cbae0174ebbff21affa6de7b3098da857d8de6a", size = 489102, upload-time = "2025-10-16T15:54:40.08Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/77/073e8ac488f335aec7001952825275582fb8f433737e90f24eeef9d878f6/anthropic-0.71.0-py3-none-any.whl", hash = "sha256:85c5015fcdbdc728390f11b17642a65a4365d03b12b799b18b6cc57e71fdb327", size = 355035, upload-time = "2025-10-16T15:54:38.238Z" }, -] - [[package]] name = "antlr4-python3-runtime" version = "4.9.3" @@ -776,8 +757,8 @@ source = { git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8#82 dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -983,18 +964,18 @@ wheels = [ [[package]] name = "compressed-tensors" -version = "0.12.2" +version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "loguru" }, + { name = "frozendict" }, { name = "pydantic" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "transformers" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/a2/79/4c5c1cd14266f8cf2650bdb940f986ce7fcaeb56aad8cfa9e9afedf14e2f/compressed_tensors-0.12.2.tar.gz", hash = "sha256:5bb40856dd17f128ab73557ecc73799f80db4dd82fab6de875f1e6899b9ea0c4", size = 190409, upload-time = "2025-10-07T14:30:59.302Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/99/3fdabfc95609d6efdf02fa7f1ed0245524cb1209d3d4a17109d3205d2eed/compressed_tensors-0.11.0.tar.gz", hash = "sha256:95ddf19699f775df6494dd864e5f52e8a24f8015496520190c1a22c6cfc44b1f", size = 187566, upload-time = "2025-08-19T18:59:31.854Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/c0/1695b87d369e6652ec0d650912e02eca2151c5e9c29244f94d2afccfe970/compressed_tensors-0.12.2-py3-none-any.whl", hash = "sha256:e554ea761710ca2b0c0ea49276a4ef8e08658624f1591e6a7368817106b48fbe", size = 183049, upload-time = "2025-10-07T14:30:56.523Z" }, + { url = "https://files.pythonhosted.org/packages/d2/81/e3073017a8f5c75169e79108eda209e6089e3f96c9f197d307cbda7df71c/compressed_tensors-0.11.0-py3-none-any.whl", hash = "sha256:e1cbc46e1ae032b7ceea915fe18c8d2de5a54d3a50a607969b6bdfe703b6cb83", size = 179951, upload-time = "2025-08-19T18:59:29.308Z" }, ] [[package]] @@ -1243,10 +1224,10 @@ version = "25.3.2" source = { git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab#87a86aba72cfd2f0d8abecaf81c13c4528ea07d8" } dependencies = [ { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "triton", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, ] [[package]] @@ -1336,13 +1317,13 @@ wheels = [ [[package]] name = "deep-ep" -version = "1.2.1+bfded34" -source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480#bfded34800dfec415b71503f8205181de90b2480" } +version = "1.1.0+e3908bf" +source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef#e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" } dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -1352,8 +1333,8 @@ source = { git = 
"https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1 dependencies = [ { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -1379,15 +1360,15 @@ wheels = [ [[package]] name = "depyf" -version = "0.20.0" +version = "0.19.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "astor" }, { name = "dill" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/88/35/83fb0178212279aa0af031031905804c6de5618435d229f41ed21bb9ad2c/depyf-0.20.0.tar.gz", hash = "sha256:fb7683bd72c44f67b56029df2c47721e9a02ffa4d7b19095f1c54c4ebf797a98", size = 6168761, upload-time = "2025-10-13T12:33:38.589Z" } +sdist = { url = "https://files.pythonhosted.org/packages/19/38/69157d711be575f1b9cf3177b64ef4ade44373fc02839f183fdd98ec2dd6/depyf-0.19.0.tar.gz", hash = "sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44", size = 6171405, upload-time = "2025-04-20T08:07:41.224Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/65/4df6936130b56e1429114e663e7c1576cf845f3aef1b2dd200c0a5d19dba/depyf-0.20.0-py3-none-any.whl", hash = "sha256:d31effad4261cebecb58955d832e448ace88f432328f95f82fd99c30fd9308d4", size = 39381, upload-time = "2025-10-13T12:33:33.647Z" }, + { url = "https://files.pythonhosted.org/packages/28/4d/1192acbcdc5e843f5e5d51f6e8788f2b60a9fe0b578ac385ded67a0b0b26/depyf-0.19.0-py3-none-any.whl", hash = "sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5", size = 39034, upload-time = "2025-04-20T08:07:37.036Z" }, ] [[package]] @@ -1489,15 +1470,6 @@ version = "0.6.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a2/55/8f8cab2afd404cf578136ef2cc5dfb50baa1761b68c9da1fb1e4eed343c9/docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491", size = 25901, upload-time = "2014-06-16T11:18:57.406Z" } -[[package]] -name = "docstring-parser" -version = "0.17.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, -] - [[package]] name = "docutils" version = "0.21.2" @@ -1544,8 +1516,8 @@ version = "0.1.0" source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0#d5363b4a418128cd8111983b191c4b8869a9766b" } dependencies = [ { name = "absl-py" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = 
"sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "typing-extensions" }, ] @@ -1672,8 +1644,8 @@ version = "0.3.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "einops" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/67/c6/10a1149b07e6bab45b2cb2d07f6b827716c2baf5f3404161753f25c6389b/fla_core-0.3.2.tar.gz", hash = "sha256:d38db16bc4e1c6fa8c04df442f246da1e6926a209426bc6ef703d41bfbc37c92", size = 296725, upload-time = "2025-09-10T07:43:40.155Z" } wheels = [ @@ -1689,8 +1661,8 @@ dependencies = [ { name = "ninja" }, { name = "psutil" }, { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e8/6d/7066d160bdffa2f9da29a8c3957f266b17a03ca0b3bdc8fdae86d9881fe7/flash_attn-2.8.1.tar.gz", hash = "sha256:0ff003899fcb244f357905b04f622d5c9736887126dd6675f8f4bc52954e3923", size = 8166563, upload-time = "2025-07-10T05:16:39.729Z" } @@ -1725,8 +1697,8 @@ dependencies = [ { name = "packaging" }, { name = "requests" }, { name = "tabulate" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d8/04/e357eaa50238e12c49e66fcf47f83e066e741ef19a117c136782b32eafbb/flashinfer_python-0.5.2.tar.gz", hash = "sha256:99d097a28be1e98c7f85e4a767e9e9a4794374f9318c27db14d21e367149063f", size = 4632657, upload-time = "2025-11-07T02:53:27.261Z" } @@ -1805,6 +1777,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/0f/64/9d606e66d498917cd7a2ff24f558010d42d6fd4576d9dd57f0bd98333f5a/fonttools-4.59.1-py3-none-any.whl", hash = "sha256:647db657073672a8330608970a984d51573557f328030566521bc03415535042", size = 1130094, upload-time = "2025-08-14T16:28:12.048Z" }, ] +[[package]] +name = "frozendict" +version = "2.4.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/b2/2a3d1374b7780999d3184e171e25439a8358c47b481f68be883c14086b4c/frozendict-2.4.7.tar.gz", hash = "sha256:e478fb2a1391a56c8a6e10cc97c4a9002b410ecd1ac28c18d780661762e271bd", size = 317082, upload-time = "2025-11-11T22:40:14.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/74/f94141b38a51a553efef7f510fc213894161ae49b88bffd037f8d2a7cb2f/frozendict-2.4.7-py3-none-any.whl", hash = "sha256:972af65924ea25cf5b4d9326d549e69a9a4918d8a76a9d3a7cd174d98b237550", size = 16264, upload-time = "2025-11-11T22:40:12.836Z" }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -2441,6 +2422,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635, upload-time = "2024-01-06T23:01:20.829Z" }, ] +[[package]] +name = "ipython" +version = "9.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/12/51/a703c030f4928646d390b4971af4938a1b10c9dfce694f0d99a0bb073cb2/ipython-9.8.0.tar.gz", hash = "sha256:8e4ce129a627eb9dd221c41b1d2cdaed4ef7c9da8c17c63f6f578fe231141f83", size = 4424940, upload-time = "2025-12-03T10:18:24.353Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/df/8ee1c5dd1e3308b5d5b2f2dfea323bb2f3827da8d654abb6642051199049/ipython-9.8.0-py3-none-any.whl", hash = "sha256:ebe6d1d58d7d988fbf23ff8ff6d8e1622cfdb194daf4b7b73b792c4ec3b85385", size = 621374, upload-time = "2025-12-03T10:18:22.335Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + [[package]] name = "itsdangerous" version = "2.2.0" @@ -2450,6 +2464,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = 
"sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, ] +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -2692,9 +2718,9 @@ name = "liger-kernel" version = "0.6.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, { name = "triton", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or sys_platform == 'win32'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/31/23/be0b4dcac42d77f99406c906567cde22a7a3d71b3f3ffdfda2ac6153ec36/liger_kernel-0.6.2.tar.gz", hash = "sha256:5c5bcffffa769bc26ae838f5a4954170dd5cacde036abb1b383039f39fa5fd69", size = 3679495, upload-time = "2025-08-22T00:15:28.456Z" } wheels = [ @@ -2703,15 +2729,15 @@ wheels = [ [[package]] name = "llguidance" -version = "1.3.0" +version = "0.7.30" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/38/d1ef3ae08d8d857e5e0690c5b1e07bf7eb4a1cae5881d87215826dc6cadb/llguidance-0.7.30.tar.gz", hash = "sha256:e93bf75f2b6e48afb86a5cee23038746975e1654672bf5ba0ae75f7d4d4a2248", size = 1055528, upload-time = "2025-06-23T00:23:49.247Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" }, - { url = 
"https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" }, - { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" }, - { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" }, - { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" }, + { url = "https://files.pythonhosted.org/packages/b3/e1/694c89986fcae7777184fc8b22baa0976eba15a6847221763f6ad211fc1f/llguidance-0.7.30-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c80af02c118d2b0526bcecaab389af2ed094537a069b0fc724cd2a2f2ba3990f", size = 3327974, upload-time = "2025-06-23T00:23:47.556Z" }, + { url = "https://files.pythonhosted.org/packages/fd/77/ab7a548ae189dc23900fdd37803c115c2339b1223af9e8eb1f4329b5935a/llguidance-0.7.30-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:00a256d532911d2cf5ba4ef63e182944e767dd2402f38d63002016bc37755958", size = 3210709, upload-time = "2025-06-23T00:23:45.872Z" }, + { url = "https://files.pythonhosted.org/packages/9c/5b/6a166564b14f9f805f0ea01ec233a84f55789cb7eeffe1d6224ccd0e6cdd/llguidance-0.7.30-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8741c867e4bc7e42f7cdc68350c076b4edd0ca10ecefbde75f15a9f6bc25d0", size = 14867038, upload-time = "2025-06-23T00:23:39.571Z" }, + { url = "https://files.pythonhosted.org/packages/af/80/5a40b9689f17612434b820854cba9b8cabd5142072c491b5280fe5f7a35e/llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9edc409b9decd6cffba5f5bf3b4fbd7541f95daa8cbc9510cbf96c6ab1ffc153", size = 15004926, upload-time = "2025-06-23T00:23:43.965Z" }, + { url = "https://files.pythonhosted.org/packages/99/47/58e49a118b514855b245f8a962c6aaf9a5cc95a0f61eac7e230e691c7b7e/llguidance-0.7.30-cp39-abi3-win_amd64.whl", hash = "sha256:05234ecceea7c9c6ff13b9739112043173a3bcb88cae860249b20335a07b3075", size = 2796878, upload-time = "2025-06-23T00:23:51Z" }, ] [[package]] @@ -2747,19 +2773,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/ef/11292bb0b85cf4c93447cab5a29f64576ed14d3ab4280e35ddd23486594a/lm_format_enforcer-0.11.3-py3-none-any.whl", hash = "sha256:cf586350875def1ae7a8fba84fcbbfc8371424b6c9d05c1fcba70aa233fbf06f", size = 45418, upload-time = "2025-08-24T19:37:46.325Z" }, ] -[[package]] -name = "loguru" -version = "0.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "win32-setctime", marker = "sys_platform == 'win32'" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, -] - [[package]] name = "lxml" version = "6.0.0" @@ -2820,8 +2833,8 @@ dependencies = [ { name = "causal-conv1d" }, { name = "ninja" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] [[package]] @@ -2949,6 +2962,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/60/3601f8ce6d76a7c81c7f25a0e15fde0d6b66226dd187aa6d2838e6374161/matplotlib-3.10.5-cp314-cp314t-win_arm64.whl", hash = "sha256:2efaf97d72629e74252e0b5e3c46813e9eeaa94e011ecf8084a971a31a97f40b", size = 8153849, upload-time = "2025-07-31T18:09:19.673Z" }, ] +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + [[package]] name = "mdit-py-plugins" version = "0.5.0" @@ -3037,7 +3062,7 @@ dependencies = [ { name = "multi-storage-client" }, { name = "numpy" }, { name = "nv-grouped-gemm" }, - { name = "nvidia-modelopt" }, + { name = "nvidia-modelopt", marker = "sys_platform != 'darwin'" }, { name = "nvidia-resiliency-ext" }, { name = "nvtx" }, { name = "onnxscript" }, @@ -3046,8 +3071,8 @@ dependencies = [ { name = "setuptools" }, { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, { name = "tensorstore", version = "0.1.76", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = 
"2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "transformer-engine", extra = ["pytorch"] }, { name = "wget" }, @@ -3065,7 +3090,7 @@ requires-dist = [ { name = "megatron-energon", extras = ["av-decode"], specifier = "~=6.0" }, { name = "multi-storage-client", specifier = "~=0.27" }, { name = "numpy", specifier = "<2.0.0" }, - { name = "nv-grouped-gemm", git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7" }, + { name = "nv-grouped-gemm", specifier = "~=1.1" }, { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin'", specifier = ">=0.33.0a0,<0.34.0" }, { name = "nvidia-resiliency-ext", specifier = ">=0.4.0a0,<0.5.0" }, { name = "nvtx", specifier = "~=0.2" }, @@ -3093,8 +3118,8 @@ dependencies = [ { name = "pillow" }, { name = "pyyaml" }, { name = "s3fs" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "webdataset" }, ] @@ -3120,8 +3145,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "packaging" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a0/be/06ada3d765ebca304e2d87873d6cf00807b43155ed57058abcd813d13a5d/megatron_fsdp-0.1.0rc1.tar.gz", hash = "sha256:4852a1c62bb95b5fc9567165ee7119f2e68bc75d6103af06bd1e6d392a50021f", size = 71600, upload-time = "2025-09-02T21:29:10.757Z" } wheels = [ @@ -3148,6 +3173,10 @@ wheels = [ ] [package.optional-dependencies] +audio = [ + { name = "soundfile" }, + { name = "soxr" }, +] image = [ { name = "opencv-python-headless" }, ] @@ -3338,21 +3367,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/59/8e4dee2893a56fc68a27eec7ec7ed9559c7ea01099313a9b8196373bf3cf/mlx_metal-0.28.0-py3-none-macosx_15_0_arm64.whl", hash = "sha256:214ece3781d44f57eb9686561594b28915ec5568df4a5a73da59c66880b204ed", size = 33167706, upload-time = "2025-08-07T07:53:03.852Z" }, ] -[[package]] -name = "model-hosting-container-standards" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "fastapi" }, - { name = "httpx" }, - { name = "jmespath" }, - { name = "pydantic" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/d0/eaba9ff13f7a534bf2c0f28e4e32dee58583dc3a31fe3eebb3b93ed13675/model_hosting_container_standards-0.1.4.tar.gz", hash = "sha256:86838d16e4d05bc6fdafdf83dc292a9d34124b63584764ad6cd67b05d09cda62", size = 63332, 
upload-time = "2025-11-10T17:58:37.321Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/fc/d6034069e52003ed86f72e436b65f16084fa4d08c6b8220bc0fc85e33eab/model_hosting_container_standards-0.1.4-py3-none-any.whl", hash = "sha256:ede565ba750e812eef028804c84b8244a96fb733fcaec9a1e552568df809d841", size = 86597, upload-time = "2025-11-10T17:58:35.843Z" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -3561,8 +3575,8 @@ dependencies = [ { name = "opencv-python-headless" }, { name = "pybind11" }, { name = "pyyaml" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchao" }, { name = "torchdata" }, { name = "transformers" }, @@ -3608,8 +3622,8 @@ vlm = [ [package.dev-dependencies] build = [ { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] dev = [ { name = "cut-cross-entropy" }, @@ -3749,6 +3763,7 @@ dependencies = [ { name = "accelerate" }, { name = "blobfile" }, { name = "colored" }, + { name = "coverage" }, { name = "datasets" }, { name = "debugpy" }, { name = "hydra-core" }, @@ -3773,26 +3788,23 @@ dependencies = [ { name = "sympy" }, { name = "tensorboard" }, { name = "tiktoken" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchdata" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = 
"0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "transformers" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "wandb" }, ] [package.optional-dependencies] automodel = [ { name = "causal-conv1d" }, - { name = "deep-ep" }, { name = "flash-attn" }, { name = "mamba-ssm" }, { name = "nemo-automodel" }, - { name = "nv-grouped-gemm" }, - { name = "transformer-engine", extra = ["pytorch"] }, { name = "vllm" }, ] mcore = [ @@ -3805,6 +3817,26 @@ mcore = [ nemo-gym = [ { name = "nemo-gym" }, ] +sglang = [ + { name = "compressed-tensors" }, + { name = "einops" }, + { name = "interegular" }, + { name = "msgspec" }, + { name = "openai" }, + { name = "openai-harmony" }, + { name = "orjson" }, + { name = "partial-json-parser" }, + { name = "pybase64" }, + { name = "python-multipart" }, + { name = "requests" }, + { name = "sentencepiece" }, + { name = "sgl-kernel" }, + { name = "sglang" }, + { name = "torch-memory-saver" }, + { name = "torchao" }, + { name = "uvloop" }, + { name = "xgrammar" }, +] vllm = [ { name = "causal-conv1d" }, { name = "cuda-python" }, @@ -3824,8 +3856,8 @@ build = [ { name = "psutil" }, { name = "pybind11" }, { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] dev = [ { name = "pre-commit" }, @@ -3862,16 +3894,19 @@ requires-dist = [ { name = "causal-conv1d", marker = "extra == 'automodel'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, { name = "causal-conv1d", marker = "extra == 'vllm'", git = "https://github.com/Dao-AILab/causal-conv1d?tag=v1.5.0.post8" }, { name = "colored", specifier = "==2.2.3" }, + { name = "compressed-tensors", marker = "extra == 'sglang'" }, + { name = "coverage", specifier = ">=7.10.4" }, { name = "cuda-python", marker = "extra == 'vllm'" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "debugpy" }, - { name = "deep-ep", marker = "extra == 'automodel'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, - { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, + { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" }, { name = 
"deep-gemm", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" }, + { name = "einops", marker = "extra == 'sglang'" }, { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.8.1" }, { name = "hydra-core" }, + { name = "interegular", marker = "extra == 'sglang'" }, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "math-verify" }, @@ -3879,43 +3914,56 @@ requires-dist = [ { name = "megatron-bridge", marker = "extra == 'mcore'", editable = "3rdparty/Megatron-Bridge-workspace" }, { name = "megatron-core", marker = "extra == 'mcore'", editable = "3rdparty/Megatron-LM-workspace" }, { name = "mlflow", specifier = ">=3.5.0,<3.6.0" }, + { name = "msgspec", marker = "extra == 'sglang'" }, { name = "nemo-automodel", marker = "extra == 'automodel'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "nemo-gym", marker = "extra == 'nemo-gym'", editable = "3rdparty/Gym-workspace" }, { name = "ninja" }, { name = "num2words", specifier = ">=0.5.14" }, { name = "num2words", marker = "extra == 'vllm'", specifier = ">=0.5.14" }, { name = "numpy" }, - { name = "nv-grouped-gemm", marker = "extra == 'automodel'", git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7" }, { name = "nvidia-ml-py" }, { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "nvtx" }, { name = "omegaconf" }, + { name = "openai", marker = "extra == 'sglang'" }, + { name = "openai-harmony", marker = "extra == 'sglang'" }, + { name = "orjson", marker = "extra == 'sglang'" }, + { name = "partial-json-parser", marker = "extra == 'sglang'" }, { name = "pillow", specifier = ">=11.3.0" }, { name = "pip" }, { name = "plotly" }, + { name = "pybase64", marker = "extra == 'sglang'" }, + { name = "python-multipart", marker = "extra == 'sglang'" }, { name = "pyzmq" }, { name = "ray", extras = ["default"], specifier = "==2.49.2" }, + { name = "requests", marker = "extra == 'sglang'" }, { name = "rich" }, + { name = "sentencepiece", marker = "extra == 'sglang'" }, { name = "setuptools" }, + { name = "sgl-kernel", marker = "extra == 'sglang'", specifier = "==0.3.17.post1" }, + { name = "sglang", marker = "extra == 'sglang'", specifier = ">=0.4.1" }, { name = "swanlab" }, { name = "sympy", specifier = ">=1.14.0" }, { name = "tensorboard" }, { name = "tiktoken" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.9.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.8.0", index = "https://pypi.org/simple" }, + { name = "torch-memory-saver", marker = "extra == 'sglang'" }, + { name = "torchao", marker = "extra == 'sglang'" }, { name = "torchdata" }, { name = 
"torchvision", marker = "sys_platform != 'darwin'", specifier = ">=0.22.0", index = "https://download.pytorch.org/whl/cu129" }, { name = "torchvision", marker = "sys_platform == 'darwin'", specifier = ">=0.22.0", index = "https://pypi.org/simple" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'automodel'", specifier = "==2.8.0" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'mcore'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "==4.57.1" }, + { name = "transformers", specifier = ">=4.55.4" }, { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')", index = "https://download.pytorch.org/whl/cu129" }, - { name = "vllm", marker = "extra == 'automodel'", specifier = "==0.11.2" }, - { name = "vllm", marker = "extra == 'mcore'", specifier = "==0.11.2" }, - { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.11.2" }, + { name = "uvloop", marker = "extra == 'sglang'" }, + { name = "vllm", marker = "extra == 'automodel'", specifier = "==0.11.0" }, + { name = "vllm", marker = "extra == 'mcore'", specifier = "==0.11.0" }, + { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.11.0" }, { name = "wandb" }, + { name = "xgrammar", marker = "extra == 'sglang'" }, ] -provides-extras = ["automodel", "vllm", "mcore", "nemo-gym"] +provides-extras = ["automodel", "vllm", "sglang", "mcore", "nemo-gym"] [package.metadata.requires-dev] build = [ @@ -3925,8 +3973,8 @@ build = [ { name = "psutil" }, { name = "pybind11" }, { name = "setuptools" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.9.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.8.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "==2.8.0", index = "https://pypi.org/simple" }, ] dev = [ { name = "pre-commit", specifier = ">=4.2.0" }, @@ -4053,21 +4101,20 @@ wheels = [ [[package]] name = "nv-grouped-gemm" version = "1.1.4.post7" -source = { git = "https://github.com/fanshiqing/grouped_gemm?tag=v1.1.4.post7#6dfaf60e6112166b8b82e9210b51c7f557956f0a" } +source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "absl-py" }, { name = "numpy" }, - { name = "setuptools" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "wheel" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/63/36/13d0a1e1af31c3b2a297c15b6e7da532b13361730b32d11d9698854bdbe3/nv_grouped_gemm-1.1.4.post7.tar.gz", hash = "sha256:bc9f7906c9b0bd7fefea5a776acbc277577c65b103181340fd26ca2b8460c6a5", size = 26520, upload-time = "2025-12-16T19:42:33.176Z" } [[package]] name = "nvidia-cublas-cu12" version = "12.9.1.4" source = { registry = "https://pypi.org/simple" } wheels = 
[ - { url = "https://files.pythonhosted.org/packages/82/6c/90d3f532f608a03a13c1d6c16c266ffa3828e8011b1549d3b61db2ad59f5/nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7a950dae01add3b415a5a5cdc4ec818fb5858263e9cca59004bb99fdbbd3a5d6", size = 575006342, upload-time = "2025-06-05T20:04:16.902Z" }, { url = "https://files.pythonhosted.org/packages/77/3c/aa88abe01f3be3d1f8f787d1d33dc83e76fec05945f9a28fbb41cfb99cd5/nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:453611eb21a7c1f2c2156ed9f3a45b691deda0440ec550860290dc901af5b4c2", size = 581242350, upload-time = "2025-06-05T20:04:51.979Z" }, ] @@ -4076,7 +4123,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/78/351b5c8cdbd9a6b4fb0d6ee73fb176dcdc1b6b6ad47c2ffff5ae8ca4a1f7/nvidia_cuda_cupti_cu12-12.9.79-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:791853b030602c6a11d08b5578edfb957cadea06e9d3b26adbf8d036135a4afe", size = 10077166, upload-time = "2025-06-05T20:01:01.385Z" }, { url = "https://files.pythonhosted.org/packages/c1/2e/b84e32197e33f39907b455b83395a017e697c07a449a2b15fd07fc1c9981/nvidia_cuda_cupti_cu12-12.9.79-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:096bcf334f13e1984ba36685ad4c1d6347db214de03dbb6eebb237b41d9d934f", size = 10814997, upload-time = "2025-06-05T20:01:10.168Z" }, ] @@ -4086,7 +4132,6 @@ version = "12.9.86" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/b8/85/e4af82cc9202023862090bfca4ea827d533329e925c758f0cde964cb54b7/nvidia_cuda_nvrtc_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:210cf05005a447e29214e9ce50851e83fc5f4358df8b453155d5e1918094dcb4", size = 89568129, upload-time = "2025-06-05T20:02:41.973Z" }, - { url = "https://files.pythonhosted.org/packages/64/eb/c2295044b8f3b3b08860e2f6a912b702fc92568a167259df5dddb78f325e/nvidia_cuda_nvrtc_cu12-12.9.86-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:096d4de6bda726415dfaf3198d4f5c522b8e70139c97feef5cd2ca6d4cd9cead", size = 44528905, upload-time = "2025-06-05T20:02:29.754Z" }, ] [[package]] @@ -4094,7 +4139,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bc/e0/0279bd94539fda525e0c8538db29b72a5a8495b0c12173113471d28bce78/nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83469a846206f2a733db0c42e223589ab62fd2fabac4432d2f8802de4bded0a4", size = 3515012, upload-time = "2025-06-05T20:00:35.519Z" }, { url = "https://files.pythonhosted.org/packages/bc/46/a92db19b8309581092a3add7e6fceb4c301a3fd233969856a8cbf042cd3c/nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25bba2dfb01d48a9b59ca474a1ac43c6ebf7011f1b0b8cc44f54eb6ac48a96c3", size = 3493179, upload-time = "2025-06-05T20:00:53.735Z" }, ] @@ -4103,10 +4147,9 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = 
"https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ] @@ -4128,10 +4171,9 @@ name = "nvidia-cufft-cu12" version = "11.4.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/2b/76445b0af890da61b501fde30650a1a4bd910607261b209cccb5235d3daa/nvidia_cufft_cu12-11.4.1.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1a28c9b12260a1aa7a8fd12f5ebd82d027963d635ba82ff39a1acfa7c4c0fbcf", size = 200822453, upload-time = "2025-06-05T20:05:27.889Z" }, { url = "https://files.pythonhosted.org/packages/95/f4/61e6996dd20481ee834f57a8e9dca28b1869366a135e0d42e2aa8493bdd4/nvidia_cufft_cu12-11.4.1.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c67884f2a7d276b4b80eb56a79322a95df592ae5e765cf1243693365ccab4e28", size = 200877592, upload-time = "2025-06-05T20:05:45.862Z" }, ] @@ -4141,7 +4183,6 @@ version = "1.14.1.1" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/28/b960e06d705a440c030edd84e16888ee14c743390bdb2a6368e92ffe8ef8/nvidia_cufile_cu12-1.14.1.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9552e2231792e94b1ff17bc99e958cc0e6bbbaa4a9d91fa2dbeed97716628fe6", size = 1210714, upload-time = "2025-06-05T20:06:11.898Z" }, - { url = "https://files.pythonhosted.org/packages/b9/d2/110af3a1f77999d5eebf6ffae5d2305ab839e53c76eec3696640cc25b35d/nvidia_cufile_cu12-1.14.1.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8dea77590761e02cb6dd955a57cb6414c58aa3cb1b7adbf9919869a11509cf65", size = 1135994, upload-time = "2025-06-05T20:06:03.952Z" }, ] [[package]] @@ -4149,7 +4190,6 @@ name = "nvidia-curand-cu12" version = "10.3.10.19" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/14/1c/2a45afc614d99558d4a773fa740d8bb5471c8398eeed925fc0fcba020173/nvidia_curand_cu12-10.3.10.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:de663377feb1697e1d30ed587b07d5721fdd6d2015c738d7528a6002a6134d37", size = 68292066, upload-time = "2025-05-01T19:39:13.595Z" }, { url = "https://files.pythonhosted.org/packages/31/44/193a0e171750ca9f8320626e8a1f2381e4077a65e69e2fb9708bd479e34a/nvidia_curand_cu12-10.3.10.19-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:49b274db4780d421bd2ccd362e1415c13887c53c214f0d4b761752b8f9f6aa1e", size = 68295626, upload-time = "2025-05-01T19:39:38.885Z" }, ] @@ -4158,12 +4198,11 @@ name = "nvidia-cusolver-cu12" version = "11.7.5.82" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, - { name = 
"nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/03/99/686ff9bf3a82a531c62b1a5c614476e8dfa24a9d89067aeedf3592ee4538/nvidia_cusolver_cu12-11.7.5.82-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:62efa83e4ace59a4c734d052bb72158e888aa7b770e1a5f601682f16fe5b4fd2", size = 337869834, upload-time = "2025-06-05T20:06:53.125Z" }, { url = "https://files.pythonhosted.org/packages/33/40/79b0c64d44d6c166c0964ec1d803d067f4a145cca23e23925fd351d0e642/nvidia_cusolver_cu12-11.7.5.82-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:15da72d1340d29b5b3cf3fd100e3cd53421dde36002eda6ed93811af63c40d88", size = 338117415, upload-time = "2025-06-05T20:07:16.809Z" }, ] @@ -4172,10 +4211,9 @@ name = "nvidia-cusparse-cu12" version = "12.5.10.65" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/6f/8710fbd17cdd1d0fc3fea7d36d5b65ce1933611c31e1861da330206b253a/nvidia_cusparse_cu12-12.5.10.65-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:221c73e7482dd93eda44e65ce567c031c07e2f93f6fa0ecd3ba876a195023e83", size = 366359408, upload-time = "2025-06-05T20:07:42.501Z" }, { url = "https://files.pythonhosted.org/packages/12/46/b0fd4b04f86577921feb97d8e2cf028afe04f614d17fb5013de9282c9216/nvidia_cusparse_cu12-12.5.10.65-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:73060ce019ac064a057267c585bf1fd5a353734151f87472ff02b2c5c9984e78", size = 366465088, upload-time = "2025-06-05T20:08:20.413Z" }, ] @@ -4184,7 +4222,6 @@ name = "nvidia-cusparselt-cu12" version = "0.7.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ] @@ -4215,35 +4252,46 @@ wheels = [ [[package]] name = "nvidia-modelopt" -version = "0.40.0" +version = "0.33.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ninja" }, - { name = "numpy" }, - { name = "nvidia-ml-py" }, - { name = "packaging" }, - { name = "pulp" }, - { name = "pydantic" }, - { name = "regex" }, - { name = "rich" }, - { name = "safetensors" }, - { name = "scipy" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = 
"sys_platform != 'darwin'" }, - { name = "torchprofile" }, - { name = "tqdm" }, + { name = "ninja", marker = "sys_platform != 'darwin'" }, + { name = "numpy", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-ml-py", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-modelopt-core", marker = "sys_platform != 'darwin'" }, + { name = "packaging", marker = "sys_platform != 'darwin'" }, + { name = "pulp", marker = "sys_platform != 'darwin'" }, + { name = "pydantic", marker = "sys_platform != 'darwin'" }, + { name = "regex", marker = "sys_platform != 'darwin'" }, + { name = "rich", marker = "sys_platform != 'darwin'" }, + { name = "safetensors", marker = "sys_platform != 'darwin'" }, + { name = "scipy", marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchprofile", marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "tqdm", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/cb/4af39357792a96f334c7877ea0380c9337aec210ff4794a7dd95beb7c349/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6c51091683a117cd40fdb96a0ec28579f2276f6b627db7ccddc370df544e1dd7", size = 751683, upload-time = "2025-08-12T18:37:48.832Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b1/fc2f468d140ef58e90fac584759d0cc449db9bc4f64668cdff750ef38fef/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ef78a98901890f265596ec413dffac177d4a1865201d89a14f29f4fa0cf8e710", size = 751683, upload-time = "2025-08-12T18:36:59.964Z" }, ] + +[[package]] +name = "nvidia-modelopt-core" +version = "0.33.1" +source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/4a/4b4c339637fdbd54bc98b92c87c8b22f5efee05ca9e31e40a8d49ee66187/nvidia_modelopt-0.40.0-py3-none-any.whl", hash = "sha256:0315f53aef014b902866e427038db5803e3c6787a8e1f09c3650031550885051", size = 901421, upload-time = "2025-12-12T10:35:28.506Z" }, + { url = "https://files.pythonhosted.org/packages/9b/b5/ba79b1c52b634b24e45dca409f133f947217a5c7ec5c256266e4ec5fa3eb/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1ddd9279d8312f8e972b302692a26e6180f1c9fd277232f5925a5589f42b1b76", size = 1338081, upload-time = "2025-08-12T18:40:36.156Z" }, + { url = "https://files.pythonhosted.org/packages/13/40/4427583475dfd8eb1b8c7522d75d4d059f0512ff03dcc62d6986a22ab918/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:69d5ace564f2b056c916117be2023f2b7fc01cd1501073915e6b2ced2b8a5394", size = 1363366, upload-time = "2025-08-12T18:39:28.854Z" }, ] [[package]] name = "nvidia-nccl-cu12" -version = "2.27.5" +version = "2.27.3" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", 
hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, - { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, + { url = "https://files.pythonhosted.org/packages/5c/5b/4e4fff7bad39adf89f735f2bc87248c81db71205b62bcc0d5ca5b606b3c3/nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039", size = 322364134, upload-time = "2025-06-03T21:58:04.013Z" }, ] [[package]] @@ -4252,7 +4300,6 @@ version = "12.9.86" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/46/0c/c75bbfb967457a0b7670b8ad267bfc4fffdf341c074e0a80db06c24ccfd4/nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:e3f1171dbdc83c5932a45f0f4c99180a70de9bd2718c1ab77d14104f6d7147f9", size = 39748338, upload-time = "2025-06-05T20:10:25.613Z" }, - { url = "https://files.pythonhosted.org/packages/97/bc/2dcba8e70cf3115b400fef54f213bcd6715a3195eba000f8330f11e40c45/nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:994a05ef08ef4b0b299829cde613a424382aff7efb08a7172c1fa616cc3af2ca", size = 39514880, upload-time = "2025-06-05T20:10:04.89Z" }, ] [[package]] @@ -4270,7 +4317,6 @@ version = "12.9.79" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/86/ed/bb230dce7741f2778ba2ae3e8778fdb8bc58eee9fd95f07bf7b2d18e8081/nvidia_nvtx_cu12-12.9.79-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fec150986817f2b4e7eed72ed059f2dcb9ba3856b9a96134e448eac946a6952f", size = 85504, upload-time = "2025-06-05T20:03:10.21Z" }, - { url = "https://files.pythonhosted.org/packages/c4/e4/82155e4aaedb41621087ba219c95e99c5e417f37a7649b4fb6ec32dcb14d/nvidia_nvtx_cu12-12.9.79-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d1f258e752294acdb4f61c3d31fee87bd0f60e459f1e2f624376369b524cd15d", size = 86120, upload-time = "2025-06-05T20:02:51.838Z" }, ] [[package]] @@ -4284,8 +4330,8 @@ dependencies = [ { name = "psutil" }, { name = "pynvml" }, { name = "pyyaml" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/70/05/38d491962273c7905708762279f440520eb79f3c00b67a023497215ad023/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_aarch64.whl", hash = "sha256:b3bd5f01535574b16d0f38bca6e39afe3806c4a2896eee1b321cd944e00025a7", size = 444570, upload-time = "2025-07-17T03:50:58.877Z" }, @@ -4402,11 +4448,11 @@ dependencies = [ { name = "regex" }, { name = "safetensors" }, { name = 
"timm" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/46/fb8be250fa7fcfc56fbeb41583645e18d868268f67fbbbeb8ed62a8ff18a/open_clip_torch-3.2.0.tar.gz", hash = "sha256:62b7743012ccc40fb7c64819fa762fba0a13dd74585ac733babe58c2974c2506", size = 1502853, upload-time = "2025-09-21T17:32:08.289Z" } @@ -4679,6 +4725,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, ] +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, +] + [[package]] name = "partial-json-parser" version = "0.2.1.1.post6" @@ -4709,8 +4764,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = 
"2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -4737,6 +4792,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/17/b7cb1a10ebb0a9a4c9fbcd96a28b43d44e08a90f620bab07e644a658d2f1/perceptron-0.1.4-py3-none-any.whl", hash = "sha256:f490a6df6c15167e91e1a528601cae98ce99a30991cf792f9ef83ebc15d335c4", size = 57421, upload-time = "2025-11-12T20:00:26.395Z" }, ] +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -4907,6 +4974,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/72/0824c18f3bc75810f55dacc2dd933f6ec829771180245ae3cc976195dec0/prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl", hash = "sha256:978130f3c0bb7b8ebcc90d35516a6fe13e02d2eb358c8f83887cdef7020c31e9", size = 19296, upload-time = "2025-03-19T19:35:04.323Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.2" @@ -5005,6 +5084,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" }, ] +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = 
"sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + [[package]] name = "pulp" version = "3.2.2" @@ -5014,6 +5102,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/8d/a6a9d58c929a869f7f1b99b3d37b3f14ef63e2826eef581416338d686c3f/pulp-3.2.2-py3-none-any.whl", hash = "sha256:d3ca5ff11a28b3e7b2508a992d7e51f3533471d89305f0560b5fe3b6cc821043", size = 16385354, upload-time = "2025-07-29T11:42:01.829Z" }, ] +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + [[package]] name = "py-cpuinfo" version = "9.0.0" @@ -6302,6 +6399,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0d/6d/b4752b044bf94cb802d88a888dc7d288baaf77d7910b7dedda74b5ceea0c/setuptools-79.0.1-py3-none-any.whl", hash = "sha256:e147c0549f27767ba362f9da434eab9c5dc0045d5304feb602a0af001089fc51", size = 1256281, upload-time = "2025-04-23T22:20:56.768Z" }, ] +[[package]] +name = "sgl-kernel" +version = "0.3.17.post1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/a2/d2b36e0b8a7b5d88117d8d96c4eb612fe3677069316d444479ff78c73547/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:330057ad2d239e9363ee9abd85ed445ee1795161c60b7357f9792103121039cc", size = 341776329, upload-time = "2025-11-15T15:39:54.528Z" }, + { url = "https://files.pythonhosted.org/packages/10/8f/6286c74887c42ee4e888a6c36170ff394185e581fbecce2f1bf5c174b96e/sgl_kernel-0.3.17.post1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:c864e6d6eebcd91e59a71ba781739761a21774f0cb862578381f54f504f93b4a", size = 511995347, upload-time = "2025-11-15T15:41:45.029Z" }, +] + +[[package]] +name = "sglang" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "ipython" }, + { name = "numpy" }, + { name = "requests" }, + { name = "setproctitle" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/f0/954c401fe1bc80135c245f477cb117d7bb301f7b2eebcf38dcf211c03ac1/sglang-0.5.2.tar.gz", hash = "sha256:0c8a9ad02278d12eba2f30928e0464a646d03b2e2f32efcf6c681bbd795df793", size = 1627791, upload-time = "2025-09-11T23:09:48.602Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/2b/44c336e0be9a9a23e56b6fcfed3b6f03dfc8a4181ef2cc82129aa9811fa8/sglang-0.5.2-py3-none-any.whl", hash = "sha256:83aae146f3913ed0802bb1ea356facff47efe0e7d18041a3f143de9ef6e44b2c", size = 2184239, upload-time = "2025-09-11T23:09:46.458Z" }, +] + [[package]] name = "shellingham" version = "1.5.4" @@ -6639,6 +6762,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 
44415, upload-time = "2024-12-10T12:05:27.824Z" }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + [[package]] name = "standard-aifc" version = "3.13.0" @@ -6931,11 +7068,11 @@ dependencies = [ { name = "huggingface-hub" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/94/f6/4d7a8c261341fa6ad281920618739f2a650f41043afcedb570f24e99a776/timm-1.0.16.tar.gz", hash = "sha256:a3b8130dd2cb8dc3b9f5e3d09ab6d677a6315a8695fd5264eb6d52a4a46c1044", size = 2339999, upload-time = "2025-06-26T17:09:44.208Z" } wheels = [ @@ -6978,7 +7115,7 @@ wheels = [ [[package]] name = "torch" -version = "2.9.0" +version = "2.8.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -6994,16 +7131,14 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = 
"sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/66/e8/fc414d8656250ee46120b44836ffbb3266343db424b3e18ca79ebbf69d4f/torch-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e4e5b5cba837a2a8d1a497ba9a58dae46fa392593eaa13b871c42f71847503a5", size = 74830362, upload-time = "2025-10-15T15:46:48.983Z" }, - { url = "https://files.pythonhosted.org/packages/ff/c3/a91f96ec74347fa5fd24453fa514bc61c61ecc79196fa760b012a1873d96/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:f8877779cf56d1ce431a7636703bdb13307f5960bb1af49716d8b179225e0e6a", size = 74480732, upload-time = "2025-10-15T15:47:38.002Z" }, - { url = "https://files.pythonhosted.org/packages/5c/73/9f70af34b334a7e0ef496ceec96b7ec767bd778ea35385ce6f77557534d1/torch-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7e614fae699838038d888729f82b687c03413c5989ce2a9481f9a7e7a396e0bb", size = 74433037, upload-time = "2025-10-15T15:47:41.894Z" }, - { url = "https://files.pythonhosted.org/packages/83/36/74f8c051f785500396e42f93542422422dfd874a174f21f8d955d36e5d64/torch-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:71d9309aee457bbe0b164bce2111cd911c4ed4e847e65d5077dbbcd3aba6befc", size = 74823353, upload-time = "2025-10-15T15:49:16.59Z" }, + { url = "https://files.pythonhosted.org/packages/be/66/5c9a321b325aaecb92d4d1855421e3a055abd77903b7dab6575ca07796db/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:619c2869db3ada2c0105487ba21b5008defcc472d23f8b80ed91ac4a380283b0", size = 73630478, upload-time = "2025-08-06T14:53:57.144Z" }, + { url = "https://files.pythonhosted.org/packages/de/69/8b7b13bba430f5e21d77708b616f767683629fc4f8037564a177d20f90ed/torch-2.8.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:1a62a1ec4b0498930e2543535cf70b1bef8c777713de7ceb84cd79115f553767", size = 73915128, upload-time = "2025-08-06T14:54:34.769Z" }, + { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, ] [[package]] name = "torch" -version = "2.9.0+cu129" +version = "2.8.0+cu129" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", @@ -7022,42 +7157,44 @@ dependencies = [ { name = "fsspec", marker = "sys_platform != 'darwin'" }, { name = "jinja2", marker = "sys_platform != 'darwin'" }, { name = "networkx", marker = "sys_platform != 'darwin'" }, - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = 
"sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "sys_platform != 'darwin'" }, { name = "sympy", marker = "sys_platform != 'darwin'" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "typing-extensions", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp312-cp312-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp313-cp313t-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = 
"https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torch-2.9.0%2Bcu129-cp314-cp314t-win_amd64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:692fe6e513b667f789a543fa9b1baba58e77a46d5c8629764ca0c00a56823e1f" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:02c7258e917f3043c978b53acf6f02b818db0d0d85db0e58ae578af333b9b4e2" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp312-cp312-win_amd64.whl", hash = "sha256:2bc729898e422b9f3da54349eed98f2f0b5dd415434508ee2ab2a13fb021815d" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ad2d64316635e7ab06f6c973a252526d59a92a2045825c102f876914a72304d0" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:563740167be2189b71530b503f0c8a8d7a8267dd49d4de6f9c5f1d23fbe237df" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313-win_amd64.whl", hash = "sha256:2cef066f9759ff4d7868a8c3695aa60d9a878598acb3685bb1ef2fdac29dcd68" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2982bf34249cbb38f1090e71ad7097a214a21023ccdc0413961986ab7d0396e6" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6344260959ebcfa6dae458e1c4365195bcfdf00f4f1f1ad438cbaf50756829ed" }, + { url = "https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp313-cp313t-win_amd64.whl", hash = "sha256:9c0cd89e54ce44ce3208c5cf4163773b9cda0067e4b48cfcac56a4e04af52040" }, +] + +[[package]] +name = "torch-memory-saver" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/6c/21dfda5d31afb71f52cedff52370acbb8290485b3f0fee6816a15a3d08f1/torch_memory_saver-0.0.9.tar.gz", hash = "sha256:3bbf76391fb16870b1b0df279fc281c8a05ef8f8809400b309b0a8240e8ee5ba", size = 14220, upload-time = "2025-10-18T02:10:18.163Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/35/b22df9e730d8444d62445a594421992781c7fad271325d41656d8a32d103/torch_memory_saver-0.0.9-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:0cf26332993649f8ea1b95d7307dfba3a95ee6cee53de84a3e561fb21752b584", size = 488722, upload-time = "2025-10-18T02:10:16.825Z" }, ] [[package]] @@ -7071,33 +7208,25 @@ wheels = [ [[package]] name = "torchaudio" -version = "2.9.0" +version = "2.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = 
"https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/63/3c0ede3aa3d19a8a6698ddd107fa88660549360b51bf8ce2717cd498d800/torchaudio-2.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab4cbcccfd873b0fb41fcb39c9869e59ef84bb95b093f6f58e2d05172a7500d2", size = 809116, upload-time = "2025-10-15T15:52:00.911Z" }, - { url = "https://files.pythonhosted.org/packages/be/d5/25e58745defe9d05893d3cba5c0e1a76aeaac503ac5ec4d9f83c871df71c/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7f93388b6e536c14d6015b6f75277a8b45efc532f61b35adc1ed06c98a86003e", size = 476020, upload-time = "2025-10-15T15:51:59.967Z" }, - { url = "https://files.pythonhosted.org/packages/f0/9c/58b8b49dfba2ae85e41ca86b0c52de45bbbea01987490de219c99c523a58/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:508318a2130b40ad51378f90caf8727a4bd3ac2b296f2b90c900b44e6068a940", size = 2059901, upload-time = "2025-10-15T15:51:54.634Z" }, - { url = "https://files.pythonhosted.org/packages/d7/eb/58b05f75d12f69ccc460893a20c999da082e063082120ed06e05cca3a053/torchaudio-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:82117e3a605f2959dc09b4cd8a11178d6e92727d5f85e5d4f9fe47502f84ee96", size = 665350, upload-time = "2025-10-15T15:52:08.384Z" }, - { url = "https://files.pythonhosted.org/packages/6c/66/974371d4e4042d186931b72365817d9d3a509f2bc570888a48612448c060/torchaudio-2.9.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5549c25db4c2da306e179e9aa99980e7f5b1826a8d2d7de08125f3943a5620b2", size = 809149, upload-time = "2025-10-15T15:52:16.133Z" }, - { url = "https://files.pythonhosted.org/packages/09/61/8f7b875a2d879666f2f121e458817703e5499988a86105d2a25afecb9987/torchaudio-2.9.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:1eb0d1dac8cefbc4a54afb21aac72a1c25a91f73e9c3bd85f6684930a4a1be5d", size = 475699, upload-time = "2025-10-15T15:52:06.349Z" }, - { url = "https://files.pythonhosted.org/packages/26/db/10ba200f90b76f7b859f46b5ba30cdded69f71bcb0fe3c59bb215532cd2b/torchaudio-2.9.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:266d304dd4ed738a10148b020e3d066e81272ee851f6f92193fe549df96af868", size = 2060349, upload-time = "2025-10-15T15:52:09.329Z" }, - { url = "https://files.pythonhosted.org/packages/be/53/5f9adbea55e48f91532ee4f041283900939ee5cb6bc1395587214e67a629/torchaudio-2.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:7d3926129389d934aa048bd6c6f68fbf3ef26828ebbbbeac99794ea00e90dc1c", size = 665310, upload-time = "2025-10-15T15:52:05.101Z" }, - { url = "https://files.pythonhosted.org/packages/e3/41/88b989aab1e11134d858350196fcf3afd4c2a6821d74efb3c1b9ab23b8cf/torchaudio-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:967d664477fb91dffad82ef64ea3695801c0cc35304baec71be875b569440872", size = 813491, upload-time = "2025-10-15T15:52:10.346Z" }, - { url = "https://files.pythonhosted.org/packages/1a/c1/8d0481fc921cb72d6cadbacd338fa71db0052e8fdb1bf33127c694bbf257/torchaudio-2.9.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:276871d6f5fed5268a87c5da303a13ca2e06b9d29a4c44663b960f0a2e2f46d7", size = 477749, upload-time = "2025-10-15T15:52:04.189Z" }, - { url = "https://files.pythonhosted.org/packages/cf/d3/d085cd76413b9f3f792e61933235d982caf5cdbdf60f0e4fdae71879becc/torchaudio-2.9.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3d5657d929d6ca07b08cfa005988f2ea8caacf9af42f20bc7eff10f88812ce30", size = 2062165, upload-time = "2025-10-15T15:52:12.784Z" }, - { url = 
"https://files.pythonhosted.org/packages/f2/41/d9876f5b19b4b2f98a6131d1a98ee6d5d8f707c01311bbba7cc3bb02f4bf/torchaudio-2.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3fe9cac0c2ee713e07f8c88d09528d55e0fa74987b0122e27911dfb720f39054", size = 669260, upload-time = "2025-10-15T15:52:13.8Z" }, - { url = "https://files.pythonhosted.org/packages/97/ad/db50c49d73d1904152bbaaaa281e03a41ec519dd6a9df48cc69ea5cd48b9/torchaudio-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3fa41447a21103fcde930b4ad2bd2634565a0becff1a5425535b4f0116c0d5df", size = 810532, upload-time = "2025-10-15T15:52:17.197Z" }, - { url = "https://files.pythonhosted.org/packages/a8/00/aa8ed83a169a87af72d6cdc17e0350f418b3cba3bd7397b0cca873274789/torchaudio-2.9.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:69f46f21bd67e90ade33a7d0f0cf98270cd61b98f5f8249d3893be0a16b3e31f", size = 475864, upload-time = "2025-10-15T15:52:11.446Z" }, - { url = "https://files.pythonhosted.org/packages/4b/bb/7ca64ed0556afa08d3a7a47c887ee9b1c4f3eebd193baf47505b6fac479c/torchaudio-2.9.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:631b0f43564a25e27e615b217454c334f52162679f39ae10b9fa7562ed587dfc", size = 2060360, upload-time = "2025-10-15T15:52:14.992Z" }, - { url = "https://files.pythonhosted.org/packages/63/13/4407b79ddedc9ea95d88fa54c3758df21f0117683fceba4bacd98ceaa772/torchaudio-2.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:ed6df9f14431e13498b984dc87df1aabb2156b9ce0ce7268ce4a61650197310a", size = 665048, upload-time = "2025-10-15T15:52:19.116Z" }, - { url = "https://files.pythonhosted.org/packages/7d/1a/d3cd6b67b5c68ff4211be923978d1d7c10ea2f44f826d4cd15b775f52c11/torchaudio-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:93358d8f2f24969ba3f368f4eec33295df830af54836c7fd3336740228f9af16", size = 813499, upload-time = "2025-10-15T15:52:20.412Z" }, - { url = "https://files.pythonhosted.org/packages/ab/65/a35a182519b40dcd2cedaf5fdcac6f724ae2451c534dfcece6ff5f85f983/torchaudio-2.9.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:742143d9d62769bc4b9a2977ca4f4720e0a5e922bdc5df585c155e0a1f545461", size = 477752, upload-time = "2025-10-15T15:52:18.14Z" }, - { url = "https://files.pythonhosted.org/packages/6f/1c/30272b71ae08817eaca00bb856ebef25dd44041329579903c1915b57f0c9/torchaudio-2.9.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0a234634e1142fb2652c49e935a98b4d9656fd0af9e4aa14b1b05a80c3cf8e78", size = 2062173, upload-time = "2025-10-15T15:52:22.724Z" }, - { url = "https://files.pythonhosted.org/packages/b9/d6/d007f6bc55a16a86e64e9bba295b90485011cc6a113d8f56b503b4f34a7d/torchaudio-2.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:cbf5d6da8fd2ed545c78218b39fd6aacaa4dd5e265c5f85b248a2fac223f0bd6", size = 669272, upload-time = "2025-10-15T15:52:21.696Z" }, + { url = "https://files.pythonhosted.org/packages/ac/cc/c2e2a3eb6ee956f73c68541e439916f8146170ea9cc61e72adea5c995312/torchaudio-2.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ddef94bf181e6447cbb05f38beaca8f6c5bb8d2b9ddced1aa3452025b9fc70d3", size = 1856736, upload-time = "2025-08-06T14:58:36.3Z" }, + { url = "https://files.pythonhosted.org/packages/c7/0d/24dad878784f1edd62862f27173781669f0c71eb46368636787d1e364188/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:862e2e40bf09d865e5df080a84c1a39bbcef40e43140f4b1737eb3a389d3b38f", size = 1692930, upload-time = "2025-08-06T14:58:41.312Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/a6/84d80f34472503e9eb82245d7df501c59602d75d7360e717fb9b84f91c5e/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:93a8583f280fe83ba021aa713319381ea71362cc87b67ee38e97a43cb2254aee", size = 4014607, upload-time = "2025-08-06T14:58:47.234Z" }, + { url = "https://files.pythonhosted.org/packages/43/ab/96ad33afa320738a7cfb4b51ba97e2f3cfb1e04ae3115d5057655103ba4f/torchaudio-2.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:4b82cacd1b8ccd543b1149d8cab257a40dfda8119023d2e3a96c66349c84bffb", size = 2499890, upload-time = "2025-08-06T14:58:55.066Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ea/2a68259c4dbb5fe44ebfdcfa40b115010d8c677221a7ef0f5577f3c4f5f1/torchaudio-2.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f851d32e94ca05e470f0c60e25726ec1e0eb71cb2ca5a0206b7fd03272ccc3c8", size = 1857045, upload-time = "2025-08-06T14:58:51.984Z" }, + { url = "https://files.pythonhosted.org/packages/0d/a3/1c79a8ef29fe403b83bdfc033db852bc2a888b80c406325e5c6fb37a7f2d/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:09535a9b727c0793cd07c1ace99f3f353626281bcc3e30c2f2314e3ebc9d3f96", size = 1692755, upload-time = "2025-08-06T14:58:50.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/df/61941198e9ac6bcebfdd57e1836e4f3c23409308e3d8d7458f0198a6a366/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2a85b124494736241884372fe1c6dd8c15e9bc1931bd325838c5c00238c7378", size = 4013897, upload-time = "2025-08-06T14:59:01.66Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ab/7175d35a4bbc4a465a9f1388571842f16eb6dec5069d7ea9c8c2d7b5b401/torchaudio-2.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:c1b5139c840367a7855a062a06688a416619f6fd2ca46d9b9299b49a7d133dfd", size = 2500085, upload-time = "2025-08-06T14:58:44.95Z" }, + { url = "https://files.pythonhosted.org/packages/34/1a/69b9f8349d9d57953d5e7e445075cbf74000173fb5f5d5d9e9d59415fc63/torchaudio-2.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:68df9c9068984edff8065c2b6656725e6114fe89281b0cf122c7505305fc98a4", size = 1935600, upload-time = "2025-08-06T14:58:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/71/76/40fec21b65bccfdc5c8cdb9d511033ab07a7ad4b05f0a5b07f85c68279fc/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1951f10ed092f2dda57634f6a3950ef21c9d9352551aa84a9fccd51bbda18095", size = 1704199, upload-time = "2025-08-06T14:58:43.594Z" }, + { url = "https://files.pythonhosted.org/packages/8e/53/95c3363413c2f2009f805144160b093a385f641224465fbcd717449c71fb/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4f7d97494698d98854129349b12061e8c3398d33bd84c929fa9aed5fd1389f73", size = 4020596, upload-time = "2025-08-06T14:59:03.031Z" }, + { url = "https://files.pythonhosted.org/packages/52/27/7fc2d7435af044ffbe0b9b8e98d99eac096d43f128a5cde23c04825d5dcf/torchaudio-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d4a715d09ac28c920d031ee1e60ecbc91e8a5079ad8c61c0277e658436c821a6", size = 2549553, upload-time = "2025-08-06T14:59:00.019Z" }, ] [[package]] @@ -7115,8 +7244,8 @@ version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = 
"torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "urllib3" }, ] wheels = [ @@ -7128,12 +7257,10 @@ name = "torchprofile" version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "numpy", marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/36/574c0c46e818533b78b3c09505211162918188325ab4165ef11a3f295755/torchprofile-0.0.4.tar.gz", hash = "sha256:96b6da17d752a06b02977e078aea95614893b31d4117dd5dcd081f30ce65611b", size = 4557, upload-time = "2021-06-22T04:58:03.592Z" } wheels = [ @@ -7142,7 +7269,7 @@ wheels = [ [[package]] name = "torchvision" -version = "0.24.0" +version = "0.23.0" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", @@ -7151,19 +7278,17 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = 
"https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:630f602db2c594c9cbc89b964d5fb4873adf4193805df65339b24cd3f4cf57f7" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:20f7e25a24f91d93d09398b80929dec805c4ee2f5527fad8eecd6e43dc5fd5d0" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cb70cc000e6a398270044c3406a89ee8ab6157a4e81b5d40c5904e1d0e22e2f8" }, ] [[package]] name = "torchvision" -version = "0.24.0" +version = "0.23.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -7172,19 +7297,17 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "sys_platform == 'darwin'" }, { name = "pillow", marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" }, - { url = "https://files.pythonhosted.org/packages/4f/b5/b2008e4b77a8d6aada828dd0f6a438d8f94befa23fdd2d62fa0ac6e60113/torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84d79cfc6457310107ce4d712de7a3d388b24484bc9aeded4a76d8f8e3a2813d", size = 1891722, upload-time = "2025-10-15T15:51:28.854Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/3dd10830b047eeb46ae6b465474258d7b4fbb7d8872dca69bd42449f5c82/torchvision-0.24.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ab956a6e588623353e0f20d4b03eb1656cb4a3c75ca4dd8b4e32e01bc43271a", size = 2028355, upload-time = "2025-10-15T15:51:22.384Z" }, - { url = "https://files.pythonhosted.org/packages/1b/24/790a39645cc8c71bf442d54a76da9bda5caeb2a44c5f7e02498649cd99d4/torchvision-0.24.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4bdfc85a5ed706421555f32cdc5e3ddb6d40bf65ef03a274ce3c176393e2904b", size = 2028335, upload-time = "2025-10-15T15:51:26.252Z" }, - { url = "https://files.pythonhosted.org/packages/08/f7/261d1353c611820541ecd43046b89da3f1ae998dc786e4288b890a009883/torchvision-0.24.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:68120e7e03c31900e499a10bb7fdd63cfd67f0054c9fa108e7e27f9cd372f315", size = 2028359, upload-time = "2025-10-15T15:51:32.119Z" }, + { url = "https://files.pythonhosted.org/packages/df/1d/0ea0b34bde92a86d42620f29baa6dcbb5c2fc85990316df5cb8f7abb8ea2/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440", size = 1856885, upload-time = "2025-08-06T14:58:06.503Z" }, + { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, + { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, ] [[package]] name = "torchvision" -version = "0.24.0+cu129" +version = "0.23.0+cu129" source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", @@ -7199,14 +7322,15 @@ resolution-markers = [ dependencies = [ { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "pillow", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6226be1b8399ef655a11965ea4975250f7823fc9b200b35deb9eeac350c667a9" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp312-cp312-win_amd64.whl", hash = "sha256:57cf57ada9a5407755e170a4ab3842337b83862c93f9483decaf0b6b4d69fa09" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:04316e24ddd1cee3b301208811a9d7c4cfca5f566ea367f33bda059d8f0e012e" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313-win_amd64.whl", hash = "sha256:a486a0cee466807a17749d0b916d52088343453dc911baa20f0f459b2fa43c9a" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c718f6d2c0e61feed39763925eea3e1f42979f6b21e61276f487409168d9e352" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp313-cp313t-win_amd64.whl", hash = "sha256:8218c1f614972abb4710afde96d0f70b174b235f390e165e6fd4cdd5cee6d93d" }, ] [[package]] @@ -7221,6 +7345,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash 
= "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + [[package]] name = "transformer-engine" version = "2.8.0" @@ -7259,8 +7392,8 @@ dependencies = [ { name = "einops" }, { name = "onnx" }, { name = "onnxscript" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/38/63/1e3953244ed4f318f87889309a56cdd664759f007967eb850ee415a5584d/transformer_engine_torch-2.8.0.tar.gz", hash = "sha256:ce09f1bd9b8e532a5c347b9e9b3a3a771722095daddca673ae82ccce8e68d759", size = 209805, upload-time = "2025-10-07T04:54:11.134Z" } @@ -7285,6 +7418,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, ] +[[package]] +name = "triton" +version = "3.4.0" +source = { registry = "https://download.pytorch.org/whl/cu129" } +resolution-markers = [ + "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "setuptools", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, + { url = 
"https://download.pytorch.org/whl/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, +] + [[package]] name = "triton" version = "3.4.0" @@ -7301,29 +7456,6 @@ dependencies = [ { name = "setuptools", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] -[[package]] -name = "triton" -version = "3.5.0" -source = { registry = "https://download.pytorch.org/whl/cu129" } -resolution-markers = [ - "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/triton-3.5.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl" }, -] - [[package]] name = "trove-classifiers" version = "2025.8.6.13" @@ -7477,11 +7609,10 @@ wheels = [ [[package]] name = "vllm" -version = "0.11.2" +version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, - { name = "anthropic" }, { name = "blake3" }, { name = "cachetools" }, { name = "cbor2" }, @@ -7492,13 +7623,11 @@ dependencies = [ { name = "einops" }, { name = "fastapi", extra = ["standard"] }, { name = "filelock" }, - { name = "flashinfer-python" }, { name = "gguf" }, { name = "lark" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, { name = "lm-format-enforcer" }, - { name = "mistral-common", extra = ["image"] }, - { name = "model-hosting-container-standards" }, + { name = "mistral-common", extra = ["audio", "image"] }, { name = "msgspec" }, { name = "ninja" }, { name = "numba" }, @@ -7529,23 +7658,23 @@ dependencies = [ { name = "six" }, { name = "tiktoken" }, { name = "tokenizers" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name 
= "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "torchaudio" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "tqdm" }, { name = "transformers" }, { name = "typing-extensions" }, { name = "watchfiles" }, { name = "xformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/40/15/bc50794c5c6a48f075d72fde8035647d38072ad81031168d27ca631f9395/vllm-0.11.2.tar.gz", hash = "sha256:496d15bb64ca0fe73adbc57a93b29f4671fa12404c09e0ba02f777bfe60af671", size = 17287801, upload-time = "2025-11-20T08:31:35.084Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/5a/36d2351206f4d8d871b10780f874d03957985e08298d430cc837723e07af/vllm-0.11.0.tar.gz", hash = "sha256:f435a64c24e9c4178d657a76f8edd8548ddc444012f7d06a9f79ac3a6392bfae", size = 10822208, upload-time = "2025-10-04T01:39:57.798Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/75/5d/d6af7818e41957a5d35f1b0ecd0186ac80e322f228dc390dcbc4aafce58d/vllm-0.11.2-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:ea473bd4fde06940fe3f681a00476060652f62b3279ef11aaffac5768856cfe8", size = 370306629, upload-time = "2025-11-20T08:30:43.713Z" }, - { url = "https://files.pythonhosted.org/packages/24/7c/f27896162b88c360d569fd632cf0525d5ce89cba8e555532d80dc3ee0a12/vllm-0.11.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:a084f5ca768d22bf55810948cbb50825a35015e07593ab6c9c42fcbe18bdd5cc", size = 368543904, upload-time = "2025-11-20T08:31:15.933Z" }, + { url = "https://files.pythonhosted.org/packages/47/33/d19e0763c34392ec956534536fa837c060495bfff31ed83452135ea7608d/vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:3861c75ff2b12e24f6d179ff5c084d791b42ded8675d76c8706697c79f68cd62", size = 438217982, upload-time = "2025-10-04T01:39:32.382Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/bf/973444bb959fc7acbbeb3d226bd4d135dcd49b6af174b29aab1b50e2d710/vllm-0.11.0-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:52369c9ee949944354bdc7afc88ded2d1ed02b098bf90db06cf80098a19787b7", size = 401003969, upload-time = "2025-10-04T01:39:50.251Z" }, ] [[package]] @@ -7737,24 +7866,6 @@ version = "3.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857, upload-time = "2015-10-22T15:26:37.51Z" } -[[package]] -name = "wheel" -version = "0.45.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8a/98/2d9906746cdc6a6ef809ae6338005b3f21bb568bea3165cfc6a243fdc25c/wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729", size = 107545, upload-time = "2024-11-23T00:18:23.513Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" }, -] - -[[package]] -name = "win32-setctime" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, -] - [[package]] name = "wrapt" version = "1.17.3" @@ -7845,15 +7956,15 @@ wheels = [ [[package]] name = "xformers" -version = "0.0.33.post1" +version = "0.0.32.post1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6f/c1/cd0d6b89da38d8aa174e8eabf29530f8871daf53b886ec6b680ef9d3e71f/xformers-0.0.33.post1.tar.gz", hash = "sha256:e555258249b514ba117b3403523fe0bd7d3e92e930575f0e0dbf5f7db5b42677", size = 14784437, upload-time = "2025-11-13T20:16:14.793Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/33/3b9c4d3d5b2da453d27de891df4ad653ac5795324961aa3a5c15b0353fe6/xformers-0.0.32.post1.tar.gz", hash = "sha256:1de84a45c497c8d92326986508d81f4b0a8c6be4d3d62a29b8ad6048a6ab51e1", size = 12106196, upload-time = "2025-08-14T18:07:45.486Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/39/94/3ad80d1070ddfb280c20a67dfbc094a93579a02910ef41f20631a9b566fe/xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a8d72c6272453450eede2ed9aaa14448e6525569e14217573057ded146090db3", size = 122884756, upload-time = "2025-11-13T20:16:04.002Z" }, + { url = "https://files.pythonhosted.org/packages/6b/df/6817346f1a77278315d5fe1fc9f239ba3282ba36e8ab3256babd448dde62/xformers-0.0.32.post1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5f245b5555188da112070d8fefb6b7ae1ae47422856521d66c837e9d2352fbe4", size = 117199943, upload-time = "2025-08-14T18:07:34.78Z" }, ] [[package]] @@ -7865,10 +7976,10 @@ dependencies = [ { name = "ninja" }, { name = "numpy" }, { name = "pydantic" }, - { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, { name = "transformers" }, - { name = "triton", version = "3.5.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f2/a9/dc3c63cf7f082d183711e46ef34d10d8a135c2319dc581905d79449f52ea/xgrammar-0.1.25.tar.gz", hash = "sha256:70ce16b27e8082f20808ed759b0733304316facc421656f0f30cfce514b5b77a", size = 2297187, upload-time = "2025-09-21T05:58:58.942Z" }