Commit a99bc26

refactor: Introduce BasePolicyWorker (#1585)
Signed-off-by: ashors1 <ashors@nvidia.com>
1 parent 5e73bfd commit a99bc26

File tree

11 files changed: +193 −345 lines


.github/workflows/_automodel_integration_check.yml

Lines changed: 2 additions & 2 deletions
@@ -134,8 +134,8 @@ jobs:
  echo "Checking if dtensor policy worker files are synchronized..."

  # Define the dtensor policy worker file paths
- DTENSOR_POLICY_WORKER_FILE="nemo_rl/models/policy/dtensor_policy_worker.py"
- DTENSOR_POLICY_WORKER_V2_FILE="nemo_rl/models/policy/dtensor_policy_worker_v2.py"
+ DTENSOR_POLICY_WORKER_FILE="nemo_rl/models/policy/workers/dtensor_policy_worker.py"
+ DTENSOR_POLICY_WORKER_V2_FILE="nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py"

  # Check if dtensor_policy_worker.py was modified in this PR
  if git diff --name-only origin/${{ inputs.base_ref }}..HEAD | grep -q "^${DTENSOR_POLICY_WORKER_FILE}$"; then

docs/fp8.md

Lines changed: 5 additions & 5 deletions
@@ -53,7 +53,7 @@ FP8 generations are recommended to be configured with the following settings:
       use_activation_pow2_scale: False
 ```

-"To train with FP8, you need to set the Megatron path and configure it using the following settings:
+To train with FP8, you need to set the Megatron path and configure it using the following settings:

 ```
 policy:
@@ -68,12 +68,12 @@ FP8 generations are recommended to be configured with the following settings:

 The TransformerEngine implementation for this recipe requires **cuda version ≥ 12.9**. The latest nemo-rl depends on torch 2.8.0 + cuda 12.9 (since this [commit](https://github.com/NVIDIA-NeMo/RL/commit/3f36d14b53e906b27c01c06e36dbbd2b8eb300cd)). Users should check-out code to latest and build container from `docker/Dockerfile` ([instructions](docker.md)).

-If you are using nemo-rl before this [commit](https://github.com/NVIDIA-NeMo/RL/commit/3f36d14b53e906b27c01c06e36dbbd2b8eb300cd), you will see the following error when trying to use fp8 training
+If you are using nemo-rl before this [commit](https://github.com/NVIDIA-NeMo/RL/commit/3f36d14b53e906b27c01c06e36dbbd2b8eb300cd), you will see the following error when trying to use fp8 training:

 ```
-  File "/opt/ray_venvs/nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker/lib/python3.12/site-packages/transformer_engine/pytorch/fp8.py", line 646, in fp8_autocast
+  File "/opt/ray_venvs/nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker/lib/python3.12/site-packages/transformer_engine/pytorch/fp8.py", line 646, in fp8_autocast
     FP8GlobalStateManager.fp8_autocast_enter(
-  File "/opt/ray_venvs/nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker/lib/python3.12/site-packages/transformer_engine/pytorch/fp8.py", line 465, in fp8_autocast_enter
+  File "/opt/ray_venvs/nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker/lib/python3.12/site-packages/transformer_engine/pytorch/fp8.py", line 465, in fp8_autocast_enter
     assert fp8_block_available, reason_for_no_fp8_block
            ^^^^^^^^^^^^^^^^^^^
 AssertionError: FP8 block scaled GEMM requires Hopper and CUDA >= 12.9.
@@ -88,5 +88,5 @@ The above results are from Llama-3.1-8B-Instruct GRPO experiments. You can run t
 * For BF16: `examples/configs/grpo_math_8B_megatron.yaml`
 * For FP8: `examples/configs/grpo_math_8B_megatron_fp8.yaml`

-In the experiment in this figure, enabling FP8 rollout and training gives 15%-25% decrease in step time, and the validation accuracy curves match up to 1000 step.
+In the experiment in this figure, enabling FP8 rollout and training gives 15%-25% decrease in step time, and the validation accuracy curves match up to 1000 steps.
 Efforts are ongoing to performs longer runs and further optimize performance.

nemo_rl/distributed/ray_actor_environment_registry.py

Lines changed: 3 additions & 3 deletions
@@ -29,9 +29,9 @@
     "nemo_rl.models.generation.vllm.vllm_worker_async.VllmAsyncGenerationWorker": VLLM_EXECUTABLE,
     # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM.
     # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved.
-    "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE,
-    "nemo_rl.models.policy.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": PY_EXECUTABLES.AUTOMODEL,
-    "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE,
+    "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker": VLLM_EXECUTABLE,
+    "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2": PY_EXECUTABLES.AUTOMODEL,
+    "nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE,
     "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM,
     "nemo_rl.environments.vlm_environment.VLMEnvironment": PY_EXECUTABLES.SYSTEM,
     "nemo_rl.environments.code_environment.CodeEnvironment": PY_EXECUTABLES.SYSTEM,

nemo_rl/models/policy/interfaces.py

Lines changed: 8 additions & 8 deletions
@@ -67,7 +67,9 @@ def get_logprobs(

     @abstractmethod
     def get_reference_policy_logprobs(
-        self, data: BatchedDataDict[GenerationDatumSpec]
+        self,
+        data: BatchedDataDict[GenerationDatumSpec],
+        micro_batch_size: Optional[int] = None,
     ) -> BatchedDataDict[ReferenceLogprobOutputSpec]:
         """Get logprobs of actions from observations.

@@ -100,6 +102,7 @@ def train(
         data: BatchedDataDict,
         loss_fn: LossFunction,
         eval_mode: bool = False,
+        *,
         gbs: Optional[int] = None,
         mbs: Optional[int] = None,
     ) -> dict[str, Any]:
@@ -114,13 +117,6 @@ def train(
         """
         pass

-    @abstractmethod
-    def score(
-        self, data: BatchedDataDict[GenerationDatumSpec]
-    ) -> BatchedDataDict[ScoreOutputSpec]:
-        """Score a batch of data using the policy."""
-        pass
-
     @abstractmethod
     def calibrate_qkv_fp8_scales(
         self,
@@ -191,3 +187,7 @@ def broadcast_weights_for_collective(
         self, kv_scales: Optional[dict[str, float]] = None
     ) -> list[ray.ObjectRef]:
         pass
+
+    @abstractmethod
+    def prepare_for_lp_inference(self) -> None:
+        pass
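
For orientation, here is a minimal caller-side sketch of the updated interface. It is illustrative only: the concrete policy object, the batch contents, and the `nll_loss` function are assumptions, not part of this commit. It shows the two visible signature changes: `get_reference_policy_logprobs` gains an optional `micro_batch_size` override, and `gbs`/`mbs` become keyword-only arguments of `train`.

# Hypothetical usage sketch for the updated PolicyInterface signatures.
# `policy`, `batch`, and `nll_loss` are placeholders supplied by the caller.
from typing import Any

from nemo_rl.distributed.batched_data_dict import BatchedDataDict


def run_step(policy, batch: BatchedDataDict[Any], nll_loss) -> dict[str, Any]:
    # New optional micro_batch_size override for reference-policy logprobs.
    ref = policy.get_reference_policy_logprobs(batch, micro_batch_size=8)
    batch["reference_logprobs"] = ref["reference_logprobs"]

    # gbs/mbs must now be passed by keyword.
    return policy.train(batch, nll_loss, eval_mode=False, gbs=512, mbs=4)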

nemo_rl/models/policy/lm_policy.py

Lines changed: 7 additions & 11 deletions
@@ -87,9 +87,7 @@ def __init__(
                 "DTensor (policy.dtensor_cfg.enabled=true), not both."
             )
         if megatron_enable:
-            worker_builder_cls = (
-                "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker"
-            )
+            worker_builder_cls = "nemo_rl.models.policy.workers.megatron_policy_worker.MegatronPolicyWorker"
             tp_size = config["megatron_cfg"]["tensor_model_parallel_size"]
             pp_size = config["megatron_cfg"]["pipeline_model_parallel_size"]
             cp_size = config["megatron_cfg"]["context_parallel_size"]
@@ -112,11 +110,9 @@ def __init__(
             # Check if _v2 is enabled in dtensor_cfg (defaults to False for backward compatibility)
             use_v2 = config.get("dtensor_cfg", {}).get("_v2", False)
             if use_v2:
-                worker_builder_cls = "nemo_rl.models.policy.dtensor_policy_worker_v2.DTensorPolicyWorkerV2"
+                worker_builder_cls = "nemo_rl.models.policy.workers.dtensor_policy_worker_v2.DTensorPolicyWorkerV2"
             else:
-                worker_builder_cls = (
-                    "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker"
-                )
+                worker_builder_cls = "nemo_rl.models.policy.workers.dtensor_policy_worker.DTensorPolicyWorker"

             tp_size = config["dtensor_cfg"]["tensor_parallel_size"]
             cp_size = config["dtensor_cfg"]["context_parallel_size"]
@@ -666,10 +662,6 @@ def invalidate_kv_cache(self, *args: Any, **kwargs: Any) -> bool:
         # We don't need to do anything here
         return True

-    def finish_training(self, *args: Any, **kwargs: Any) -> None:
-        # Placeholder implementation
-        pass
-
     def prepare_refit_info(self) -> Optional[dict[str, Any]]:
         """Prepare the info for refit.

@@ -681,6 +673,10 @@ def prepare_refit_info(self) -> Optional[dict[str, Any]]:
         # Only get the first worker's info since all workers will have the same result
         return results[0]

+    def finish_training(self, *args: Any, **kwargs: Any) -> None:
+        # Placeholder implementation
+        pass
+
     def calibrate_qkv_fp8_scales(
         self,
         data: BatchedDataDict[GenerationDatumSpec],

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Optional

import ray
import torch
import zmq

from nemo_rl.distributed.batched_data_dict import BatchedDataDict
from nemo_rl.models.policy.interfaces import ReferenceLogprobOutputSpec
from nemo_rl.utils.nsys import wrap_with_nvtx_name


class AbstractPolicyWorker:
    """Base class for policy workers with shared functionality."""

    def init_collective(
        self, ip: str, port: int, world_size: int, *, train_world_size: int
    ) -> None:
        """Initialize the collective communication.

        Args:
            ip: IP address for the process group
            port: Port for the process group
            world_size: Total world size (train_world_size + inference_world_size)
            train_world_size: Number of training workers (used in inference cluster)
        """
        from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
        from vllm.distributed.utils import StatelessProcessGroup

        pg = StatelessProcessGroup.create(
            host=ip, port=port, rank=self.rank, world_size=world_size
        )
        device = torch.cuda.current_device()
        self.model_update_group = PyNcclCommunicator(pg, device=device)

    def is_alive(self) -> bool:
        """Check if the worker is alive."""
        return True

    def reset_peak_memory_stats(self) -> None:
        """Reset peak memory statistics."""
        torch.cuda.reset_peak_memory_stats()

    def get_gpu_info(self) -> dict[str, Any]:
        """Return information about the GPU being used by this worker."""
        from nemo_rl.models.policy.utils import get_gpu_info

        return get_gpu_info(self.model)

    def report_device_id(self) -> str:
        """Report the UUID of the current CUDA device using NVML.

        Returns:
            str: UUID of the device in the format "GPU-xxxxx"
        """
        from nemo_rl.utils.nvml import get_device_uuid

        # Get current device index from torch
        device_idx = torch.cuda.current_device()
        # Get device UUID using NVML
        return get_device_uuid(device_idx)

    def get_zmq_address(self) -> str:
        """Get the ZMQ address for the current device."""
        return f"ipc:///tmp/{self.report_device_id()}.sock"

    def maybe_init_zmq(self) -> None:
        """Initialize the ZMQ socket if it doesn't exist."""
        if not hasattr(self, "zmq_socket"):
            self.zmq_context = zmq.Context()
            self.zmq_socket = self.zmq_context.socket(zmq.REQ)
            self.zmq_socket.setsockopt(
                zmq.SNDTIMEO, 120000
            )  # set timeout to 120 seconds
            self.zmq_socket.setsockopt(
                zmq.RCVTIMEO, 120000
            )  # set timeout to 120 seconds
            self.zmq_socket.setsockopt(zmq.LINGER, 0)
            self.zmq_socket.bind(self.get_zmq_address())

    def get_free_memory_bytes(self) -> int:
        """Get the available free memory."""
        from nemo_rl.utils.nvml import get_free_memory_bytes

        device_idx = torch.cuda.current_device()
        return get_free_memory_bytes(device_idx)

    def shutdown(self) -> bool:
        """Shutdown the policy."""
        try:
            # Clean up extension resources like ZMQ sockets
            if hasattr(self, "zmq_socket"):
                self.zmq_socket.close()
                self.zmq_context.term()
            return True
        except Exception:
            return False

    def start_gpu_profiling(self) -> None:
        """Start GPU profiling."""
        torch.cuda.profiler.start()

    def stop_gpu_profiling(self) -> None:
        """Stop GPU profiling."""
        torch.cuda.profiler.stop()

    def report_node_ip_and_gpu_id(self) -> tuple[str, int]:
        """Report the node IP and GPU ID of the current worker."""
        ip = ray._private.services.get_node_ip_address()
        gpu_id = ray.get_gpu_ids()[0]
        return (ip, gpu_id)

    # Temporary fix, 'data' is a kwarg due to some sort of ray bug
    @wrap_with_nvtx_name("policy_worker/get_reference_policy_logprobs")
    def get_reference_policy_logprobs(
        self,
        *,
        data: BatchedDataDict[Any],
        micro_batch_size: Optional[int] = None,
    ) -> BatchedDataDict[ReferenceLogprobOutputSpec]:
        """Get the logprobs from the reference policy for a batch of data.

        If micro_batch_size is provided, it will be used instead of the configured
        logprob_batch_size.

        Returns:
            a BatchedDataDict with key "reference_logprobs" and shape [batch_size, sequence_length].
            We use the convention that the logprob of the first token is 0 so that the sequence length is maintained.
            The logprob of input token i is specified at position i in the output logprobs tensor.
        """
        with self.use_reference_model():
            reference_logprobs = self.get_logprobs(
                data=data, micro_batch_size=micro_batch_size
            )

        return_data = BatchedDataDict[ReferenceLogprobOutputSpec]()
        return_data["reference_logprobs"] = reference_logprobs["logprobs"].cpu()
        return return_data

    def finish_training(self, *args: Any, **kwargs: Any) -> None:
        # Placeholder implementation
        pass
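
As a rough illustration of how this new base class is meant to be used, the sketch below shows a concrete worker inheriting the shared plumbing (device/UUID reporting, ZMQ setup, profiling hooks, the reference-logprob path) while supplying only backend-specific pieces. The subclass name, its constructor, the stubbed get_logprobs/use_reference_model, and the import path of AbstractPolicyWorker are assumptions for illustration, not code from this commit.

# Illustrative sketch only: a concrete worker built on AbstractPolicyWorker.
# The import path below is assumed; adjust it to wherever the base class lives.
from contextlib import contextmanager
from typing import Any, Optional

import torch

from nemo_rl.distributed.batched_data_dict import BatchedDataDict
from nemo_rl.models.policy.workers.base_policy_worker import (  # assumed module path
    AbstractPolicyWorker,
)


class ToyPolicyWorker(AbstractPolicyWorker):
    """Hypothetical subclass: only backend-specific pieces live here."""

    def __init__(self, rank: int, model: torch.nn.Module):
        # Attributes the inherited methods rely on, e.g. init_collective uses
        # self.rank and get_gpu_info uses self.model.
        self.rank = rank
        self.model = model

    @contextmanager
    def use_reference_model(self):
        # Swap in frozen reference weights here; the inherited
        # get_reference_policy_logprobs wraps get_logprobs in this context.
        yield

    def get_logprobs(
        self, *, data: BatchedDataDict[Any], micro_batch_size: Optional[int] = None
    ) -> BatchedDataDict[Any]:
        # Backend-specific forward pass goes here.
        raise NotImplementedError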
