inclusionAI
diff --git a/areal/README.md b/areal/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/areal/api/reward_api.py b/areal/api/reward_api.py
Lines changed: 30 additions & 0 deletions
diff --git a/areal/core/async_task_runner.py b/areal/core/async_task_runner.py
Lines changed: 37 additions & 42 deletions
@@ -368,17 +368,17 @@ class PPOActor:
     def __init__(self, config: PPOActorConfig, engine: TrainEngine):
         self.config = config
         self.engine = engine
+        self.temperature = config.temperature
 
     @torch.no_grad()
     def compute_logp(
         self,
         data: dict[str, Any],
-        temperature: float | None = None,
     ) -> torch.Tensor | None:
 
         def calc_logprobs(logits, input_data):
             labels = torch.roll(input_data["input_ids"], shifts=-1, dims=-1)
-            logprobs = gather_logprobs(logits, labels, temperature or 1.0)
+            logprobs = gather_logprobs(logits, labels, self.temperature)
             return logprobs
 
         self.engine.eval()
 
@@ -1,4 +1,5 @@
 import asyncio
+import os
 import threading
 import traceback
 import weakref
@@ -12,6 +13,29 @@
 logger = logging.getLogger("Reward API")
 
 
+def _get_device_count_safely() -> int:
+    """
+    Safely get device count without initializing CUDA context.
+    """
+    gpu_types = ["nvidia", "davinci"]
+    try:
+        if os.path.exists("/dev"):
+            for gpu_type in gpu_types:
+                devices = [
+                    f
+                    for f in os.listdir("/dev")
+                    if f.startswith(gpu_type) and f[len(gpu_type) :].isdigit()
+                ]
+                if devices:
+                    return len(devices)
+    except (OSError, ValueError) as e:
+        # Listing /dev failed (e.g., permission denied); a missing /dev is already skipped by the exists() check
+        logger.debug(f"Could not read device list from /dev, using fallback: {e}")
+
+    # Fallback: assume 8 devices for cautious max_workers calculation
+    return 8
+
+
 def reward_fn(
     prompt: str,
     completions: str,
@@ -54,6 +78,12 @@ def __init__(
     ):
         self.reward_fn = reward_fn
         self.timeout_seconds = timeout_seconds
+        if max_workers is None:
+            cpu_count = os.cpu_count() or 1
+            device_count = _get_device_count_safely()
+            # Heuristic for max_workers: distribute CPU cores across devices,
+            # then halve to be conservative, ensuring at least one worker.
+            max_workers = max((cpu_count // device_count) // 2, 1)
         self.max_workers = max_workers
         self.max_retries = max_retries
         self._executor_key = max_workers
 
@@ -10,7 +10,6 @@
 
 import asyncio
 import queue
-import random
 import threading
 import time
 from collections.abc import Awaitable, Callable
@@ -24,18 +23,26 @@
 
 # Polling configuration
 DEFAULT_POLL_WAIT_TIME = 0.05  # 50ms
-DEFAULT_POLL_SLEEP_TIME = 0.5  # 1 second
+DEFAULT_POLL_SLEEP_TIME = 0.5  # 500ms
 
 
 class TaskQueueFullError(RuntimeError):
     """Raised when an AsyncTaskRunner queue is full."""
 
 
 @dataclass
-class _TimedResult(Generic[T]):
-    """Internal wrapper for results with creation timestamp."""
+class TimedResult(Generic[T]):
+    """Wrapper for task results with creation timestamp.
 
-    create_time: int  # nanoseconds from time.monotonic_ns()
+    Attributes
+    ----------
+    create_time : int
+        Task creation time in nanoseconds from time.monotonic_ns().
+    data : T
+        The actual result data from the completed task.
+    """
+
+    create_time: int
     data: T
 
 
@@ -72,13 +79,13 @@ class AsyncTaskRunner(Generic[T]):
     ----------
     max_queue_size : int
         Maximum size for input and output queues. Tasks submitted when
-        the input queue is full will raise RuntimeError.
+        the input queue is full will raise TaskQueueFullError.
     poll_wait_time : float, optional
         Time in seconds to wait for task completion during each poll
         cycle. Default is 0.05 (50ms).
     poll_sleep_time : float, optional
         Time in seconds to sleep between poll cycles.
-        Default is 1.0 second.
+        Default is 0.5 seconds.
     enable_tracing : bool, optional
         Enable detailed logging of task submission and completion.
         Default is False.
@@ -163,7 +170,7 @@ def __init__(
             Default is 0.05.
         poll_sleep_time : float, optional
             Time in seconds to sleep between poll cycles.
-            Default is 1.0.
+            Default is 0.5.
         enable_tracing : bool, optional
             Enable detailed logging. Default is False.
         """
@@ -180,13 +187,10 @@ def __init__(
         self.input_queue: queue.Queue[_TaskInput[T]] = queue.Queue(
             maxsize=max_queue_size
         )
-        self.output_queue: queue.Queue[_TimedResult[T]] = queue.Queue(
+        self.output_queue: queue.Queue[TimedResult[T]] = queue.Queue(
             maxsize=max_queue_size
         )
 
-        # Cache for results to support wait() with arbitrary counts
-        self.result_cache: list[_TimedResult[T]] = []
-
         # Thread exception handling
         self._thread_exception_lock = threading.Lock()
         self._thread_exception: Exception | None = None
@@ -335,7 +339,7 @@ async def _run_async_loop(self):
                     try:
                         # Place result in output queue
                         self.output_queue.put_nowait(
-                            _TimedResult(create_time=task_obj.create_time, data=result)
+                            TimedResult(create_time=task_obj.create_time, data=result)
                         )
                         if self.enable_tracing and self.logger:
                             self.logger.info(
@@ -355,6 +359,7 @@ async def _run_async_loop(self):
                         raise TaskQueueFullError(
                             "Output queue full. Please increase max_queue_size."
                         )
+                # Sleep to avoid busy-waiting
                 await asyncio.sleep(self.poll_sleep_time)
         finally:
             # Cancel all remaining tasks on shutdown
@@ -390,9 +395,10 @@ def submit(
 
         Raises
         ------
+        TaskQueueFullError
+            If the input queue is full.
         RuntimeError
-            If the input queue is full or if the background thread
-            has died.
+            If the background thread has died.
 
         Examples
         --------
@@ -417,12 +423,13 @@ def submit(
                 "wait for tasks to complete."
             )
 
-    def wait(self, count: int, timeout: float | None = None) -> list[T]:
+    def wait(
+        self, count: int, timeout: float | None = None, with_timing: bool = False
+    ) -> list[TimedResult[T]] | list[T]:
         """Wait for a specified number of task results.
 
         This method blocks until at least `count` results are available
-        or the timeout expires. Results are returned in random order
-        (shuffled).
+        or the timeout expires.
 
         Parameters
         ----------
@@ -431,11 +438,15 @@ def wait(self, count: int, timeout: float | None = None) -> list[T]:
         timeout : float | None, optional
             Maximum time in seconds to wait. If None, waits indefinitely
             (up to 7 days). Default is None.
+        with_timing : bool, optional
+            If True, return TimedResult objects with creation timestamps.
+            If False, return only the data values. Default is False.
 
         Returns
         -------
-        List[T]
-            List of task results, shuffled randomly.
+        list[TimedResult[T]] | list[T]
+            If with_timing=True, returns list of TimedResult objects.
+            If with_timing=False, returns list of result data.
 
         Raises
         ------
@@ -460,16 +471,7 @@ def wait(self, count: int, timeout: float | None = None) -> list[T]:
             # Check thread health
             self._check_thread_health()
 
-            # Drain all available results from output queue
-            while True:
-                try:
-                    timed_result = self.output_queue.get_nowait()
-                    self.result_cache.append(timed_result)
-                except queue.Empty:
-                    break
-
-            # Check if we have enough results
-            if len(self.result_cache) >= count:
+            if self.get_output_queue_size() >= count:
                 break
 
             # Sleep briefly to avoid busy waiting
@@ -480,23 +482,16 @@ def wait(self, count: int, timeout: float | None = None) -> list[T]:
             self._check_thread_health()
             raise RuntimeError("AsyncTaskRunner is exiting, cannot wait for results.")
 
-        accepted = len(self.result_cache)
+        accepted = self.get_output_queue_size()
         if accepted < count:
             raise TimeoutError(
                 f"Timed out waiting for {count} results, only received {accepted}."
             )
 
-        # Sort by creation time for deterministic ordering
-        self.result_cache.sort(key=lambda x: x.create_time)
-
-        # Extract the requested number of results
-        results_to_return = self.result_cache[:count]
-        self.result_cache = self.result_cache[count:]
-
-        # Shuffle for randomness (helps with data diversity in ML)
-        random.shuffle(results_to_return)
-
-        # Extract just the data (remove timing metadata)
+        # Extract the requested number of results in FIFO order (the order they were placed on the output queue)
+        results_to_return = [self.output_queue.get() for _ in range(count)]
+        if with_timing:
+            return results_to_return
         return [r.data for r in results_to_return]
 
     def submit_batch(