Skip to content

Commit 2522f23

Browse files
authored
Remove temperature bottleneck (#1276)
Previously, we were applying temperature to the entire `result_logits` array from a decode invocation. Our `mistral` logits are of shape `[1, 1, 128256]`. In a local benchmark, the `sfnp.divide` call on an array of this size took ~14ms, so `(14ms * 16 parallel_reqs * 64 decode_steps) / 1000 == ~14s` of overhead. With this change, local benchmarks show the latency of 16 concurrent requests dropping from `~25s` to `~9s`, which also increased throughput considerably. The idea is simple: skip the temperature division for greedy selection, and for logits already in `softmax`/`log_softmax` form. Dividing by a scalar does not change which token is the highest scoring, nor which tokens are the `top_k` highest scoring, and applying temperature to values already in `softmax` or `log_softmax` form is not meaningful. Otherwise, apply temperature only to the (much smaller) subset of values that are actually being converted to `softmax`, which drastically reduces the size of the array being divided.
1 parent e5e85f5 commit 2522f23

File tree

4 files changed

+49
-23
lines changed

4 files changed

+49
-23
lines changed

shortfin/python/shortfin_apps/llm/components/token_selection_strategy/beam_group.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,15 @@ class Beam(ABC):
4343
accumulated_normalization: float = 0.0
4444
last_token: int | None = None
4545

46-
def apply_temperature(self):
46+
def apply_temperature(self, logits: sfnp.device_array):
4747
"""Apply temperature to the logits of a decode invocation.
4848
4949
Args:
5050
temperature (float): Value to use for `temperature`.
5151
"""
5252
if self.decode_config.temperature == 1.0:
53-
return
54-
self.exec_req.result_logits = sfnp.divide(
55-
self.exec_req.result_logits, self.decode_config.temperature
56-
)
53+
return logits
54+
return sfnp.divide(logits, self.decode_config.temperature)
5755

5856
def convert_logits_normalization(
5957
self,
@@ -114,6 +112,10 @@ def _to_softmax(
114112
device,
115113
dtype,
116114
)
115+
116+
if logits_normalization == LogitsNormalization.NONE:
117+
probs_sf = self.apply_temperature(probs_sf)
118+
117119
probs = self.convert_logits_normalization(
118120
logits_normalization,
119121
LogitsNormalization.SOFTMAX,

shortfin/python/shortfin_apps/llm/components/token_selection_strategy/beam_search_token_selection_strategy.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232

3333

3434
class BeamSearchBeam(Beam):
35-
def _convert_results_to_log_probs(self, probs: List):
35+
def _convert_results_to_log_probs(
36+
self,
37+
probs: List,
38+
):
3639
device = self.exec_req.result_logits.device
3740
dtype = self.exec_req.result_logits.dtype
3841
probs_sf = convert_list_to_device_array(
@@ -69,22 +72,42 @@ def sample_logits(self, k: int):
6972
Returns:
7073
Tuple[List[int], List[float]]: Tuple containing (top_tokens, top_values)
7174
"""
72-
self.apply_temperature()
75+
logits = self.exec_req.result_logits
7376
decode_config = self.decode_config
7477
num_beams = decode_config.num_beams
7578
top_k = decode_config.top_k
7679
top_p = decode_config.top_p
7780

7881
if (top_k, top_p) == (None, None):
79-
log_softmax_logits = self.convert_logits_normalization(
82+
tokens, probs = self.sampler.select_top_k(logits, -k)
83+
84+
# TODO: https://github.com/nod-ai/shark-ai/issues/1278 find cleaner way to do these conversions
85+
if logits.dtype in [sfnp.float16]:
86+
probs = [convert_float_to_int(prob, logits.dtype) for prob in probs]
87+
88+
probs_sf = convert_list_to_device_array(
89+
probs,
90+
[len(probs)],
91+
logits.device,
92+
logits.dtype,
93+
)
94+
95+
if self.decode_config.logits_normalization == LogitsNormalization.NONE:
96+
probs_sf = self.apply_temperature(probs_sf)
97+
98+
log_probs = self.convert_logits_normalization(
8099
self.decode_config.logits_normalization,
81100
LogitsNormalization.LOG_SOFTMAX,
82-
self.exec_req.result_logits,
83-
)
101+
probs_sf,
102+
).items.tolist()
84103

85-
return self.sampler.select_top_k(log_softmax_logits, -k)
104+
if logits.dtype in [sfnp.float16]:
105+
log_probs = [
106+
convert_int_to_float(log_prob, logits.dtype)
107+
for log_prob in log_probs
108+
]
86109

87-
logits = self.exec_req.result_logits
110+
return tokens, log_probs
88111

89112
if top_k is not None:
90113
# Sample from `top_k` tokens
@@ -110,7 +133,9 @@ def sample_logits(self, k: int):
110133
if logits.dtype in [sfnp.float16]:
111134
probs = [convert_float_to_int(prob, logits.dtype) for prob in probs]
112135

113-
log_probs = self._convert_results_to_log_probs(probs)
136+
log_probs = self._convert_results_to_log_probs(
137+
probs,
138+
)
114139

115140
return tokens, log_probs
116141

shortfin/python/shortfin_apps/llm/components/token_selection_strategy/greedy_token_selection_strategy.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ def sample_logits(self) -> int:
2626
Returns:
2727
int: The `argmax` of the logits.
2828
"""
29-
self.apply_temperature()
3029
exec_req = self.exec_req
3130
decode_config = self.decode_config
3231
top_k = decode_config.top_k

shortfin/tests/apps/llm/components/token_selection_strategy/beam_group_test.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,25 +75,25 @@ def test_beam_apply_temperature(device, exec_req, decode_config):
7575

7676
with patch.object(sfnp, "divide") as temp_mock:
7777
expected = value / temperature
78-
beam.apply_temperature()
79-
logits = beam.exec_req.result_logits.items.tolist()
80-
assert all(approximately_equal(expected, logit) for logit in logits)
78+
logits = beam.exec_req.result_logits
79+
result = beam.apply_temperature(logits).items.tolist()
80+
assert all(approximately_equal(expected, logit) for logit in result)
8181
temp_mock.assert_not_called()
8282

8383
temperature = 0.5
8484
beam.decode_config.temperature = temperature
8585
expected = value / temperature
86-
beam.apply_temperature()
87-
logits = beam.exec_req.result_logits.items.tolist()
88-
assert all(approximately_equal(expected, logit) for logit in logits)
86+
logits = beam.exec_req.result_logits
87+
result = beam.apply_temperature(logits).items.tolist()
88+
assert all(approximately_equal(expected, logit) for logit in result)
8989

9090
temperature = 1.5
9191
beam.exec_req.result_logits.items = data
9292
beam.decode_config.temperature = temperature
9393
expected = value / temperature
94-
beam.apply_temperature()
95-
logits = beam.exec_req.result_logits.items.tolist()
96-
assert all(approximately_equal(expected, logit) for logit in logits)
94+
logits = beam.exec_req.result_logits
95+
result = beam.apply_temperature(logits).items.tolist()
96+
assert all(approximately_equal(expected, logit) for logit in result)
9797

9898

9999
def test_convert_logits_normalization_none(device, exec_req, decode_config):

0 commit comments

Comments (0)