Commit 9e5967d

Treat None rollout_id as absent (#8745)
1 parent 61619c1 commit 9e5967d

File tree

14 files changed (+126, -40 lines)


docs/docs/cheatsheet.md

Lines changed: 2 additions & 2 deletions
@@ -463,7 +463,7 @@ dspy.configure_cache(

 ### BestofN

-Runs a module up to `N` times with different temperatures and returns the best prediction, as defined by the `reward_fn`, or the first prediction that passes the `threshold`.
+Runs a module up to `N` times with different rollout IDs (bypassing cache) and returns the best prediction, as defined by the `reward_fn`, or the first prediction that passes the `threshold`.

 ```python
 import dspy
@@ -478,7 +478,7 @@ best_of_3(question="What is the capital of Belgium?").answer

 ### Refine

-Refines a module by running it up to `N` times with different temperatures and returns the best prediction, as defined by the `reward_fn`, or the first prediction that passes the `threshold`. After each attempt (except the final one), `Refine` automatically generates detailed feedback about the module's performance and uses this feedback as hints for subsequent runs, creating an iterative refinement process.
+Refines a module by running it up to `N` times with different rollout IDs (bypassing cache) and returns the best prediction, as defined by the `reward_fn`, or the first prediction that passes the `threshold`. After each attempt (except the final one), `Refine` automatically generates detailed feedback about the module's performance and uses this feedback as hints for subsequent runs, creating an iterative refinement process.

 ```python
 import dspy
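
For quick reference, a runnable sketch of the `BestOfN` usage the cheatsheet documents; the model string and the `one_word_answer` reward are illustrative assumptions, not part of the diff:

```python
import dspy

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # model string is an assumption

qa = dspy.ChainOfThought("question -> answer")

def one_word_answer(args, pred):
    # Illustrative reward: 1.0 for a single-word answer, else 0.0.
    return 1.0 if len(pred.answer.split()) == 1 else 0.0

best_of_3 = dspy.BestOfN(module=qa, N=3, reward_fn=one_word_answer, threshold=1.0)
best_of_3(question="What is the capital of Belgium?").answer  # e.g. "Brussels"
```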

docs/docs/learn/programming/language_models.md

Lines changed: 8 additions & 0 deletions
@@ -166,6 +166,14 @@ gpt_4o_mini = dspy.LM('openai/gpt-4o-mini', temperature=0.9, max_tokens=3000, st

 By default LMs in DSPy are cached. If you repeat the same call, you will get the same outputs. But you can turn off caching by setting `cache=False`.

+If you want to keep caching enabled but force a new request (for example, to obtain diverse outputs),
+pass a unique `rollout_id` in your call. Different values ensure a different cache entry while
+still caching future calls with the same inputs and `rollout_id`.
+
+```python linenums="1"
+lm("Say this is a test!", rollout_id=1)
+```
+

 ## Inspecting output and usage metadata.
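To make the documented behavior concrete, a small sketch (the model string is an assumption): repeating a call with the same `rollout_id` hits the cache, while a new value forces a fresh request:

```python
import dspy

lm = dspy.LM("openai/gpt-4o-mini")  # any provider/model; this one is an assumption

a = lm("Say this is a test!", rollout_id=1)  # fresh request, cached under rollout 1
b = lm("Say this is a test!", rollout_id=1)  # cache hit: identical to `a`
c = lm("Say this is a test!", rollout_id=2)  # new rollout: bypasses the old entry
```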

docs/docs/tutorials/output_refinement/best-of-n-and-refine.md

Lines changed: 4 additions & 4 deletions
@@ -1,14 +1,14 @@
 # Output Refinement: BestOfN and Refine

-Both `BestOfN` and `Refine` are DSPy modules designed to improve the reliability and quality of predictions by making multiple `LM` calls with different parameter settings. Both modules stop when they have reached `N` attempts or when the `reward_fn` returns an award above the `threshold`.
+Both `BestOfN` and `Refine` are DSPy modules designed to improve the reliability and quality of predictions by making multiple `LM` calls with different rollout IDs to bypass caching. Both modules stop when they have reached `N` attempts or when the `reward_fn` returns a reward above the `threshold`.

 ## BestOfN

-`BestOfN` is a module that runs the provided module multiple times (up to `N`) with different temperature settings. It returns either the first prediction that passes a specified threshold or the one with the highest reward if none meets the threshold.
+`BestOfN` is a module that runs the provided module multiple times (up to `N`) with different rollout IDs. It returns either the first prediction that passes a specified threshold or the one with the highest reward if none meets the threshold.

 ### Basic Usage

-Lets say we wanted to have the best chance of getting a one word answer from the model. We could use `BestOfN` to try multiple temperature settings and return the best result.
+Let's say we wanted to have the best chance of getting a one-word answer from the model. We could use `BestOfN` to try multiple rollout IDs and return the best result.

 ```python
 import dspy
@@ -86,7 +86,7 @@ refine = dspy.Refine(

 Both modules serve similar purposes but differ in their approach:

-- `BestOfN` simply tries different temperature settings and selects the best resulting prediction as defined by the `reward_fn`.
+- `BestOfN` simply tries different rollout IDs and selects the best resulting prediction as defined by the `reward_fn`.
 - `Refine` adds a feedback loop, using the LM to generate detailed feedback about the module's own performance based on the previous prediction and the code in the `reward_fn`. This feedback is then used as hints for subsequent runs.

 ## Practical Examples
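
For context, a hedged sketch of the `Refine` API this tutorial describes, complementing the `BestOfN` snippet above (the model string and reward function are illustrative assumptions):

```python
import dspy

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # model string is an assumption

def one_word_answer(args, pred):
    # Illustrative reward: favor single-word answers.
    return 1.0 if len(pred.answer.split()) == 1 else 0.0

refine = dspy.Refine(
    module=dspy.ChainOfThought("question -> answer"),
    N=3,
    reward_fn=one_word_answer,
    threshold=1.0,
)
refine(question="What is the capital of Belgium?")
```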

dspy/clients/base_lm.py

Lines changed: 10 additions & 2 deletions
@@ -110,7 +110,12 @@ async def aforward(self, prompt=None, messages=None, **kwargs):
         raise NotImplementedError("Subclasses must implement this method.")

     def copy(self, **kwargs):
-        """Returns a copy of the language model with possibly updated parameters."""
+        """Returns a copy of the language model with possibly updated parameters.
+
+        Any provided keyword arguments update the corresponding attributes or LM kwargs of
+        the copy. For example, ``lm.copy(rollout_id=1)`` returns an LM whose requests use a
+        different rollout ID to bypass cache collisions.
+        """

         import copy

@@ -121,7 +126,10 @@ def copy(self, **kwargs):
             if hasattr(self, key):
                 setattr(new_instance, key, value)
             if (key in self.kwargs) or (not hasattr(self, key)):
-                new_instance.kwargs[key] = value
+                if value is None:
+                    new_instance.kwargs.pop(key, None)
+                else:
+                    new_instance.kwargs[key] = value

         return new_instance
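The behavioral change in plain terms, as a sketch (model string assumed): `copy(rollout_id=None)` now removes the key from the copy's kwargs instead of storing a literal `None`:

```python
import dspy

lm = dspy.LM("openai/gpt-4o-mini", rollout_id=5)  # model string is an assumption

forked = lm.copy(rollout_id=7)      # new value: distinct cache entries
cleared = lm.copy(rollout_id=None)  # None: key dropped, i.e. treated as absent

assert forked.kwargs["rollout_id"] == 7
assert "rollout_id" not in cleared.kwargs
```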

dspy/clients/lm.py

Lines changed: 27 additions & 0 deletions
@@ -59,6 +59,10 @@ def __init__(
             provider: The provider to use. If not specified, the provider will be inferred from the model.
             finetuning_model: The model to finetune. In some providers, the models available for finetuning are different
                 from the models available for inference.
+            rollout_id: Optional integer used to differentiate cache entries for otherwise
+                identical requests. Different values bypass DSPy's caches while still caching
+                future calls with the same inputs and rollout ID. This argument is stripped
+                before sending requests to the provider.
         """
         # Remember to update LM.copy() if you modify the constructor!
         self.model = model
@@ -85,8 +89,12 @@ def __init__(
                 "`dspy.LM(...)`, e.g., dspy.LM('openai/gpt-5', temperature=1.0, max_tokens=20000)"
             )
             self.kwargs = dict(temperature=temperature, max_completion_tokens=max_tokens, **kwargs)
+            if self.kwargs.get("rollout_id") is None:
+                self.kwargs.pop("rollout_id", None)
         else:
             self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)
+            if self.kwargs.get("rollout_id") is None:
+                self.kwargs.pop("rollout_id", None)

     def _get_cached_completion_fn(self, completion_fn, cache):
         ignored_args_for_cache_key = ["api_key", "api_base", "base_url"]
@@ -102,10 +110,13 @@ def _get_cached_completion_fn(self, completion_fn, cache):

     def forward(self, prompt=None, messages=None, **kwargs):
         # Build the request.
+        kwargs = dict(kwargs)
         cache = kwargs.pop("cache", self.cache)

         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
+        if kwargs.get("rollout_id") is None:
+            kwargs.pop("rollout_id", None)

         if self.model_type == "chat":
             completion = litellm_completion
@@ -129,10 +140,13 @@ def forward(self, prompt=None, messages=None, **kwargs):

     async def aforward(self, prompt=None, messages=None, **kwargs):
         # Build the request.
+        kwargs = dict(kwargs)
         cache = kwargs.pop("cache", self.cache)

         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
+        if kwargs.get("rollout_id") is None:
+            kwargs.pop("rollout_id", None)

         if self.model_type == "chat":
             completion = alitellm_completion
@@ -296,6 +310,8 @@ async def async_stream_completion():

 def litellm_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     stream_completion = _get_stream_completion_fn(request, cache, sync=True)
     if stream_completion is None:
         return litellm.completion(
@@ -310,6 +326,8 @@ def litellm_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):

 def litellm_text_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     # Extract the provider and model from the model string.
     # TODO: Not all the models are in the format of "provider/model"
     model = request.pop("model").split("/", 1)
@@ -336,6 +354,8 @@ def litellm_text_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):

 async def alitellm_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     stream_completion = _get_stream_completion_fn(request, cache, sync=False)
     if stream_completion is None:
         return await litellm.acompletion(
@@ -350,6 +370,8 @@ async def alitellm_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):

 async def alitellm_text_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     model = request.pop("model").split("/", 1)
     provider, model = model[0] if len(model) > 1 else "openai", model[-1]

@@ -373,6 +395,8 @@ async def alitellm_text_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):

 def litellm_responses_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     request = _convert_chat_request_to_responses_request(request)

     return litellm.responses(
@@ -385,6 +409,8 @@ def litellm_responses_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):

 async def alitellm_responses_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     cache = cache or {"no-cache": True, "no-store": True}
+    request = dict(request)
+    request.pop("rollout_id", None)
     request = _convert_chat_request_to_responses_request(request)

     return await litellm.aresponses(
@@ -395,6 +421,7 @@ async def alitellm_responses_completion(request: dict[str, Any], num_retries: int, cache: dict[str, Any] | None = None):
     )

 def _convert_chat_request_to_responses_request(request: dict[str, Any]):
+    request = dict(request)
     if "messages" in request:
         content_blocks = []
         for msg in request.pop("messages"):
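
The same two-line pattern recurs in every completion wrapper above. A standalone sketch of the idea (a hypothetical helper, not DSPy's actual code): `rollout_id` participates in the cache key upstream, so each wrapper copies the request and pops the key, avoiding both mutating the caller's dict and sending an unknown parameter to litellm:

```python
from typing import Any

def strip_rollout_id(request: dict[str, Any]) -> dict[str, Any]:
    """Hypothetical helper mirroring the commit's pattern."""
    request = dict(request)          # shallow copy: don't mutate the caller's dict
    request.pop("rollout_id", None)  # absent and None are now equivalent
    return request

req = {"model": "openai/gpt-4o-mini", "messages": [], "rollout_id": 3}
assert "rollout_id" not in strip_rollout_id(req)
assert req["rollout_id"] == 3  # original request untouched
```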

dspy/predict/best_of_n.py

Lines changed: 8 additions & 6 deletions
@@ -14,7 +14,7 @@ def __init__(
         fail_count: int | None = None,
     ):
         """
-        Runs a module up to `N` times with different temperatures and returns the best prediction
+        Runs a module up to `N` times with different rollout IDs and returns the best prediction
         out of `N` attempts or the first prediction that passes the `threshold`.

         Args:
@@ -53,12 +53,14 @@ def one_word_answer(args, pred):

     def forward(self, **kwargs):
         lm = self.module.get_lm() or dspy.settings.lm
-        temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / self.N) for i in range(self.N)]
-        temps = list(dict.fromkeys(temps))[: self.N]
+        base_rollout = lm.kwargs.get("rollout_id")
+        start = 0 if base_rollout is None else base_rollout
+        rollout_ids = [start + i for i in range(self.N)]
+        rollout_ids = list(dict.fromkeys(rollout_ids))[: self.N]
         best_pred, best_trace, best_reward = None, None, -float("inf")

-        for idx, t in enumerate(temps):
-            lm_ = lm.copy(temperature=t)
+        for idx, rid in enumerate(rollout_ids):
+            lm_ = lm.copy(rollout_id=rid)
             mod = self.module.deepcopy()
             mod.set_lm(lm_)

@@ -77,7 +79,7 @@ def forward(self, **kwargs):
                     break

             except Exception as e:
-                print(f"BestOfN: Attempt {idx + 1} failed with temperature {t}: {e}")
+                print(f"BestOfN: Attempt {idx + 1} failed with rollout id {rid}: {e}")
                 if idx > self.fail_count:
                     raise e
                 self.fail_count -= 1
dspy/predict/refine.py

Lines changed: 9 additions & 7 deletions
@@ -48,9 +48,9 @@ def __init__(
         fail_count: int | None = None,
     ):
         """
-        Refines a module by running it up to N times with different temperatures and returns the best prediction.
+        Refines a module by running it up to N times with different rollout IDs and returns the best prediction.

-        This module runs the provided module multiple times with varying temperature settings and selects
+        This module runs the provided module multiple times with varying rollout identifiers and selects
         either the first prediction that exceeds the specified threshold or the one with the highest reward.
         If no prediction meets the threshold, it automatically generates feedback to improve future predictions.

@@ -96,14 +96,16 @@ def one_word_answer(args, pred):

     def forward(self, **kwargs):
         lm = self.module.get_lm() or dspy.settings.lm
-        temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / self.N) for i in range(self.N)]
-        temps = list(dict.fromkeys(temps))[: self.N]
+        base_rollout = lm.kwargs.get("rollout_id")
+        start = 0 if base_rollout is None else base_rollout
+        rollout_ids = [start + i for i in range(self.N)]
+        rollout_ids = list(dict.fromkeys(rollout_ids))[: self.N]
         best_pred, best_trace, best_reward = None, None, -float("inf")
         advice = None
         adapter = dspy.settings.adapter or dspy.ChatAdapter()

-        for idx, t in enumerate(temps):
-            lm_ = lm.copy(temperature=t)
+        for idx, rid in enumerate(rollout_ids):
+            lm_ = lm.copy(rollout_id=rid)
             mod = self.module.deepcopy()
             mod.set_lm(lm_)

@@ -167,7 +169,7 @@ def __call__(self, lm, lm_kwargs, signature, demos, inputs):
                 # print(f"Advice for each module: {advice}")

             except Exception as e:
-                print(f"Refine: Attempt failed with temperature {t}: {e}")
+                print(f"Refine: Attempt failed with rollout id {rid}: {e}")
                 if idx > self.fail_count:
                     raise e
                 self.fail_count -= 1

dspy/propose/grounded_proposer.py

Lines changed: 3 additions & 11 deletions
@@ -330,7 +330,6 @@ def propose_instructions_for_program(
         demo_candidates,
         trial_logs,
         N,  # noqa: N803
-        T,  # noqa: N803
     ) -> list[str]:
         """This method is responsible for returning the full set of new instructions for our program, given the specified criteria."""

@@ -375,7 +374,6 @@ def propose_instructions_for_program(
                 program=program,
                 predictor=predictor,
                 pred_i=pred_i,
-                T=T,
                 demo_candidates=demo_candidates,
                 demo_set_i=demo_set_i,
                 trial_logs=trial_logs,
@@ -390,7 +388,6 @@ def propose_instruction_for_predictor(
         program,
         predictor,
         pred_i,
-        T,  # noqa: N803
         demo_candidates,
         demo_set_i,
         trial_logs,
@@ -414,14 +411,10 @@ def propose_instruction_for_predictor(
             verbose=self.verbose
         )

-        # Generate a new instruction for our predictor, using the temperature specified for this round
-        original_temp = self.prompt_model.kwargs["temperature"]
+        # Generate a new instruction for our predictor using a unique rollout id to bypass cache
+        rollout_lm = self.prompt_model.copy(rollout_id=self.rng.randint(0, 10**9))

-        epsilon = self.rng.uniform(0.01, 0.05)
-        modified_temp = T + epsilon
-
-        with dspy.settings.context(lm=self.prompt_model):
-            self.prompt_model.kwargs["temperature"] = modified_temp
+        with dspy.settings.context(lm=rollout_lm):
             proposed_instruction = instruction_generator(
                 demo_candidates=demo_candidates,
                 pred_i=pred_i,
@@ -432,7 +425,6 @@ def propose_instruction_for_predictor(
                 num_demos_in_context = self.num_demos_in_context,
                 tip=tip,
             ).proposed_instruction
-        self.prompt_model.kwargs["temperature"] = original_temp

         # Log the trace used to generate the new instruction, along with the new instruction itself
         if self.verbose:
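
The proposer's new pattern in isolation, as a sketch (the model string and RNG seed are assumptions): rather than nudging temperature by a random epsilon and restoring it afterward, it forks the prompt model with a random `rollout_id`, leaving the original LM's state untouched:

```python
import random

import dspy

rng = random.Random(0)
prompt_model = dspy.LM("openai/gpt-4o-mini")  # model string is an assumption

# Fork with a random rollout id: a fresh cache entry, no shared mutable state.
rollout_lm = prompt_model.copy(rollout_id=rng.randint(0, 10**9))

with dspy.settings.context(lm=rollout_lm):
    pass  # run the instruction generator here, as in the diff above
```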

dspy/teleprompt/bootstrap.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def _bootstrap_one_example(self, example, round_idx=0):
         try:
             with dspy.settings.context(trace=[], **self.teacher_settings):
                 lm = dspy.settings.lm
-                lm = lm.copy(temperature=0.7 + 0.001 * round_idx) if round_idx > 0 else lm
+                lm = lm.copy(rollout_id=round_idx) if round_idx > 0 else lm
                 new_settings = {"lm": lm} if round_idx > 0 else {}

                 with dspy.settings.context(**new_settings):

dspy/teleprompt/infer_rules.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ class CustomRulesInduction(dspy.Signature):

     def forward(self, examples_text):
         with dspy.settings.context(**self.teacher_settings):
-            lm = dspy.settings.lm.copy(temperature=self.rng.uniform(0.9, 1.0))
+            lm = dspy.settings.lm.copy(rollout_id=self.rng.randint(0, 10**9))
             with dspy.settings.context(lm=lm):
                 rules = self.rules_induction(examples_text=examples_text).natural_language_rules
