fixing requirements and adding tests

HendrikStrobelt · HendrikStrobelt · commit decab140c9b9 · 2025-08-28T16:52:11.000+02:00
diff --git a/mellea/stdlib/sampling.py b/mellea/stdlib/sampling.py
@@ -72,6 +72,7 @@ def sample(
         self,
         action: Component,
         context: Context,
+        requirements: list[Requirement],
         *,
         generate_logs: list[GenerateLog] | None = None,
         validation_ctx: Context | None = None,
@@ -83,6 +84,7 @@ def sample(
         Args:
             action : The action object to be sampled.
             context: The context to be passed to the sampling strategy.
+            requirements: The requirements to be used by the sampling strategy (merged with global requirements).
             generate_logs: Optional list of GenerateLog objects. If None, no collection happens.
             validation_ctx: Optional context to use for validation. If None, validation_ctx = ctx.
         """
@@ -152,10 +154,10 @@ def sample(
         self,
         action: Component,
         context: Context,
+        requirements: list[Requirement],
         *,
         show_progress: bool = True,
         generate_logs: list[GenerateLog] | None = None,
-        requirements: list[Requirement] | None = None,
         validation_ctx: Context | None = None,
     ) -> SamplingResult:
         """This method performs a sampling operation based on the given instruction.
@@ -165,7 +167,7 @@ def sample(
             context: The context to be passed to the sampling strategy.
             show_progress: if true, a tqdm progress bar is used. Otherwise, messages will still be sent to flog.
             generate_logs: If provided, the generations will be logged.
-            requirements: List of requirements to test against.
+            requirements: List of requirements to test against (used only if the strategy defines no global requirements; otherwise the global requirements take precedence).
             validation_ctx: Optional context to use for validation. If None, validation_ctx = ctx.
 
         Returns:
@@ -192,12 +194,14 @@ def sample(
         sampled_scores: list[list[tuple[Requirement, ValidationResult]]] = []
         sampled_actions: list[Component] = []
 
+        reqs = []
+        # Global requirements supersede local requirements (global requirements can be defined by the user).
+        # TODO: re-evaluate whether this makes sense.
         if self.requirements is not None:
-            reqs = self.requirements
-            if requirements is not None:
-                flog.warn("Some requirements are ignored.")
-        else:
-            reqs = requirements if requirements is not None else []
+            reqs += self.requirements
+        elif requirements is not None:
+            reqs += requirements
+        reqs = list(set(reqs))
 
         loop_count = 0
         loop_budget_range_iterator = (
diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py
@@ -218,7 +218,9 @@ def instruct(
                 )
 
             # sample
-            res = strategy.sample(i, self.ctx, generate_logs=generate_logs)
+            res = strategy.sample(
+                i, self.ctx, i.requirements, generate_logs=generate_logs
+            )
 
             # make sure that one Log is marked as the one related to res.result
             if res.success:
diff --git a/test/stdlib_basics/test_sampling_ctx.py b/test/stdlib_basics/test_sampling_ctx.py
@@ -0,0 +1,63 @@
+from mellea import LinearContext, start_session
+from mellea.backends import ModelOption
+from mellea.stdlib.sampling import (
+    AgenticSamplingStrategy,
+    RejectionSamplingStrategy,
+    SamplingResult,
+)
+
+
+class TestSamplingCtxCase:
+    m = start_session(
+        model_options={ModelOption.MAX_NEW_TOKENS: 100}, ctx=LinearContext()
+    )
+
+    def _run_asserts_for_ctx_testing(self, res):
+        assert isinstance(res, SamplingResult), "res should be a SamplingResult."
+
+        assert isinstance(res.value, str), "Value should be set and a string."
+
+        assert len(res.sample_generations) >= 1, (
+            "sample generation should have at least one sample."
+        )
+        assert len(res.sample_validations) >= 1, (
+            "sample validation should have at least one sample."
+        )
+        assert len(res.sample_validations[0]) == 3, (
+            "there should be 3 validation results."
+        )
+        assert len(self.m.ctx._ctx) == 2, (
+            "there should only be a message and a response in the ctx."
+        )
+
+    def test_ctx_for_rejection_sampling(self):
+        self.m.ctx.reset()
+        res = self.m.instruct(
+            "Write a sentence.",
+            requirements=[
+                "be funny",
+                "be formal",
+                "use only words starting with the letter w",
+            ],
+            strategy=RejectionSamplingStrategy(loop_budget=3),
+            return_sampling_results=True,
+        )
+        self._run_asserts_for_ctx_testing(res)
+        assert len(self.m.last_prompt()) == 1, "Last prompt should only have only one instruction inside - independent of sampling iterations."
+
+    def test_ctx_for_agentic(self):
+        self.m.ctx.reset()
+        res = self.m.instruct(
+            "Write a sentence.",
+            requirements=[
+                "be funny",
+                "be formal",
+                "use only words starting with the letter w",
+            ],
+            strategy=AgenticSamplingStrategy(loop_budget=3),
+            return_sampling_results=True,
+        )
+
+        self._run_asserts_for_ctx_testing(res)
+
+        assert len(self.m.last_prompt()) == len(res.sample_generations)*2-1, "For n sampling iterations there should be 2n-1 prompt conversation elements in the last prompt."

Original file line number	Diff line number	Diff line change
`@@ -218,7 +218,9 @@ def instruct(`
`218`	`218`	`)`
`219`	`219`
`220`	`220`	`# sample`
`221`		`- res = strategy.sample(i, self.ctx, generate_logs=generate_logs)`
	`221`	`+ res = strategy.sample(`
	`222`	`+ i, self.ctx, i.requirements, generate_logs=generate_logs`
	`223`	`+ )`
`222`	`224`
`223`	`225`	`# make sure that one Log is marked as the one related to res.result`
`224`	`226`	`if res.success:`