
Commit 66a568b

more tests
1 parent da2dc63 commit 66a568b

File tree

9 files changed: +75 -60 lines changed

tests/integration_tests/mock_llm_outputs.py
tests/integration_tests/test_assets/lists_object.py
tests/integration_tests/test_assets/python_rail/validator_parallelism.rail
tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_1.txt
tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_2.txt
tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_3.txt
tests/integration_tests/test_guard.py
tests/integration_tests/test_multi_reask.py
tests/integration_tests/test_pydantic.py

tests/integration_tests/mock_llm_outputs.py

Lines changed: 9 additions & 6 deletions
@@ -13,7 +13,7 @@ class MockLiteLLMCallableOther(LiteLLMCallable):
     # NOTE: this class normally overrides `llm_providers.LiteLLMCallable`,
     # which compiles instructions and prompt into a single prompt;
     # here the instructions are passed into kwargs and ignored
-    def _invoke_llm(self, prompt, *args, **kwargs):
+    def _invoke_llm(self, messages, *args, **kwargs):
         """Mock the OpenAI API call to Completion.create."""
 
         _rail_to_compiled_prompt = {  # noqa
@@ -43,16 +43,16 @@ def _invoke_llm(self, prompt, *args, **kwargs):
         }
 
         try:
-            output = mock_llm_responses[prompt]
+            output = mock_llm_responses[messages[0]["content"]]
             return LLMResponse(
                 output=output,
                 prompt_token_count=123,
                 response_token_count=1234,
             )
         except KeyError:
-            print("Unrecognized prompt!")
-            print(prompt)
-            raise ValueError("Compiled prompt not found")
+            print("Unrecognized messages!")
+            print(messages)
+            raise ValueError("Compiled messages not found")
 
 
 class MockAsyncLiteLLMCallable(AsyncLiteLLMCallable):
@@ -129,7 +129,10 @@ def _invoke_llm(
 
         try:
             if messages:
-                key = (messages[0]["content"], messages[1]["content"])
+                if len(messages) == 2:
+                    key = (messages[0]["content"], messages[1]["content"])
+                elif len(messages) == 1:
+                    key = (messages[0]["content"], None)
                 out_text = mock_llm_responses[key]
             if prompt and instructions and not messages:
                 out_text = mock_llm_responses[(prompt, instructions)]
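For orientation, a minimal standalone sketch (not the repo's fixture) of the lookup scheme the updated mocks use: a lone user message is keyed as (content, None), while a two-message exchange is keyed by both contents. The dictionary entries below are hypothetical placeholders.

from typing import Optional, Tuple

# Hypothetical stand-in for mock_llm_responses; contents are placeholders.
mock_llm_responses = {
    ("What is in stock?", None): '[{"name": "apple", "price": 1.0}]',
    ("What is in stock?", "Respond as JSON."): '[{"name": "banana", "price": 0.5}]',
}

def lookup(messages: list) -> str:
    """Mirror the keying above: message-content tuples instead of a compiled prompt."""
    key: Tuple[str, Optional[str]]
    if len(messages) == 2:
        key = (messages[0]["content"], messages[1]["content"])
    elif len(messages) == 1:
        key = (messages[0]["content"], None)
    else:
        raise ValueError("Compiled messages not found")
    return mock_llm_responses[key]

print(lookup([{"role": "user", "content": "What is in stock?"}]))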

tests/integration_tests/test_assets/lists_object.py

Lines changed: 4 additions & 6 deletions
@@ -2,11 +2,7 @@
 
 from pydantic import BaseModel
 
-LIST_PROMPT = """Create a list of items that may be found in a grocery store.
-
-Json Output:
-
-"""
+LIST_PROMPT = """Create a list of items that may be found in a grocery store."""
 
 
 LIST_OUTPUT = """[{"name": "apple", "price": 1.0}, {"name": "banana", "price": 0.5}, {"name": "orange", "price": 1.5}]"""  # noqa: E501
@@ -28,6 +24,8 @@ class Item(BaseModel):
         <float name="price" />
     </object>
 </output>
-<prompt>Create a list of items that may be found in a grocery store.</prompt>
+<messages>
+<message role="user">Create a list of items that may be found in a grocery store.</message>
+</messages>
 </rail>
 """

tests/integration_tests/test_assets/python_rail/validator_parallelism.rail

Lines changed: 4 additions & 2 deletions
@@ -11,10 +11,12 @@
     on-fail-length="reask"
 />
 
-<prompt>
+<messages>
+<message role="user">
 Say hullo to my little friend
 
 ${gr.complete_string_suffix}
-</prompt>
+</message>
+</messages>
 
 </rail>

tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_1.txt

Lines changed: 0 additions & 4 deletions
@@ -14,7 +14,3 @@ Your generated response should satisfy the following properties:
 
 Don't talk; just go.
 
-
-
-String Output:
-
tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_2.txt

Lines changed: 0 additions & 4 deletions
@@ -22,7 +22,3 @@ Your generated response should satisfy the following properties:
 - length: min=1 max=10
 
 Don't talk; just go.
-
-
-String Output:
-

tests/integration_tests/test_assets/python_rail/validator_parallelism_prompt_3.txt

Lines changed: 0 additions & 4 deletions
@@ -18,7 +18,3 @@ Your generated response should satisfy the following properties:
 - length: min=1 max=10
 
 Don't talk; just go.
-
-
-String Output:
-

tests/integration_tests/test_guard.py

Lines changed: 25 additions & 18 deletions
@@ -26,6 +26,7 @@
 )
 
 from .mock_llm_outputs import (
+    MockLiteLLMCallableOther,
     MockLiteLLMCallable,
     entity_extraction,
     lists_object,
@@ -173,10 +174,10 @@ def test_entity_extraction_with_reask(
     )
 
     content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
-    guard = guard_initializer(rail, prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
 
     final_output: ValidationOutcome = guard(
-        llm_api=openai.completions.create,
+        model="gpt-3.5-turbo",
         prompt_params={"document": content[:6000]},
         num_reasks=1,
         max_tokens=2000,
@@ -259,7 +260,7 @@ def test_entity_extraction_with_noop(mocker, rail, prompt):
     mocker.patch("guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallable)
 
     content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
-    guard = guard_initializer(rail, prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
     final_output = guard(
         llm_api=openai.completions.create,
         prompt_params={"document": content[:6000]},
@@ -305,7 +306,7 @@ def test_entity_extraction_with_filter(mocker, rail, prompt):
     mocker.patch("guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallable)
 
     content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
-    guard = guard_initializer(rail, prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
     final_output = guard(
         llm_api=openai.completions.create,
         prompt_params={"document": content[:6000]},
@@ -340,7 +341,7 @@ def test_entity_extraction_with_fix(mocker, rail, prompt):
     mocker.patch("guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallable)
 
     content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
-    guard = guard_initializer(rail, prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
     final_output = guard(
         llm_api=openai.completions.create,
         prompt_params={"document": content[:6000]},
@@ -376,7 +377,7 @@ def test_entity_extraction_with_refrain(mocker, rail, prompt):
     mocker.patch("guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallable)
 
     content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
-    guard = guard_initializer(rail, prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
     final_output = guard(
         llm_api=openai.completions.create,
         prompt_params={"document": content[:6000]},
@@ -857,11 +858,12 @@ def test_in_memory_validator_log_is_not_duplicated(mocker):
     try:
         content = gd.docs_utils.read_pdf("docs/examples/data/chase_card_agreement.pdf")
         guard = guard_initializer(
-            entity_extraction.PYDANTIC_RAIL_WITH_NOOP, entity_extraction.PYDANTIC_PROMPT
+            entity_extraction.PYDANTIC_RAIL_WITH_NOOP,
+            messages=[{"role": "user", "content": entity_extraction.PYDANTIC_PROMPT}],
         )
 
         guard(
-            llm_api=openai.completions.create,
+            model="gpt-3.5-turbo",
             prompt_params={"document": content[:6000]},
             num_reasks=1,
         )
@@ -942,11 +944,13 @@ def test_guard_with_top_level_list_return_type(mocker, rail, prompt):
     # Create a Guard with a top level list return type
 
     # Mock the LLM
-    mocker.patch("guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallable)
+    mocker.patch(
+        "guardrails.llm_providers.LiteLLMCallable", new=MockLiteLLMCallableOther
+    )
 
-    guard = guard_initializer(rail, prompt=prompt)
+    guard = guard_initializer(rail, messages=[{"role": "user", "content": prompt}])
 
-    output = guard(llm_api=openai.completions.create)
+    output = guard(model="gpt-3.5-turbo")
 
     # Validate the output
     assert output.validated_output == [
@@ -1002,7 +1006,7 @@ def test_string_output(mocker):
 
     guard = gd.Guard.from_rail_string(string.RAIL_SPEC_FOR_STRING)
     final_output = guard(
-        llm_api=openai.completions.create,
+        model="gpt-3.5-turbo",
         prompt_params={"ingredients": "tomato, cheese, sour cream"},
         num_reasks=1,
     )
@@ -1015,7 +1019,7 @@
     assert call.iterations.length == 1
 
     # For original prompt and output
-    assert call.compiled_prompt == string.COMPILED_PROMPT
+    assert call.compiled_messages[1]["content"]._source == string.COMPILED_PROMPT
     assert call.raw_outputs.last == string.LLM_OUTPUT
     assert mock_invoke_llm.call_count == 1
     mock_invoke_llm = None
@@ -1138,7 +1142,7 @@ def test_string_reask(mocker):
 
     guard = gd.Guard.from_rail_string(string.RAIL_SPEC_FOR_STRING_REASK)
     final_output = guard(
-        llm_api=openai.completions.create,
+        model="gpt-3.5-turbo",
         prompt_params={"ingredients": "tomato, cheese, sour cream"},
         num_reasks=1,
         max_tokens=100,
@@ -1152,15 +1156,18 @@
     assert call.iterations.length == 2
 
     # For orginal prompt and output
-    assert call.compiled_instructions == string.COMPILED_INSTRUCTIONS
-    assert call.compiled_prompt == string.COMPILED_PROMPT
+    assert call.compiled_messages[0]["content"]._source == string.COMPILED_INSTRUCTIONS
+    assert call.compiled_messages[1]["content"]._source == string.COMPILED_PROMPT
     assert call.iterations.first.raw_output == string.LLM_OUTPUT
     assert call.iterations.first.validation_response == string.VALIDATED_OUTPUT_REASK
 
     # For re-asked prompt and output
-    assert call.iterations.last.inputs.prompt == gd.Prompt(string.COMPILED_PROMPT_REASK)
+    assert (
+        call.iterations.last.inputs.messages[1]["content"]
+        == string.COMPILED_PROMPT_REASK
+    )
     # Same thing as above
-    assert call.reask_prompts.last == string.COMPILED_PROMPT_REASK
+    assert call.reask_messages[0][1]["content"] == string.COMPILED_PROMPT_REASK
 
     assert call.raw_outputs.last == string.LLM_OUTPUT_REASK
     assert call.guarded_output == string.LLM_OUTPUT_REASK
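Outside the diff, a rough sketch of the calling convention these tests migrate to, assuming a guardrails build from around this commit: guards are invoked with a model name (routed through LiteLLM, as the mocked LiteLLMCallable suggests) instead of llm_api=openai.completions.create, and the compiled prompt is read back from the call log as a messages list. The rail spec below is illustrative rather than one of the test assets, and running it needs provider credentials.

import guardrails as gd

# Illustrative rail spec using the <messages> block this commit migrates to.
RAIL = """<rail version="0.1">
<output type="string" />
<messages>
<message role="user">
Name one fruit.

${gr.complete_string_suffix}
</message>
</messages>
</rail>"""

guard = gd.Guard.from_rail_string(RAIL)

# New style: pass a model name instead of llm_api=openai.completions.create.
outcome = guard(model="gpt-3.5-turbo", num_reasks=1)

# The call log now exposes compiled messages rather than a compiled prompt;
# attribute names follow the assertions in the tests above.
call = guard.history.last
print(call.compiled_messages[-1]["content"]._source)
print(outcome.validated_output)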

tests/integration_tests/test_multi_reask.py

Lines changed: 12 additions & 4 deletions
@@ -1,4 +1,3 @@
-import openai
 import guardrails as gd
 from guardrails.classes.llm.llm_response import LLMResponse
 
@@ -44,21 +43,30 @@ def test_multi_reask(mocker):
 
     assert len(call.iterations) == 3
 
-    assert call.compiled_prompt == python_rail.VALIDATOR_PARALLELISM_PROMPT_1
+    assert (
+        call.compiled_messages[0]["content"]._source
+        == python_rail.VALIDATOR_PARALLELISM_PROMPT_1
+    )
     assert call.raw_outputs.first == python_rail.VALIDATOR_PARALLELISM_RESPONSE_1
     assert (
         call.iterations.first.validation_response
         == python_rail.VALIDATOR_PARALLELISM_REASK_1
     )
 
-    assert call.reask_prompts.first == python_rail.VALIDATOR_PARALLELISM_PROMPT_2
+    assert (
+        call.reask_messages[0][1]["content"]
+        == python_rail.VALIDATOR_PARALLELISM_PROMPT_2
+    )
     assert call.raw_outputs.at(1) == python_rail.VALIDATOR_PARALLELISM_RESPONSE_2
     assert (
         call.iterations.at(1).validation_response
         == python_rail.VALIDATOR_PARALLELISM_REASK_2
     )
 
-    assert call.reask_prompts.last == python_rail.VALIDATOR_PARALLELISM_PROMPT_3
+    assert (
+        call.reask_messages[1][1]["content"]
+        == python_rail.VALIDATOR_PARALLELISM_PROMPT_3
+    )
     assert call.raw_outputs.last == python_rail.VALIDATOR_PARALLELISM_RESPONSE_3
     # The output here fails some validators but passes others.
     # Since those that it fails in the end are noop fixes, validation fails.

tests/integration_tests/test_pydantic.py

Lines changed: 21 additions & 12 deletions
@@ -1,6 +1,5 @@
 import json
 from typing import Dict, List
-import openai
 import pytest
 from pydantic import BaseModel
 
@@ -36,10 +35,10 @@ def test_pydantic_with_reask(mocker):
         ),
     ]
 
-    guard = gd.Guard.from_pydantic(ListOfPeople,
-                                   messages=[{
-                                       "role": "user",
-                                       "content": VALIDATED_RESPONSE_REASK_PROMPT}])
+    guard = gd.Guard.from_pydantic(
+        ListOfPeople,
+        messages=[{"role": "user", "content": VALIDATED_RESPONSE_REASK_PROMPT}],
+    )
     final_output = guard(
         model="text-davinci-003",
         max_tokens=512,
@@ -124,10 +123,15 @@ def test_pydantic_with_full_schema_reask(mocker):
         ),
     ]
 
-    guard = gd.Guard.from_pydantic(ListOfPeople, messages=[{
-        "content": VALIDATED_RESPONSE_REASK_PROMPT,
-        "role": "user",
-    }])
+    guard = gd.Guard.from_pydantic(
+        ListOfPeople,
+        messages=[
+            {
+                "content": VALIDATED_RESPONSE_REASK_PROMPT,
+                "role": "user",
+            }
+        ],
+    )
     final_output = guard(
         model="gpt-3.5-turbo",
         max_tokens=512,
@@ -153,9 +157,14 @@
     )
 
     # For re-asked prompt and output
-    assert call.iterations.at(1).inputs.messages[0]["content"]._source == pydantic.COMPILED_INSTRUCTIONS_CHAT
-    assert call.iterations.at(1).inputs.messages[1]["content"]._source == pydantic.COMPILED_PROMPT_FULL_REASK
-
+    assert (
+        call.iterations.at(1).inputs.messages[0]["content"]._source
+        == pydantic.COMPILED_PROMPT_FULL_REASK_1
+    )
+    assert (
+        call.iterations.at(1).inputs.messages[1]["content"]._source
+        == pydantic.COMPILED_INSTRUCTIONS_CHAT
+    )
     assert call.iterations.at(1).raw_output == pydantic.LLM_OUTPUT_FULL_REASK_1
     assert (
         call.iterations.at(1).validation_response == pydantic.VALIDATED_OUTPUT_REASK_2
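As a companion to the diff above, a hedged sketch of building a Guard from a Pydantic model with the new messages keyword; the Item model mirrors the one in lists_object.py, while the prompt text and invocation arguments are placeholders rather than the test's assets.

from pydantic import BaseModel

import guardrails as gd

class Item(BaseModel):
    name: str
    price: float

# messages= replaces the older prompt= keyword when constructing the Guard.
guard = gd.Guard.from_pydantic(
    Item,
    messages=[{"role": "user", "content": "Suggest one grocery item as JSON."}],
)

# Invocation likewise takes a model name; uncomment to make a real LLM call.
# outcome = guard(model="gpt-3.5-turbo", max_tokens=512)
# print(outcome.validated_output)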
