Commit d0a59e2

Server improvements, common hallucination fix
1 parent da057d4 commit d0a59e2

File tree

7 files changed: +293 -18 lines


benchmarks/run.py

Whitespace-only changes.

interpreter/core/async_core.py

Lines changed: 33 additions & 13 deletions
@@ -25,6 +25,7 @@ def __init__(self, *args, **kwargs):
         self.stop_event = threading.Event()
         self.output_queue = None
         self.id = os.getenv("INTERPRETER_ID", datetime.now().timestamp())
+        self.print = True  # Will print output

         self.server = Server(self)

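The new self.print flag only gates the fenced console echo inside respond() below; chunks are pushed onto output_queue either way. A host embedding the server could silence the echo like this (the AsyncInterpreter class name is an assumption from context, not shown in this hunk):

    interpreter = AsyncInterpreter()  # assumed constructor; the diff only shows __init__'s body
    interpreter.print = False         # stop echoing to stdout; output_queue still receives every chunk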
@@ -45,41 +46,60 @@ async def input(self, chunk):
         elif "end" in chunk:
             # If the user is done talking, the interpreter should respond.

-            # But first, process any client messages.
-            if self.messages[-1]["role"] == "client":
+            run_code = None  # Will later default to auto_run unless the user makes a command here
+
+            # But first, process any commands.
+            if self.messages[-1]["type"] == "command":
                 command = self.messages[-1]["content"]
                 self.messages = self.messages[:-1]

                 if command == "stop":
+                    # Any start flag would have stopped it a moment ago, but to be sure:
+                    self.stop_event.set()
+                    self.respond_thread.join()
                     return
                 if command == "go":
                     # This is to approve code.
-                    # We do nothing, as self.respond will run the last code block if the last message is one.
+                    run_code = True
                     pass

             self.stop_event.clear()
-            self.respond_thread = threading.Thread(target=self.respond)
+            self.respond_thread = threading.Thread(
+                target=self.respond, args=(run_code,)
+            )
             self.respond_thread.start()

     async def output(self):
         if self.output_queue == None:
             self.output_queue = janus.Queue()
         return await self.output_queue.async_q.get()

-    def respond(self):
+    def respond(self, run_code=None):
+        if run_code == None:
+            run_code = self.auto_run
+
         for chunk in self._respond_and_store():
-            if chunk["type"] in ["code", "output"]:
-                if "start" in chunk:
-                    print("\n\n```" + chunk["format"], flush=True)
-                if "end" in chunk:
-                    print("\n```", flush=True)
-            print(chunk.get("content", ""), end="", flush=True)
+            if chunk["type"] == "confirmation":
+                if run_code:
+                    continue  # We don't need to send out confirmation chunks on the server. I don't even like them.
+                else:
+                    break

             if self.stop_event.is_set():
                 return
+
+            if self.print:
+                if chunk["type"] in ["code", "output"]:
+                    if "start" in chunk:
+                        print("\n\n```" + chunk["format"], flush=True)
+                    if "end" in chunk:
+                        print("\n```", flush=True)
+                print(chunk.get("content", ""), end="", flush=True)

             self.output_queue.sync_q.put(chunk)

         self.output_queue.sync_q.put(
-            {"role": "server", "type": "status", "content": "complete"}
+            {"role": "assistant", "type": "status", "content": "complete"}
         )

     def accumulate(self, chunk):
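Taken together, input() and respond() now make code approval explicit instead of implicit. A rough sketch of the message shapes the new branches react to (the exact chunk framing around "start"/"end" is inferred from this diff and may differ):

    # The trailing message when an "end" chunk arrives decides what happens next:
    stop_command = {"role": "client", "type": "command", "content": "stop"}  # set stop_event, join respond_thread
    go_command = {"role": "client", "type": "command", "content": "go"}      # respond(run_code=True): approve the pending code block
    # Any other trailing message leaves run_code=None, so respond() falls back to self.auto_run.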
@@ -202,7 +222,7 @@ async def set_settings(payload: Dict[str, Any]):

     return {"status": "success"}

-@router.get("/interpreter/{setting}")
+@router.get("/settings/{setting}")
 async def get_setting(setting: str):
     if hasattr(async_interpreter, setting):
         setting_value = getattr(async_interpreter, setting)
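Reading a single setting now lives under the same /settings prefix as the POST handler above it, instead of /interpreter/{setting}. A minimal client sketch (host, port, and the exact response shape are assumptions, not part of this diff):

    import requests

    BASE = "http://localhost:8000"  # wherever the FastAPI app is served

    # Before this commit the route was GET /interpreter/auto_run.
    resp = requests.get(f"{BASE}/settings/auto_run")
    print(resp.json())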

interpreter/core/llm/llm.py

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,8 @@
     display_markdown_message,
 )
 from .run_function_calling_llm import run_function_calling_llm
+
+# from .run_tool_calling_llm import run_tool_calling_llm
 from .run_text_llm import run_text_llm
 from .utils.convert_to_openai_messages import convert_to_openai_messages

@@ -283,6 +285,7 @@ def run(self, messages):

         if self.supports_functions:
             yield from run_function_calling_llm(self, params)
+            # yield from run_tool_calling_llm(self, params)
         else:
             yield from run_text_llm(self, params)

interpreter/core/llm/run_function_calling_llm.py

Lines changed: 3 additions & 3 deletions
@@ -31,9 +31,9 @@ def run_function_calling_llm(llm, request_params):
     request_params["functions"] = [function_schema]

     # Add OpenAI's recommended function message
-    request_params["messages"][0][
-        "content"
-    ] += "\nUse ONLY the function you have been provided with — 'execute(language, code)'."
+    # request_params["messages"][0][
+    #     "content"
+    # ] += "\nUse ONLY the function you have been provided with — 'execute(language, code)'."

     ## Convert output to LMC format

interpreter/core/llm/run_tool_calling_llm.py

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+from .utils.merge_deltas import merge_deltas
+from .utils.parse_partial_json import parse_partial_json
+
+tool_schema = {
+    "type": "function",
+    "function": {
+        "name": "execute",
+        "description": "Executes code on the user's machine **in the users local environment** and returns the output",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "language": {
+                    "type": "string",
+                    "description": "The programming language (required parameter to the `execute` function)",
+                    "enum": [
+                        # This will be filled dynamically with the languages OI has access to.
+                    ],
+                },
+                "code": {
+                    "type": "string",
+                    "description": "The code to execute (required)",
+                },
+            },
+            "required": ["language", "code"],
+        },
+    },
+}
+
+
+def run_tool_calling_llm(llm, request_params):
+    ## Setup
+
+    # Add languages OI has access to
+    tool_schema["function"]["parameters"]["properties"]["language"]["enum"] = [
+        i.name.lower() for i in llm.interpreter.computer.terminal.languages
+    ]
+    request_params["tools"] = [tool_schema]
+
+    # Add OpenAI's recommended function message
+    # request_params["messages"][0][
+    #     "content"
+    # ] += "\nUse ONLY the function you have been provided with — 'execute(language, code)'."
+
+    ## Convert output to LMC format
+
+    accumulated_deltas = {}
+    language = None
+    code = ""
+
+    for chunk in llm.completions(**request_params):
+        if "choices" not in chunk or len(chunk["choices"]) == 0:
+            # This happens sometimes
+            continue
+
+        delta = chunk["choices"][0]["delta"]
+
+        # Convert tool call into function call, which we have great parsing logic for below
+        if "tool_calls" in delta:
+            if (
+                len(delta["tool_calls"]) > 0
+                and "function_call" in delta["tool_calls"][0]
+            ):
+                delta["function_call"] = delta["tool_calls"][0]["function_call"]
+
+        # Accumulate deltas
+        accumulated_deltas = merge_deltas(accumulated_deltas, delta)
+
+        if "content" in delta and delta["content"]:
+            yield {"type": "message", "content": delta["content"]}
+
+        if (
+            accumulated_deltas.get("function_call")
+            and "arguments" in accumulated_deltas["function_call"]
+            and accumulated_deltas["function_call"]["arguments"]
+        ):
+            if (
+                "name" in accumulated_deltas["function_call"]
+                and accumulated_deltas["function_call"]["name"] == "execute"
+            ):
+                arguments = accumulated_deltas["function_call"]["arguments"]
+                arguments = parse_partial_json(arguments)
+
+                if arguments:
+                    if (
+                        language is None
+                        and "language" in arguments
+                        and "code"
+                        in arguments  # <- This ensures we're *finished* typing language, as opposed to partially done
+                        and arguments["language"]
+                    ):
+                        language = arguments["language"]
+
+                    if language is not None and "code" in arguments:
+                        # Calculate the delta (new characters only)
+                        code_delta = arguments["code"][len(code) :]
+                        # Update the code
+                        code = arguments["code"]
+                        # Yield the delta
+                        if code_delta:
+                            yield {
+                                "type": "code",
+                                "format": language,
+                                "content": code_delta,
+                            }
+                else:
+                    if llm.interpreter.verbose:
+                        print("Arguments not a dict.")
+
+            # Common hallucinations
+            elif "name" in accumulated_deltas["function_call"] and (
+                accumulated_deltas["function_call"]["name"] == "python"
+                or accumulated_deltas["function_call"]["name"] == "functions"
+            ):
+                if llm.interpreter.verbose:
+                    print("Got direct python call")
+                if language is None:
+                    language = "python"
+
+                if language is not None:
+                    # Pull the code string straight out of the "arguments" string
+                    code_delta = accumulated_deltas["function_call"]["arguments"][
+                        len(code) :
+                    ]
+                    # Update the code
+                    code = accumulated_deltas["function_call"]["arguments"]
+                    # Yield the delta
+                    if code_delta:
+                        yield {
+                            "type": "code",
+                            "format": language,
+                            "content": code_delta,
+                        }
+
+            else:
+                # If name exists and it's not "execute" or "python" or "functions", who knows what's going on.
+                if "name" in accumulated_deltas["function_call"]:
+                    yield {
+                        "type": "code",
+                        "format": "python",
+                        "content": accumulated_deltas["function_call"]["name"],
+                    }
+    return
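The heart of the converter is the delta slicing: each parsed partial contains the full code streamed so far, and only the new suffix is yielded as an LMC code chunk. A standalone replay of that logic, with parse_partial_json and the model stream stubbed out as canned snapshots:

    parsed_snapshots = [
        {"language": "python", "code": "print("},
        {"language": "python", "code": "print(1 + 1)"},
    ]

    code = ""
    for arguments in parsed_snapshots:
        code_delta = arguments["code"][len(code):]  # new characters only
        code = arguments["code"]
        if code_delta:
            print({"type": "code", "format": arguments["language"], "content": code_delta})
    # -> {'type': 'code', 'format': 'python', 'content': 'print('}
    # -> {'type': 'code', 'format': 'python', 'content': '1 + 1)'}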

interpreter/core/respond.py

Lines changed: 12 additions & 0 deletions
@@ -146,6 +146,18 @@ def respond(interpreter):
                 code = code[2:].strip()
                 if interpreter.verbose:
                     print("Removing `\n")
+                interpreter.messages[-1]["content"] = code  # So the LLM can see it.
+
+            # A common hallucination
+            if code.startswith("functions.execute("):
+                code = code.replace("functions.execute(", "").rstrip(")")
+                code_dict = json.loads(code)
+                language = code_dict.get("language", language)
+                code = code_dict.get("code", code)
+                interpreter.messages[-1]["content"] = code  # So the LLM can see it.
+                interpreter.messages[-1][
+                    "format"
+                ] = language  # So the LLM can see it.

             if language == "text" or language == "markdown":
                 # It does this sometimes just to take notes. Let it, it's useful.
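This branch unwraps a specific hallucination: the model emitting a literal functions.execute(...) call as the body of a code block instead of issuing a real function call. A standalone replay with a made-up payload (it mirrors the diff and, like it, assumes the wrapped argument is valid JSON):

    import json

    code = 'functions.execute({"language": "python", "code": "print(1 + 1)"})'
    language = "python"  # whatever the block was originally labeled

    if code.startswith("functions.execute("):
        code = code.replace("functions.execute(", "").rstrip(")")
        code_dict = json.loads(code)
        language = code_dict.get("language", language)
        code = code_dict.get("code", code)

    print(language, repr(code))  # python 'print(1 + 1)'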
