Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 0e628a1

Browse files
committed
allow for evals to have follow ups
1 parent fc7e213 commit 0e628a1

File tree

3 files changed

+119
-11
lines changed

3 files changed

+119
-11
lines changed

config/eval-llms.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,21 @@ llms:
4949
max_prompt_tokens: 200000
5050
vision_enabled: true
5151

52+
claude-3.7-sonnet-thinking:
53+
display_name: Claude 3.7 Sonnet Thinking
54+
name: claude-3-7-sonnet-latest
55+
tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
56+
api_key_env: ANTHROPIC_API_KEY
57+
provider: anthropic
58+
url: https://api.anthropic.com/v1/messages
59+
max_prompt_tokens: 200000
60+
vision_enabled: true
61+
provider_params:
62+
disable_top_p: true
63+
disable_temperature: true
64+
enable_reasoning: true
65+
reasoning_tokens: 1024
66+
5267
gemini-2.0-flash:
5368
display_name: Gemini 2.0 Flash
5469
name: gemini-2-0-flash

evals/lib/eval.rb

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -220,11 +220,20 @@ def pdf_to_text(llm, path:)
220220
upload.destroy if upload
221221
end
222222

223-
def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
223+
def prompt_call(
224+
llm,
225+
system_prompt:,
226+
message:,
227+
temperature: nil,
228+
tools: nil,
229+
stream: false,
230+
output_thinking: false,
231+
followup: nil
232+
)
224233
if tools
225234
tools.each do |tool|
226235
tool.symbolize_keys!
227-
tool[:parameters].symbolize_keys! if tool[:parameters]
236+
tool[:parameters]&.each(&:symbolize_keys!)
228237
end
229238
end
230239
prompt =
@@ -236,16 +245,57 @@ def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, str
236245
prompt.tools = tools if tools
237246

238247
result = nil
239-
if stream
240-
result = []
241-
llm
242-
.llm_model
243-
.to_llm
244-
.generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
245-
result << partial
248+
generate_result =
249+
lambda do
250+
if stream
251+
result = []
252+
llm
253+
.llm_model
254+
.to_llm
255+
.generate(
256+
prompt,
257+
user: Discourse.system_user,
258+
temperature: temperature,
259+
output_thinking: output_thinking,
260+
) { |partial| result << partial }
261+
else
262+
result =
263+
llm.llm_model.to_llm.generate(
264+
prompt,
265+
user: Discourse.system_user,
266+
temperature: temperature,
267+
output_thinking: output_thinking,
268+
)
246269
end
247-
else
248-
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
270+
end
271+
272+
generate_result.call
273+
274+
if followup
275+
followup = followup.dup.symbolize_keys!
276+
prompt.tools = followup[:tools] if followup[:tools]
277+
prompt.push_model_response(result)
278+
message = followup[:message].dup
279+
message = message.symbolize_keys!
280+
message[:type] = message[:type].to_sym if message[:type]
281+
%i[id name].each do |key|
282+
if message[key].is_a?(Array)
283+
type, inner_key = message[key]
284+
# this allows us to dynamically set the id or name of the tool call
285+
prev = prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
286+
message[key] = prev[inner_key.to_sym] if prev
287+
end
288+
end
289+
290+
prompt.push(**message)
291+
292+
begin
293+
generate_result.call
294+
rescue => e
295+
# should not happen but it helps debugging...
296+
puts e
297+
result = []
298+
end
249299
end
250300
result
251301
end

lib/completions/prompt.rb

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,49 @@ def initialize(
4141
@tool_choice = tool_choice
4242
end
4343

44+
# this new api tries to create symmetry between responses and prompts
45+
# this means anything we get back from the model via endpoint can be easily appended
46+
def push_model_response(response)
47+
response = [response] if !response.is_a? Array
48+
49+
thinking, thinking_signature, redacted_thinking_signature = nil
50+
51+
response.each do |message|
52+
if message.is_a?(Thinking)
53+
# we can safely skip partials here
54+
next if message.partial?
55+
if message.redacted
56+
redacted_thinking_signature = message.signature
57+
else
58+
thinking = message.message
59+
thinking_signature = message.signature
60+
end
61+
elsif message.is_a?(ToolCall)
62+
next if message.partial?
63+
# this is a bit surprising about the API
64+
# needing to add arguments is not ideal
65+
push(
66+
type: :tool_call,
67+
content: { arguments: message.parameters }.to_json,
68+
id: message.id,
69+
name: message.name,
70+
)
71+
elsif message.is_a?(String)
72+
push(type: :model, content: message)
73+
else
74+
raise ArgumentError, "response must be an array of strings, ToolCalls, or Thinkings"
75+
end
76+
end
77+
78+
# anthropic rules are that we attach thinking to last for the response
79+
# it is odd, I wonder if long term we just keep thinking as a separate object
80+
if thinking || redacted_thinking_signature
81+
messages.last[:thinking] = thinking
82+
messages.last[:thinking_signature] = thinking_signature
83+
messages.last[:redacted_thinking_signature] = redacted_thinking_signature
84+
end
85+
end
86+
4487
def push(
4588
type:,
4689
content:,

0 commit comments

Comments
 (0)