Fix tool calling for Llama 3

aleroot · aleroot · commit f73336f6595c · 2026-03-27T22:29:19.000+01:00
Support multiple parallel tool calls and buffering for Llama 3

Llama 3 natively supports tool calling through an ipython environment which
generates arrays for multiple parallel tool invocations. Depending on the
model size and prompt, it generates either a JSON list of function objects
or a python-style array of function calls.

- Sets `startTag` to `<|python_tag|>` to ensure `ToolCallProcessor`
  correctly buffers tool output without leaking it to the streaming UI.
- Upgrades `Llama3ToolCallParser` to parse multiple parallel tool calls
  from JSON array payloads `[{"name": ...}]` during `parseEOS`.
- Upgrades `PythonicToolCallParser` to extract multiple sequential
  pythonic function calls `[func1(), func2()]` via `parseEOS`.
- Refactors `PythonicToolCallParser` to use modern high-performance
  Swift 5.7+ Regex literals instead of legacy NSRegularExpression.
- Adds unit tests for both parsers to verify multi-call arrays.
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -534,7 +534,8 @@ public final class LLMModelFactory: ModelFactory {
 
         // Auto-detect tool call format from model type if not explicitly set
         if mutableConfiguration.toolCallFormat == nil {
-            mutableConfiguration.toolCallFormat = ToolCallFormat.infer(from: baseConfig.modelType)
+            mutableConfiguration.toolCallFormat = ToolCallFormat.infer(
+                from: baseConfig.modelType, configData: configData)
         }
 
         // Load tokenizer and weights in parallel using async let.
diff --git a/Libraries/MLXLMCommon/Tool/Parsers/Llama3ToolCallParser.swift b/Libraries/MLXLMCommon/Tool/Parsers/Llama3ToolCallParser.swift
@@ -0,0 +1,88 @@
+// Copyright © 2025 Apple Inc.
+
+import Foundation
+
+/// Tool-call parser for the Llama 3 model family.
+///
+/// The payload may be preceded by `<|python_tag|>`; everything up to and including the
+/// first occurrence of that tag is dropped. What remains is tried as inline JSON
+/// (`{"name": ..., "parameters": {...}}`, or at end of stream a JSON array of such
+/// objects) and, if that fails, handed to `PythonicToolCallParser` for calls written
+/// like `get_weather(location="San Francisco")`.
+public struct Llama3ToolCallParser: ToolCallParser, Sendable {
+    public let startTag: String? = nil
+    public let endTag: String? = nil
+
+    public init() {}
+
+    /// Wire shape of one JSON function call; arguments may arrive under either key.
+    private struct LlamaFunction: Codable {
+        let name: String
+        let parameters: [String: JSONValue]?
+        let arguments: [String: JSONValue]?
+    }
+
+    /// Returns the text after the first `<|python_tag|>` (or all of `raw` when the tag
+    /// is absent) with surrounding whitespace and newlines trimmed.
+    private func payload(from raw: String) -> String {
+        let body: String
+        if let tag = raw.range(of: "<|python_tag|>") {
+            body = String(raw[tag.upperBound...])
+        } else {
+            body = raw
+        }
+        return body.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Converts a decoded JSON call into a `ToolCall`, preferring `parameters` over
+    /// `arguments` and using an empty dictionary when neither key is present.
+    private func makeToolCall(_ decoded: LlamaFunction) -> ToolCall {
+        let rawArguments = decoded.parameters ?? decoded.arguments ?? [:]
+        return ToolCall(
+            function: ToolCall.Function(
+                name: decoded.name,
+                arguments: rawArguments.mapValues { $0.anyValue }
+            )
+        )
+    }
+
+    /// Parses `content` as a single tool call.
+    ///
+    /// - Parameters:
+    ///   - content: Model output, optionally prefixed with `<|python_tag|>`.
+    ///   - tools: Tool schemas, forwarded to the Pythonic fallback parser.
+    /// - Returns: The call decoded from a single JSON object, otherwise whatever
+    ///   `PythonicToolCallParser.parse` returns for the trimmed payload.
+    public func parse(content: String, tools: [[String: any Sendable]]?) -> ToolCall? {
+        let candidate = payload(from: content)
+
+        // A single JSON object takes precedence over the Pythonic form.
+        if let bytes = candidate.data(using: .utf8),
+            let decoded = try? JSONDecoder().decode(LlamaFunction.self, from: bytes)
+        {
+            return makeToolCall(decoded)
+        }
+
+        return PythonicToolCallParser().parse(content: candidate, tools: tools)
+    }
+
+    /// Parses everything buffered at end of stream, allowing several parallel calls.
+    ///
+    /// Formats are tried in order: JSON array of calls, single JSON object, then the
+    /// Pythonic form (`[func1(args), func2(args)]` or a lone `func1(args)`).
+    ///
+    /// - Parameters:
+    ///   - toolCallBuffer: Buffered model output, optionally prefixed with `<|python_tag|>`.
+    ///   - tools: Tool schemas, forwarded to the Pythonic fallback parser.
+    /// - Returns: The decoded calls; empty when the payload cannot be encoded as UTF-8
+    ///   or when no format matches.
+    public func parseEOS(_ toolCallBuffer: String, tools: [[String: any Sendable]]?) -> [ToolCall] {
+        let candidate = payload(from: toolCallBuffer)
+
+        guard let bytes = candidate.data(using: .utf8) else {
+            return []
+        }
+
+        let decoder = JSONDecoder()
+
+        if let batch = try? decoder.decode([LlamaFunction].self, from: bytes) {
+            return batch.map { makeToolCall($0) }
+        }
+
+        if let single = try? decoder.decode(LlamaFunction.self, from: bytes) {
+            return [makeToolCall(single)]
+        }
+
+        return PythonicToolCallParser().parseEOS(candidate, tools: tools)
+    }
+}
diff --git a/Libraries/MLXLMCommon/Tool/Parsers/PythonicToolCallParser.swift b/Libraries/MLXLMCommon/Tool/Parsers/PythonicToolCallParser.swift
@@ -61,6 +61,42 @@ public struct PythonicToolCallParser: ToolCallParser, Sendable {
         return ToolCall(function: .init(name: funcName, arguments: arguments))
     }
 
+    /// Extracts every Pythonic tool call from the buffered output at end of stream.
+    ///
+    /// When the parser has a `startTag`, the buffer is split on it and each non-empty
+    /// segment is scanned separately; otherwise the whole buffer is scanned once.
+    ///
+    /// - Parameters:
+    ///   - toolCallBuffer: The buffered model output.
+    ///   - tools: Tool schemas, forwarded to `parseMultiple`.
+    /// - Returns: All calls found, in order of appearance.
+    public func parseEOS(_ toolCallBuffer: String, tools: [[String: any Sendable]]?) -> [ToolCall] {
+        guard let marker = startTag else {
+            return parseMultiple(content: toolCallBuffer, tools: tools)
+        }
+
+        var calls: [ToolCall] = []
+        for segment in toolCallBuffer.components(separatedBy: marker) where !segment.isEmpty {
+            calls.append(contentsOf: parseMultiple(content: segment, tools: tools))
+        }
+        return calls
+    }
+
+    /// Finds every `name(args)` call in `content` and converts each into a `ToolCall`.
+    ///
+    /// When the parser has an `endTag`, everything from its first occurrence onward is
+    /// discarded. The remainder is trimmed and scanned, so both a lone call and a list
+    /// such as `[func1(a=1), func2(b=2)]` produce one result per call, in order.
+    ///
+    /// - Parameters:
+    ///   - content: Raw model output holding zero or more Pythonic calls.
+    ///   - tools: Tool schemas, forwarded to `parseArguments`.
+    /// - Returns: The calls found, or an empty array when nothing matches.
+    private func parseMultiple(content: String, tools: [[String: any Sendable]]?) -> [ToolCall] {
+        var text = content
+
+        if let end = endTag, let endRange = text.range(of: end) {
+            text = String(text[..<endRange.lowerBound])
+        }
+
+        text = text.trimmingCharacters(in: .whitespacesAndNewlines)
+
+        // The argument list is consumed one unit at a time: either a complete quoted
+        // string literal, so a `)` inside a value such as "Paris (France)" does not end
+        // the call early, or any single character other than `)`. An unterminated quote
+        // falls through to the single-character branch, which stops at the first `)`.
+        // The atomic group (?>...) keeps matching linear in the number of quoted strings
+        // when the closing `)` is missing (e.g. output truncated mid-call).
+        let callRegex = #/(?s)(\w+)\(((?>'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^\)])*?)\)/#
+
+        return text.matches(of: callRegex).map { match in
+            let funcName = String(match.1)
+            let arguments = parseArguments(String(match.2), funcName: funcName, tools: tools)
+            return ToolCall(function: .init(name: funcName, arguments: arguments))
+        }
+    }
+
     /// Parse Pythonic keyword arguments: arg1='value1', arg2="value2", arg3=123
     private func parseArguments(
         _ argsString: String,
@@ -71,22 +107,12 @@ public struct PythonicToolCallParser: ToolCallParser, Sendable {
 
         // Pattern for key=value pairs, handling quoted strings with possible commas inside
         // This handles: key='value', key="value", key=123, key=True, key=None
-        let argPattern = #"(\w+)\s*=\s*('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,\)]+)"#
-
-        guard let regex = try? NSRegularExpression(pattern: argPattern, options: []) else {
-            return arguments
-        }
-
-        let matches = regex.matches(
-            in: argsString, options: [], range: NSRange(argsString.startIndex..., in: argsString))
+        let argRegex = #/(\w+)\s*=\s*('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,\)]+)/#
+        let matches = argsString.matches(of: argRegex)
 
         for match in matches {
-            guard let keyRange = Range(match.range(at: 1), in: argsString),
-                let valueRange = Range(match.range(at: 2), in: argsString)
-            else { continue }
-
-            let key = String(argsString[keyRange])
-            var value = String(argsString[valueRange]).trimmingCharacters(in: .whitespaces)
+            let key = String(match.1)
+            var value = String(match.2).trimmingCharacters(in: .whitespaces)
 
             // Remove surrounding quotes if present
             if (value.hasPrefix("'") && value.hasSuffix("'"))
diff --git a/Libraries/MLXLMCommon/Tool/ToolCallFormat.swift b/Libraries/MLXLMCommon/Tool/ToolCallFormat.swift
@@ -94,6 +94,10 @@ public enum ToolCallFormat: String, Sendable, Codable, CaseIterable {
     /// Example: `[TOOL_CALLS]get_weather [ARGS]{"location": "Tokyo"}`
     case mistral
 
+    /// Llama 3 inline JSON format.
+    /// Example: `<|python_tag|>{ "name": "func", "parameters": {...} }`
+    case llama3
+
     // MARK: - Factory Methods
 
     /// Create the appropriate parser for this format.
@@ -117,6 +121,8 @@ public enum ToolCallFormat: String, Sendable, Codable, CaseIterable {
             return MiniMaxM2ToolCallParser()
         case .mistral:
             return MistralToolCallParser()
+        case .llama3:
+            return Llama3ToolCallParser()
         }
     }
 
@@ -125,11 +131,35 @@ public enum ToolCallFormat: String, Sendable, Codable, CaseIterable {
     /// This method maps known model types to their corresponding tool call formats,
     /// enabling automatic format detection when loading models.
     ///
-    /// - Parameter modelType: The `model_type` value from config.json
+    /// - Parameters:
+    ///   - modelType: The `model_type` value from config.json
+    ///   - configData: The raw config.json data for inspecting secondary signals (e.g. `rope_scaling` for Llama 3)
     /// - Returns: The appropriate `ToolCallFormat`, or `nil` to use the default format
-    public static func infer(from modelType: String) -> ToolCallFormat? {
+    public static func infer(from modelType: String, configData: Data? = nil) -> ToolCallFormat? {
         let type = modelType.lowercased()
 
+        // Llama family (need secondary signal for Llama 3 vs 1/2)
+        if type == "llama" {
+            guard let data = configData,
+                let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
+            else { return nil }
+
+            // Secondary signal 1: vocab_size >= 128000 (Llama 3 uses 128256, Llama 2 uses 32000)
+            if let vocabSize = json["vocab_size"] as? Int, vocabSize >= 128000 {
+                return .llama3
+            }
+
+            // Secondary signal 2: rope_scaling with rope_type == "llama3"
+            if let ropeScaling = json["rope_scaling"] as? [String: Any],
+                let ropeType = ropeScaling["rope_type"] as? String,
+                ropeType == "llama3"
+            {
+                return .llama3
+            }
+
+            return nil
+        }
+
         // LFM2 family (lfm2, lfm2_moe, lfm2_5, lfm25, etc.)
         if type.hasPrefix("lfm2") {
             return .lfm2
diff --git a/Libraries/MLXLMCommon/Tool/ToolCallProcessor.swift b/Libraries/MLXLMCommon/Tool/ToolCallProcessor.swift
@@ -101,17 +101,72 @@ public class ToolCallProcessor {
     // MARK: - Private Methods
 
     /// Process chunk for inline formats (no wrapper tags).
+    ///
+    /// Uses brace counting to detect when output looks like a JSON tool call.
+    /// While braces are unbalanced the content is buffered (returns `nil`)
+    /// so partial JSON is never leaked to the UI.
     private func processInlineChunk(_ chunk: String) -> String? {
-        toolCallBuffer += chunk
+        switch state {
+        case .normal:
+            // Check if this chunk starts what looks like a JSON tool call
+            if let braceIndex = chunk.firstIndex(of: "{") {
+                let leading = String(chunk[..<braceIndex])
+                let jsonPart = String(chunk[braceIndex...])
+                toolCallBuffer = jsonPart
+                state = .collectingToolCall
+
+                if let toolCall = parser.parse(content: toolCallBuffer, tools: tools) {
+                    toolCalls.append(toolCall)
+                    toolCallBuffer = ""
+                    state = .normal
+                    return leading.isEmpty ? nil : leading
+                }
+
+                // Still collecting — check if braces are balanced (would mean parse
+                // failed on complete JSON, so it's not a tool call)
+                if jsonBracesBalanced(toolCallBuffer) {
+                    state = .normal
+                    let buffer = toolCallBuffer
+                    toolCallBuffer = ""
+                    return leading + buffer
+                }
+
+                return leading.isEmpty ? nil : leading
+            }
+
+            // No brace seen — pass through as regular text
+            return chunk
 
-        if let toolCall = parser.parse(content: toolCallBuffer, tools: tools) {
-            toolCalls.append(toolCall)
-            toolCallBuffer = ""
+        case .potentialToolCall, .collectingToolCall:
+            toolCallBuffer += chunk
+
+            if let toolCall = parser.parse(content: toolCallBuffer, tools: tools) {
+                toolCalls.append(toolCall)
+                toolCallBuffer = ""
+                state = .normal
+                return nil
+            }
+
+            // If braces are balanced but parse failed, this isn't a tool call — flush
+            if jsonBracesBalanced(toolCallBuffer) {
+                state = .normal
+                let buffer = toolCallBuffer
+                toolCallBuffer = ""
+                return buffer
+            }
+
+            // Still collecting
             return nil
         }
+    }
 
-        // Return chunk as-is; caller handles incomplete inline tool calls
-        return chunk
+    /// Reports whether every `{` in `text` is matched by a `}`.
+    ///
+    /// Braces inside double-quoted JSON string literals are ignored, with backslash
+    /// escapes honoured, so a string value such as `"}"` cannot make a partially
+    /// streamed object look complete.
+    ///
+    /// - Parameter text: The buffered candidate JSON.
+    /// - Returns: `true` when the net brace depth outside string literals is zero.
+    private func jsonBracesBalanced(_ text: String) -> Bool {
+        var depth = 0
+        var inString = false
+        var escaped = false
+
+        for ch in text {
+            if inString {
+                if escaped {
+                    // The character after a backslash never terminates the string.
+                    escaped = false
+                } else if ch == "\\" {
+                    escaped = true
+                } else if ch == "\"" {
+                    inString = false
+                }
+                continue
+            }
+
+            switch ch {
+            case "\"": inString = true
+            case "{": depth += 1
+            case "}": depth -= 1
+            default: break
+            }
+        }
+        return depth == 0
+    }
 
     /// Process chunk for tagged formats.
diff --git a/Tests/MLXLMTests/ToolTests.swift b/Tests/MLXLMTests/ToolTests.swift

Original file line number	Diff line number	Diff line change
`@@ -534,7 +534,8 @@ public final class LLMModelFactory: ModelFactory {`
`534`	`534`
`535`	`535`	`// Auto-detect tool call format from model type if not explicitly set`
`536`	`536`	`if mutableConfiguration.toolCallFormat == nil {`
`537`		`- mutableConfiguration.toolCallFormat = ToolCallFormat.infer(from: baseConfig.modelType)`
	`537`	`+ mutableConfiguration.toolCallFormat = ToolCallFormat.infer(`
	`538`	`+ from: baseConfig.modelType, configData: configData)`
`538`	`539`	`}`
`539`	`540`
`540`	`541`	`// Load tokenizer and weights in parallel using async let.`