diff --git a/agent_test_data/get_weather_kakinada.json b/agent_test_data/get_weather_kakinada.json new file mode 100644 index 00000000..07d53d17 --- /dev/null +++ b/agent_test_data/get_weather_kakinada.json @@ -0,0 +1,259 @@ +{ + "use_case": "Get weather for Kakinada", + "user_prompt": "What's the weather in Kakinada?", + "tool_calls": [ + { + "url": "https://geocoding-api.open-meteo.com/v1/search?name=Kakinada&count=1&language=en&format=json", + "method": "GET", + "fields": { + "url": "https://geocoding-api.open-meteo.com/v1/search?name=Kakinada&count=1&language=en&format=json", + "method": "GET", + "headers": "{\"Content-Type\":\"application/json\"}", + "body": "" + }, + "response": { + "success": true, + "action": "GET", + "status": 200, + "url": "https://geocoding-api.open-meteo.com/v1/search?name=Kakinada&count=1&language=en&format=json", + "output": { + "generationtime_ms": 0.15807152, + "results": [ + { + "admin1": "Andhra Pradesh", + "admin1_id": 1278629, + "admin2": "Kakinada", + "admin2_id": 12680261, + "admin3": "Kakinada Urban", + "admin3_id": 12686894, + "country": "India", + "country_code": "IN", + "country_id": 1269750, + "elevation": 6, + "feature_code": "PPL", + "id": 1268561, + "latitude": 16.96036, + "longitude": 82.23809, + "name": "Kākināda", + "population": 384182, + "timezone": "Asia/Kolkata" + } + ] + }, + "raw_response": { + "Result": "", + "body": { + "generationtime_ms": 0.15807152, + "results": [ + { + "admin1": "Andhra Pradesh", + "admin1_id": 1278629, + "admin2": "Kakinada", + "admin2_id": 12680261, + "admin3": "Kakinada Urban", + "admin3_id": 12686894, + "country": "India", + "country_code": "IN", + "country_id": 1269750, + "elevation": 6, + "feature_code": "PPL", + "id": 1268561, + "latitude": 16.96036, + "longitude": 82.23809, + "name": "Kākināda", + "population": 384182, + "timezone": "Asia/Kolkata" + } + ] + }, + "headers": { + "Connection": "keep-alive", + "Content-Encoding": "deflate", + "Content-Length": "257", + "Content-Type": "application/json; charset=utf-8", + "Date": "Sat, 27 Dec 2025 06:22:08 GMT", + "X-Encoding-Time": "0.0031948089599609375 ms" + }, + "status": 200, + "success": true, + "url": "https://geocoding-api.open-meteo.com/v1/search?name=Kakinada&count=1&language=en&format=json" + }, + "retries": 1 + } + }, + { + "url": "https://api.open-meteo.com/v1/forecast?latitude=16.96036&longitude=82.23809&timezone=Asia%2FKolkata¤t=temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,weather_code,wind_speed_10m,wind_direction_10m&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&forecast_days=3", + "method": "GET", + "fields": { + "url": "https://api.open-meteo.com/v1/forecast?latitude=16.96036&longitude=82.23809&timezone=Asia%2FKolkata¤t=temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,weather_code,wind_speed_10m,wind_direction_10m&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&forecast_days=3", + "method": "GET", + "headers": "{\"Content-Type\":\"application/json\"}", + "body": "" + }, + "response": { + "success": true, + "action": "GET", + "status": 200, + "url": "https://api.open-meteo.com/v1/forecast?latitude=16.96036&longitude=82.23809&timezone=Asia%2FKolkata¤t=temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,weather_code,wind_speed_10m,wind_direction_10m&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&forecast_days=3", + "output": { + "current": { + "apparent_temperature": 30.6, + "interval": 900, + 
"precipitation": 0, + "relative_humidity_2m": 57, + "temperature_2m": 27.4, + "time": "2025-12-27T11:45", + "weather_code": 1, + "wind_speed_10m": 6 + }, + "current_units": { + "apparent_temperature": "°C", + "interval": "seconds", + "precipitation": "mm", + "relative_humidity_2m": "%", + "temperature_2m": "°C", + "time": "iso8601", + "weather_code": "wmo code", + "wind_speed_10m": "km/h" + }, + "elevation": 7, + "generationtime_ms": 0.3038644790649414, + "latitude": 16.875, + "longitude": 82.25, + "timezone": "Asia/Kolkata", + "timezone_abbreviation": "GMT+5:30", + "utc_offset_seconds": 19800 + }, + "raw_response": { + "Result": "", + "body": { + "current": { + "apparent_temperature": 30.6, + "interval": 900, + "precipitation": 0, + "relative_humidity_2m": 57, + "temperature_2m": 27.4, + "time": "2025-12-27T11:45", + "weather_code": 1, + "wind_speed_10m": 6 + }, + "current_units": { + "apparent_temperature": "°C", + "interval": "seconds", + "precipitation": "mm", + "relative_humidity_2m": "%", + "temperature_2m": "°C", + "time": "iso8601", + "weather_code": "wmo code", + "wind_speed_10m": "km/h" + }, + "elevation": 7, + "generationtime_ms": 0.3038644790649414, + "latitude": 16.875, + "longitude": 82.25, + "timezone": "Asia/Kolkata", + "timezone_abbreviation": "GMT+5:30", + "utc_offset_seconds": 19800 + }, + "headers": { + "Connection": "keep-alive", + "Content-Encoding": "deflate", + "Content-Type": "application/json; charset=utf-8", + "Date": "Sat, 27 Dec 2025 06:22:17 GMT", + "Transfer-Encoding": "chunked" + }, + "status": 200, + "success": true, + "url": "https://api.open-meteo.com/v1/forecast?latitude=16.96036&longitude=82.23809&timezone=Asia%2FKolkata¤t=temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,weather_code,wind_speed_10m,wind_direction_10m&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&forecast_days=3" + }, + "retries": 1 + } + } + ], + "expected_decisions": [ + { + "i": 0, + "action": "api", + "tool": "http", + "category": "singul", + "confidence": 0.93, + "runs": "1", + "fields": [ + { + "key": "url", + "value": "https://geocoding-api.open-meteo.com/v1/search?name=Kakinada&count=1&language=en&format=json" + }, + { + "key": "method", + "value": "GET" + }, + { + "key": "headers", + "value": "{\"Accept\":\"application/json\"}" + }, + { + "key": "body", + "value": "" + } + ], + "reason": "Geocode Kakinada to get latitude/longitude for weather lookup." + }, + { + "i": 1, + "action": "api", + "tool": "http", + "category": "singul", + "confidence": 0.9, + "runs": "1", + "fields": [ + { + "key": "url", + "value": "https://api.open-meteo.com/v1/forecast?latitude=16.96036&longitude=82.23809&timezone=Asia%2FKolkata¤t=temperature_2m,relative_humidity_2m,apparent_temperature,precipitation,weather_code,wind_speed_10m,wind_direction_10m&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&forecast_days=3" + }, + { + "key": "method", + "value": "GET" + }, + { + "key": "headers", + "value": "{\"Accept\":\"application/json\"}" + }, + { + "key": "body", + "value": "" + } + ], + "reason": "Fetch current weather for Kakinada using Open-Meteo (no API key required)." + }, + { + "i": 2, + "action": "answer", + "tool": "", + "category": "standalone", + "confidence": 0.6, + "runs": "1", + "fields": [ + { + "key": "output", + "value": "I’m fetching the current weather in Kakinada now." + } + ], + "reason": "Provide the weather result to the user after API responses are available." 
+ }, + { + "i": 2, + "action": "finish", + "tool": "", + "category": "finish", + "confidence": 0.92, + "runs": "1", + "fields": [ + { + "key": "body", + "value": "### Current weather in Kakinada (Andhra Pradesh, India)\n- **Time (local):** 2025-12-28 17:00 (Asia/Kolkata)\n- **Temperature:** 24.8°C\n- **Feels like:** 25.6°C\n- **Humidity:** 57%\n- **Wind speed:** 7.6 km/h\n- **Precipitation:** 0 mm\n- **Conditions:** Mostly clear (WMO code 1)" + } + ], + "reason": "Delivered the current weather for Kakinada from the fetched Open-Meteo data." + } + ] +} \ No newline at end of file diff --git a/agent_test_mock.go b/agent_test_mock.go new file mode 100644 index 00000000..c195d89e --- /dev/null +++ b/agent_test_mock.go @@ -0,0 +1,485 @@ +package shuffle + +import ( + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "log" + "net/url" + "os" + "path/filepath" + "strings" +) + +// MockToolCall represents a single tool call with its request and response +type MockToolCall struct { + URL string `json:"url"` + Method string `json:"method"` + Fields map[string]string `json:"fields"` + Response map[string]interface{} `json:"response"` +} + +// MockUseCaseData represents the test data for a single use case +type MockUseCaseData struct { + UseCase string `json:"use_case"` + UserPrompt string `json:"user_prompt"` + ToolCalls []MockToolCall `json:"tool_calls"` + ExpectedDecisions []map[string]interface{} `json:"expected_decisions"` +} + +// RunAgentDecisionMockHandler handles agent decision execution in test mode +// This function is called instead of the real Singul endpoint when AGENT_TEST_MODE=true +// +// # It loads mock data based on use case and matches tool calls by URL and fields +// +// Parameters: +// - execution: The full workflow execution context +// - decision: The agent decision to execute containing Tool, Action, Fields, etc. 
+// +// Returns: +// - rawResponse: The mock tool result as bytes (in Singul format) +// - debugUrl: Debug URL (empty in test mode) +// - appname: The app name (same as decision.Tool) +// - error: Any error that occurred +func RunAgentDecisionMockHandler(execution WorkflowExecution, decision AgentDecision) ([]byte, string, string, error) { + log.Printf("[DEBUG][%s] Mock handler called for tool=%s, action=%s", execution.ExecutionId, decision.Tool, decision.Action) + + useCase := os.Getenv("AGENT_TEST_USE_CASE") + if useCase == "" { + log.Printf("[ERROR][%s] AGENT_TEST_USE_CASE not set - cannot determine which test data to load", execution.ExecutionId) + return nil, "", decision.Tool, errors.New("AGENT_TEST_USE_CASE environment variable not set") + } + + response, err := GetMockSingulResponse(useCase, decision.Fields) + if err != nil { + log.Printf("[ERROR][%s] Failed to get mock response: %s", execution.ExecutionId, err) + return nil, "", decision.Tool, err + } + + // Parse the response to extract raw_response (same as real Singul handler does) + var outputMapped SchemalessOutput + err = json.Unmarshal(response, &outputMapped) + if err != nil { + log.Printf("[ERROR][%s] Failed to unmarshal mock response: %s", execution.ExecutionId, err) + return response, "", decision.Tool, err + } + + // Extract the raw_response field + body := response + if val, ok := outputMapped.RawResponse.(string); ok { + body = []byte(val) + } else if val, ok := outputMapped.RawResponse.([]byte); ok { + body = val + } else if val, ok := outputMapped.RawResponse.(map[string]interface{}); ok { + marshalledRawResp, err := json.MarshalIndent(val, "", " ") + if err != nil { + log.Printf("[ERROR][%s] Failed to marshal raw response: %s", execution.ExecutionId, err) + } else { + body = marshalledRawResp + } + } + + log.Printf("[DEBUG][%s] Returning mock response for %s (success=%v, response_size=%d bytes)", + execution.ExecutionId, decision.Tool, outputMapped.Success, len(body)) + + // Return in same format as real Singul handler: (body, debugUrl, appname, error) + return body, "", decision.Tool, nil +} + +// GetMockSingulResponse is the function that returns mock Singul responses +// It loads the use case data and matches based on URL and fields +// +// Parameters: +// - useCase: The use case name +// - fields: The request fields containing url, method, headers, body +// +// Returns: +// - response: The mock Singul response as bytes (in Singul format) +// - error: Any error that occurred +func GetMockSingulResponse(useCase string, fields []Valuereplace) ([]byte, error) { + useCaseData, err := loadUseCaseData(useCase) + if err != nil { + return nil, err + } + + requestURL := extractFieldValue(fields, "url") + if requestURL == "" { + return nil, errors.New("no URL found in request fields") + } + + log.Printf("[DEBUG] Looking for mock data with URL: %s", requestURL) + + var candidates []MockToolCall + reqURLParsed, err := url.Parse(requestURL) + if err != nil { + log.Printf("[ERROR] Invalid request URL %s: %v", requestURL, err) + return nil, fmt.Errorf("invalid request URL: %w", err) + } + for _, tc := range useCaseData.ToolCalls { + if urlsEqual(reqURLParsed, tc.URL) { + candidates = append(candidates, tc) + } + } + + // If no exact matches, try fuzzy matching + if len(candidates) == 0 { + log.Printf("[DEBUG] No exact match, trying fuzzy matching...") + bestMatch, score := findBestFuzzyMatch(reqURLParsed, useCaseData.ToolCalls) + if score >= 0.80 { + log.Printf("[INFO] Found fuzzy match with %.1f%% similarity: %s", score*100, 
bestMatch.URL) + candidates = append(candidates, bestMatch) + } else { + return nil, fmt.Errorf("no mock data found for URL: %s in use case: %s (best match: %.1f%%)", requestURL, useCase, score*100) + } + } + + // If only one match, return it + if len(candidates) == 1 { + log.Printf("[DEBUG] Found exact match for URL: %s", requestURL) + return marshalResponse(candidates[0].Response) + } + + // Multiple matches - compare fields to find exact match + log.Printf("[DEBUG] Found %d candidates for URL, comparing fields...", len(candidates)) + for _, candidate := range candidates { + if fieldsMatch(fields, candidate.Fields) { + log.Printf("[DEBUG] Found exact match based on fields") + return marshalResponse(candidate.Response) + } + } + + // No exact match - return first candidate with a warning + log.Printf("[WARNING] No exact field match found, returning first candidate") + return marshalResponse(candidates[0].Response) +} + +// urlsEqual reports whether two URLs match exactly, ignoring query-parameter order. +func urlsEqual(req *url.URL, stored string) bool { + storedURL, err := url.Parse(stored) + if err != nil { + log.Printf("[WARN] Invalid stored URL %s: %v", stored, err) + return false + } + if req.Scheme != storedURL.Scheme || req.Host != storedURL.Host || req.Path != storedURL.Path { + return false + } + reqQuery := req.Query() + storedQuery := storedURL.Query() + // If the number of parameters differs, not a match + if len(reqQuery) != len(storedQuery) { + return false + } + + for key, reqVals := range reqQuery { + storedVals, ok := storedQuery[key] + if !ok { + return false + } + if len(reqVals) != len(storedVals) { + return false + } + for i, v := range reqVals { + if v != storedVals[i] { + return false + } + } + } + return true +} + +// loadUseCaseData loads the test data for a given use case from JSON file +func loadUseCaseData(useCase string) (*MockUseCaseData, error) { + possiblePaths := []string{} + + if envPath := os.Getenv("AGENT_TEST_DATA_PATH"); envPath != "" { + possiblePaths = append(possiblePaths, envPath) + } + + possiblePaths = append(possiblePaths, "agent_test_data") + possiblePaths = append(possiblePaths, "../shuffle-shared/agent_test_data") + possiblePaths = append(possiblePaths, "../../shuffle-shared/agent_test_data") + + if homeDir, err := os.UserHomeDir(); err == nil { + possiblePaths = append(possiblePaths, filepath.Join(homeDir, "Documents", "shuffle-shared", "agent_test_data")) + } + + var filePath string + var foundPath string + + for _, basePath := range possiblePaths { + testPath := filepath.Join(basePath, fmt.Sprintf("%s.json", useCase)) + if _, err := os.Stat(testPath); err == nil { + filePath = testPath + foundPath = basePath + break + } + } + + if filePath == "" { + return nil, fmt.Errorf("could not find test data file %s.json in any of these paths: %v", useCase, possiblePaths) + } + + log.Printf("[DEBUG] Loading use case data from: %s", filePath) + + data, err := ioutil.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("failed to read use case file %s: %s", filePath, err) + } + + var useCaseData MockUseCaseData + err = json.Unmarshal(data, &useCaseData) + if err != nil { + return nil, fmt.Errorf("failed to parse use case data: %s", err) + } + + log.Printf("[DEBUG] Loaded use case '%s' with %d tool calls from %s", useCaseData.UseCase, len(useCaseData.ToolCalls), foundPath) + + return &useCaseData, nil +} + +// extractFieldValue extracts a field value by key from the fields array +func extractFieldValue(fields
[]Valuereplace, key string) string { + for _, field := range fields { + if field.Key == key { + return field.Value + } + } + return "" +} + +func fieldsMatch(requestFields []Valuereplace, storedFields map[string]string) bool { + // Convert request fields to map for easier comparison + requestMap := make(map[string]string) + for _, field := range requestFields { + requestMap[field.Key] = field.Value + } + + for key, storedValue := range storedFields { + requestValue, exists := requestMap[key] + if !exists || requestValue != storedValue { + return false + } + } + + return true +} + +func marshalResponse(response map[string]interface{}) ([]byte, error) { + data, err := json.Marshal(response) + if err != nil { + return nil, fmt.Errorf("failed to marshal response: %s", err) + } + return data, nil +} + +// analyzeTestFailureWithLLM uses LLM to provide detailed analysis of why a test failed +func analyzeTestFailureWithLLM(actualDecisions []interface{}, expectedDecisions []map[string]interface{}, isTimeout bool) string { + cleanActual := stripRawResponses(actualDecisions) + cleanExpected := stripRawResponsesFromMaps(expectedDecisions) + + actualJSON, err := json.MarshalIndent(cleanActual, "", " ") + if err != nil { + return "Failed to analyze: could not marshal actual decisions" + } + + expectedJSON, err := json.MarshalIndent(cleanExpected, "", " ") + if err != nil { + return "Failed to analyze: could not marshal expected decisions" + } + + systemMessage := `You are analyzing agent test failures. +Focus on what the agent ACTUALLY did and where it got stuck. + +Output rules: +- Start with what the agent successfully completed +- Identify the SPECIFIC action and tool where it failed or got stuck +- Compare only that failure point with what was expected +- Ignore answer and finish actions - focus only on API calls and tool usage +- Be concise (max 2-3 sentences) +- Use plain language without special characters like quotes, backticks, or brackets +- Name the specific API or tool that failed + +Example output format: +Agent completed geocoding API call successfully. Got stuck on weather API call - agent used URL with different parameters than expected (missing daily forecast params and using timezone=auto instead of Asia/Kolkata).` + + var userMessage string + if isTimeout { + userMessage = fmt.Sprintf(`The agent test timed out. + +What the agent ACTUALLY did: +%s + +What was EXPECTED (full test plan): +%s + +Analyze from the agent's perspective: +1. Which API calls or tools did the agent successfully complete? +2. Where exactly did it get stuck or fail? +3. What was different about that specific action compared to what was expected? +4. Ignore any answer or finish actions - focus only on the actual work (API calls, tools).`, string(actualJSON), string(expectedJSON)) + } else { + userMessage = fmt.Sprintf(`The agent test failed. + +What the agent ACTUALLY did: +%s + +What was EXPECTED: +%s + +Analyze from the agent's perspective: +1. Which actions did the agent complete successfully? +2. Which specific action/tool failed and why? +3. What was the difference between what the agent did vs what was expected? +4. 
Ignore any answer or finish actions.`, string(actualJSON), string(expectedJSON)) + } + + responseBody, err := RunAiQuery(systemMessage, userMessage) + if err != nil { + log.Printf("[ERROR] Failed to get LLM analysis: %s", err) + return "Failed to analyze with LLM" + } + + failureReason := strings.TrimSpace(responseBody) + if after, ok := strings.CutPrefix(failureReason, "```"); ok { + failureReason = after + } + if after, ok := strings.CutSuffix(failureReason, "```"); ok { + failureReason = after + } + failureReason = strings.TrimSpace(failureReason) + + log.Printf("[INFO] LLM Analysis: %s", failureReason) + return failureReason +} + +// Hmmm, let's see if this helps with token usage, stripRawResponses removes raw_response fields from decisions to save LLM tokens +func stripRawResponses(decisions []interface{}) []interface{} { + cleaned := make([]interface{}, len(decisions)) + for i, d := range decisions { + if decisionMap, ok := d.(map[string]interface{}); ok { + cleanedDecision := make(map[string]interface{}) + for k, v := range decisionMap { + // Skip raw_response and other verbose fields + if k != "raw_response" && k != "RawResponse" && k != "debug_url" && k != "DebugUrl" { + cleanedDecision[k] = v + } + } + cleaned[i] = cleanedDecision + } else { + cleaned[i] = d + } + } + return cleaned +} + +// stripRawResponsesFromMaps removes raw_response fields from expected decisions +func stripRawResponsesFromMaps(decisions []map[string]interface{}) []map[string]interface{} { + cleaned := make([]map[string]interface{}, len(decisions)) + for i, decisionMap := range decisions { + cleanedDecision := make(map[string]interface{}) + for k, v := range decisionMap { + if k != "raw_response" && k != "RawResponse" && k != "debug_url" && k != "DebugUrl" { + cleanedDecision[k] = v + } + } + cleaned[i] = cleanedDecision + } + return cleaned +} + +// findBestFuzzyMatch finds the most similar URL from stored tool calls +// Returns the best match and its similarity score (0.0 to 1.0) +func findBestFuzzyMatch(reqURL *url.URL, toolCalls []MockToolCall) (MockToolCall, float64) { + var bestMatch MockToolCall + bestScore := 0.0 + + for _, tc := range toolCalls { + storedURL, err := url.Parse(tc.URL) + if err != nil { + continue + } + + score := calculateURLSimilarity(reqURL, storedURL) + if score > bestScore { + bestScore = score + bestMatch = tc + } + } + + return bestMatch, bestScore +} + +// calculateURLSimilarity returns a score from 0.0 to 1.0 indicating how similar two URLs are +func calculateURLSimilarity(url1, url2 *url.URL) float64 { + score := 0.0 + totalWeight := 0.0 + + // Scheme (10% weight) + if url1.Scheme == url2.Scheme { + score += 0.10 + } + totalWeight += 0.10 + + // Host (20% weight) + if url1.Host == url2.Host { + score += 0.20 + } + totalWeight += 0.20 + + // Path (20% weight) + if url1.Path == url2.Path { + score += 0.20 + } + totalWeight += 0.20 + + // Query parameters (50% weight) + query1 := url1.Query() + query2 := url2.Query() + + if len(query1) == 0 && len(query2) == 0 { + score += 0.50 + } else if len(query1) > 0 || len(query2) > 0 { + matchingParams := 0 + totalParams := 0 + + allKeys := make(map[string]bool) + for k := range query1 { + allKeys[k] = true + } + for k := range query2 { + allKeys[k] = true + } + totalParams = len(allKeys) + + // Count how many match + for key := range allKeys { + val1, ok1 := query1[key] + val2, ok2 := query2[key] + + if ok1 && ok2 { + // Both have this key - check if values match + if len(val1) == len(val2) { + allMatch := true + for i := range val1 { + 
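// A query parameter only counts as matching when both URLs have the key and
// every value for that key matches in order (checked element-wise below).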
if val1[i] != val2[i] { + allMatch = false + break + } + } + if allMatch { + matchingParams++ + } + } + } + } + + if totalParams > 0 { + paramScore := float64(matchingParams) / float64(totalParams) + score += paramScore * 0.50 + } + } + totalWeight += 0.50 + + return score / totalWeight +} diff --git a/ai.go b/ai.go index 0ad9500e..5f50fc1b 100644 --- a/ai.go +++ b/ai.go @@ -14,7 +14,9 @@ import ( "log" "math/rand" "net/http" + "net/url" "os" + "path/filepath" "reflect" "regexp" "sort" @@ -11974,3 +11976,530 @@ func buildManualInputList(history []ConversationMessage, newPrompt string) []map return items } + +// RunAgentTests runs automated tests for AI agents using test data from agent_test_data/ +func RunAgentTests(resp http.ResponseWriter, request *http.Request) { + ctx := GetContext(request) + + log.Printf("[INFO] Starting automated agent tests") + + // Try multiple possible paths (same as mock handler) + possiblePaths := []string{} + + if envPath := os.Getenv("AGENT_TEST_DATA_PATH"); envPath != "" { + possiblePaths = append(possiblePaths, envPath) + } + possiblePaths = append(possiblePaths, "agent_test_data") + possiblePaths = append(possiblePaths, "../shuffle-shared/agent_test_data") + possiblePaths = append(possiblePaths, "../../shuffle-shared/agent_test_data") + + // Add cross-platform paths using home directory + if homeDir, err := os.UserHomeDir(); err == nil { + possiblePaths = append(possiblePaths, filepath.Join(homeDir, "Documents", "shuffle-shared", "agent_test_data")) + } + + // Find the first valid path + testDataPath := "" + for _, path := range possiblePaths { + if _, err := os.Stat(path); err == nil { + testDataPath = path + log.Printf("[INFO] Found test data directory: %s", path) + break + } + } + + if testDataPath == "" { + log.Printf("[ERROR] Could not find test data directory in any of: %v", possiblePaths) + resp.WriteHeader(500) + resp.Write([]byte(fmt.Sprintf(`{"success": false, "reason": "Test data directory not found. 
Tried: %v"}`, possiblePaths))) + return + } + + // Read all JSON files from test data directory + files, err := ioutil.ReadDir(testDataPath) + if err != nil { + log.Printf("[ERROR] Failed to read test data directory: %s", err) + resp.WriteHeader(500) + resp.Write([]byte(fmt.Sprintf(`{"success": false, "reason": "Failed to read test data: %s"}`, err))) + return + } + + totalTests := 0 + passedTests := 0 + failedTests := 0 + + type TestResult struct { + TestCase string `json:"test_case"` + Status string `json:"status"` + Error string `json:"error,omitempty"` + } + results := []TestResult{} + + // Run tests for each JSON file + for _, file := range files { + if file.IsDir() || !strings.HasSuffix(file.Name(), ".json") { + continue + } + + totalTests++ + useCaseName := strings.TrimSuffix(file.Name(), ".json") + + log.Printf("[INFO] ========== Test %d: %s ==========", totalTests, useCaseName) + + // Load test case + testFilePath := filepath.Join(testDataPath, file.Name()) + testData, err := ioutil.ReadFile(testFilePath) + if err != nil { + log.Printf("[ERROR] Failed to read test file %s: %s", file.Name(), err) + failedTests++ + results = append(results, TestResult{ + TestCase: file.Name(), + Status: "FAIL", + Error: fmt.Sprintf("Failed to read file: %s", err), + }) + continue + } + + var testCase MockUseCaseData + err = json.Unmarshal(testData, &testCase) + if err != nil { + log.Printf("[ERROR] Failed to parse test file %s: %s", file.Name(), err) + failedTests++ + results = append(results, TestResult{ + TestCase: file.Name(), + Status: "FAIL", + Error: fmt.Sprintf("Failed to parse JSON: %s", err), + }) + continue + } + + // Set environment for this test case + os.Setenv("AGENT_TEST_MODE", "true") + os.Setenv("AGENT_TEST_USE_CASE", useCaseName) + + // Run the test + passed, testErr := runSingleAgentTest(ctx, request, testCase, useCaseName) + if passed { + passedTests++ + log.Printf("[INFO] ✅ Test %d PASSED: %s", totalTests, useCaseName) + results = append(results, TestResult{ + TestCase: file.Name(), + Status: "PASS", + }) + } else { + failedTests++ + log.Printf("[ERROR] ❌ Test %d FAILED: %s", totalTests, useCaseName) + results = append(results, TestResult{ + TestCase: file.Name(), + Status: "FAIL", + Error: testErr, + }) + } + } + + log.Printf("[INFO] ========== Test Summary ==========") + log.Printf("[INFO] Total: %d, Passed: %d, Failed: %d", totalTests, passedTests, failedTests) + + // Clean up: Disable test mode after tests complete + os.Setenv("AGENT_TEST_MODE", "false") + os.Setenv("AGENT_TEST_USE_CASE", "") + log.Printf("[INFO] Test mode disabled") + + // Build response + type TestResponse struct { + Success bool `json:"success"` + Total int `json:"total"` + Passed int `json:"passed"` + Failed int `json:"failed"` + Results []TestResult `json:"results"` + } + + response := TestResponse{ + Success: failedTests == 0, + Total: totalTests, + Passed: passedTests, + Failed: failedTests, + Results: results, + } + + responseBytes, _ := json.Marshal(response) + resp.WriteHeader(200) + resp.Write(responseBytes) +} + +// runSingleAgentTest runs a single test case +func runSingleAgentTest(ctx context.Context, request *http.Request, testCase MockUseCaseData, useCaseName string) (bool, string) { + // Get user prompt from test case + userPrompt := "" + if data, ok := testCase.ToolCalls[0].Response["user_prompt"].(string); ok { + userPrompt = data + } + + // Try to get from top level + type TestCaseWithPrompt struct { + UserPrompt string `json:"user_prompt"` + } + var tempData TestCaseWithPrompt + 
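// Note: MockUseCaseData already exposes UserPrompt, so testCase.UserPrompt is
// normally populated by the earlier json.Unmarshal; the marshal/unmarshal
// roundtrip below simply re-reads the same top-level field as a fallback.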
testBytes, _ := json.Marshal(testCase) + json.Unmarshal(testBytes, &tempData) + if tempData.UserPrompt != "" { + userPrompt = tempData.UserPrompt + } + + if userPrompt == "" { + log.Printf("[ERROR] No user_prompt found in test case") + return false, "No user_prompt found in test case" + } + + // Build request body for agent_starter + requestBody := map[string]interface{}{ + "id": uuid.NewV4().String(), + "name": "agent", + "app_name": "AI Agent", + "app_id": "shuffle_agent", + "app_version": "1.0.0", + "environment": "cloud", + "parameters": []map[string]string{ + {"name": "app_name", "value": "openai"}, + {"name": "input", "value": userPrompt}, + {"name": "action", "value": "API"}, + }, + } + + requestBodyBytes, err := json.Marshal(requestBody) + if err != nil { + log.Printf("[ERROR] Failed to marshal request body: %s", err) + return false, fmt.Sprintf("Failed to marshal request body: %s", err) + } + + // Call agent_starter endpoint + baseUrl := "http://localhost:5002" + if os.Getenv("BASE_URL") != "" { + baseUrl = os.Getenv("BASE_URL") + } + + startUrl := fmt.Sprintf("%s/api/v1/apps/agent_starter/run", baseUrl) + req, err := http.NewRequest("POST", startUrl, bytes.NewBuffer(requestBodyBytes)) + if err != nil { + log.Printf("[ERROR] Failed to create start request: %s", err) + return false, fmt.Sprintf("Failed to create start request: %s", err) + } + + // Copy headers from original request + for key, values := range request.Header { + for _, value := range values { + req.Header.Add(key, value) + } + } + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + log.Printf("[ERROR] Failed to start agent: %s", err) + return false, fmt.Sprintf("Failed to start agent: %s", err) + } + defer resp.Body.Close() + + startBody, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Printf("[ERROR] Failed to read start response: %s", err) + return false, fmt.Sprintf("Failed to read start response: %s", err) + } + + var startResponse struct { + Success bool `json:"success"` + ExecutionId string `json:"execution_id"` + Authorization string `json:"authorization"` + } + err = json.Unmarshal(startBody, &startResponse) + if err != nil { + log.Printf("[ERROR] Failed to parse start response: %s", err) + return false, fmt.Sprintf("Failed to parse start response: %s", err) + } + + if !startResponse.Success { + log.Printf("[ERROR] Failed to start agent: %s", string(startBody)) + return false, fmt.Sprintf("Failed to start agent: %s", string(startBody)) + } + + log.Printf("[INFO] ✅ Started agent (execution_id: %s)", startResponse.ExecutionId) + + // Poll for results + maxRetries := 20 + retryDelay := 5 * time.Second + + var finalResult map[string]interface{} + var lastPolledResult map[string]interface{} // Keep track of last result even if not finished + agentFinished := false + + for i := 0; i < maxRetries; i++ { + time.Sleep(retryDelay) + + // Call streams/results endpoint + resultsUrl := fmt.Sprintf("%s/api/v1/streams/results", baseUrl) + resultsBody := map[string]string{ + "execution_id": startResponse.ExecutionId, + "authorization": startResponse.Authorization, + } + resultsBodyBytes, _ := json.Marshal(resultsBody) + + resultsReq, err := http.NewRequest("POST", resultsUrl, bytes.NewBuffer(resultsBodyBytes)) + if err != nil { + continue + } + + // Copy headers + for key, values := range request.Header { + for _, value := range values { + resultsReq.Header.Add(key, value) + } + } + resultsReq.Header.Set("Content-Type", "application/json") 
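// For orientation, the payload shape this polling loop expects back (inferred
// from the parsing below, not a documented API contract) is roughly:
//
//	{
//	  "result": "{\"status\": \"FINISHED\", \"decisions\": [ ... ]}"
//	}
//
// i.e. "result" is itself a JSON-encoded string whose "status" and "decisions"
// fields drive the checks that follow.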
+ + resultsResp, err := client.Do(resultsReq) + if err != nil { + continue + } + + resultsRespBody, err := ioutil.ReadAll(resultsResp.Body) + resultsResp.Body.Close() + if err != nil { + continue + } + + var resultsData map[string]interface{} + err = json.Unmarshal(resultsRespBody, &resultsData) + if err != nil { + continue + } + + // Extract result field (which is a JSON string) + resultStr, ok := resultsData["result"].(string) + if !ok { + continue + } + + // Parse the result string as JSON + var parsedResult map[string]interface{} + err = json.Unmarshal([]byte(resultStr), &parsedResult) + if err != nil { + continue + } + + // Store this as last polled result (even if not finished) + lastPolledResult = parsedResult + + // Check if finished + status, ok := parsedResult["status"].(string) + if ok && status == "FINISHED" { + finalResult = parsedResult + agentFinished = true + break + } + } + + if !agentFinished { + log.Printf("[ERROR] Agent did not finish within timeout (%d retries)", maxRetries) + + // Get ALL decisions from last polled result (not just finished ones!) + var allDecisions []interface{} + if lastPolledResult != nil { + if decisions, ok := lastPolledResult["decisions"].([]interface{}); ok { + allDecisions = decisions + } + } + + // Use LLM to analyze timeout failure with COMPLETE picture + llmReason := analyzeTestFailureWithLLM(allDecisions, testCase.ExpectedDecisions, true) + return false, fmt.Sprintf("Timeout after %d retries. %s", maxRetries, llmReason) + } + + log.Printf("[INFO] ✅ Agent finished") + + // Compare decisions + actualDecisions, ok := finalResult["decisions"].([]interface{}) + if !ok { + log.Printf("[ERROR] No decisions found in result") + return false, "No decisions found in result" + } + + // Get expected decisions from test case + expectedDecisionsRaw, ok := testCase.ToolCalls[0].Response["expected_decisions"] + if !ok { + // Try to get from parsed test case + type TestCaseWithDecisions struct { + ExpectedDecisions []map[string]interface{} `json:"expected_decisions"` + } + var tempData TestCaseWithDecisions + testBytes, _ := json.Marshal(testCase) + json.Unmarshal(testBytes, &tempData) + + if len(tempData.ExpectedDecisions) == 0 { + log.Printf("[ERROR] No expected_decisions found in test case") + return false, "No expected_decisions found in test case" + } + + // Compare decisions + passed, errMsg := compareDecisions(actualDecisions, tempData.ExpectedDecisions) + if !passed { + // Use LLM to provide detailed analysis + llmReason := analyzeTestFailureWithLLM(actualDecisions, tempData.ExpectedDecisions, false) + return false, llmReason + } + return passed, errMsg + } + + expectedDecisions, ok := expectedDecisionsRaw.([]interface{}) + if !ok { + log.Printf("[ERROR] expected_decisions is not an array") + return false, "expected_decisions is not an array" + } + + // Convert to comparable format + expectedMaps := make([]map[string]interface{}, len(expectedDecisions)) + for i, ed := range expectedDecisions { + if edMap, ok := ed.(map[string]interface{}); ok { + expectedMaps[i] = edMap + } + } + + return compareDecisions(actualDecisions, expectedMaps) +} + +// compareDecisions compares actual vs expected decisions +func compareDecisions(actual []interface{}, expected []map[string]interface{}) (bool, string) { + if len(actual) != len(expected) { + errMsg := fmt.Sprintf("Decision count mismatch: expected %d, got %d", len(expected), len(actual)) + log.Printf("[ERROR] %s", errMsg) + return false, errMsg + } + + for i := 0; i < len(actual); i++ { + actualDecision, ok := 
actual[i].(map[string]interface{}) + if !ok { + errMsg := fmt.Sprintf("Decision %d: invalid format", i) + log.Printf("[ERROR] %s", errMsg) + return false, errMsg + } + + expectedDecision := expected[i] + + // Compare action + actualAction, _ := actualDecision["action"].(string) + expectedAction, _ := expectedDecision["action"].(string) + + // Skip comparison for "answer" actions - they're just progress updates + if actualAction == "answer" || expectedAction == "answer" { + log.Printf("[DEBUG] Decision %d: Skipping comparison for 'answer' action (progress update)", i) + continue + } + + if actualAction != expectedAction { + errMsg := fmt.Sprintf("Decision %d: action mismatch (expected: %s, got: %s)", i, expectedAction, actualAction) + log.Printf("[ERROR] %s", errMsg) + return false, errMsg + } + + // Compare tool + actualTool, _ := actualDecision["tool"].(string) + expectedTool, _ := expectedDecision["tool"].(string) + if actualTool != expectedTool { + errMsg := fmt.Sprintf("Decision %d: tool mismatch (expected: %s, got: %s)", i, expectedTool, actualTool) + log.Printf("[ERROR] %s", errMsg) + return false, errMsg + } + + // Compare fields + actualFields, _ := actualDecision["fields"].([]interface{}) + expectedFields, _ := expectedDecision["fields"].([]interface{}) + + if !compareFields(actualFields, expectedFields, i) { + errMsg := fmt.Sprintf("Decision %d: field mismatch", i) + return false, errMsg + } + + log.Printf("[INFO] ✅ Decision %d: action=%s, tool=%s, fields match", i, actualAction, actualTool) + } + + return true, "" +} + +// compareFields compares field arrays +func compareFields(actual []interface{}, expected []interface{}, decisionIndex int) bool { + // Get the action type from the parent decision context + // We need to know if this is "answer" or "finish" to skip field comparison + + // Convert to maps for easier comparison + actualMap := make(map[string]string) + for _, f := range actual { + if fieldMap, ok := f.(map[string]interface{}); ok { + key, _ := fieldMap["key"].(string) + value, _ := fieldMap["value"].(string) + actualMap[key] = value + } + } + + expectedMap := make(map[string]string) + for _, f := range expected { + if fieldMap, ok := f.(map[string]interface{}); ok { + key, _ := fieldMap["key"].(string) + value, _ := fieldMap["value"].(string) + expectedMap[key] = value + } + } + + // Compare all expected fields + for key, expectedValue := range expectedMap { + actualValue, exists := actualMap[key] + if !exists { + log.Printf("[ERROR] Decision %d: missing field '%s'", decisionIndex, key) + return false + } + + // Normalize whitespace for comparison + expectedValue = strings.TrimSpace(expectedValue) + actualValue = strings.TrimSpace(actualValue) + + // Empty values are OK + if expectedValue == "" && actualValue == "" { + continue + } + + // Skip comparison for LLM-generated content fields + if key == "output" || key == "body" { + // These contain LLM responses which will vary + // Just check they're not empty + if actualValue != "" { + log.Printf("[DEBUG] Decision %d: Skipping exact comparison for field '%s' (LLM-generated content)", decisionIndex, key) + continue + } + } + + // For URL fields, use fuzzy matching (same as mock) + if key == "url" { + expectedURL, err1 := url.Parse(expectedValue) + actualURL, err2 := url.Parse(actualValue) + + if err1 == nil && err2 == nil { + // Use the same fuzzy matching logic as the mock + score := calculateURLSimilarity(actualURL, expectedURL) + if score >= 0.80 { + log.Printf("[DEBUG] Decision %d: URL fuzzy match (%.1f%% 
similarity)", decisionIndex, score*100) + continue + } else { + log.Printf("[ERROR] Decision %d: URL mismatch (%.1f%% similarity). Expected: %s, Got: %s", decisionIndex, score*100, expectedValue, actualValue) + return false + } + } + } + + // Exact match for other fields + if expectedValue != actualValue { + log.Printf("[ERROR] Decision %d: field '%s' mismatch (expected: %s, got: %s)", decisionIndex, key, expectedValue, actualValue) + return false + } + } + + return true +} diff --git a/cloudSync.go b/cloudSync.go index 23e07bd6..40807e0b 100755 --- a/cloudSync.go +++ b/cloudSync.go @@ -2109,6 +2109,32 @@ func RunAgentDecisionSingulActionHandler(execution WorkflowExecution, decision A debugUrl := "" log.Printf("[INFO][%s] Running agent decision action '%s' with app '%s'. This is ran with Singul.", execution.ExecutionId, decision.Action, decision.Tool) + // Check if running in test mode + if os.Getenv("AGENT_TEST_MODE") == "true" { + log.Printf("[DEBUG][%s] AGENT_TEST_MODE enabled - using mock tool execution", execution.ExecutionId) + + // Call mock function instead of real Singul + // Mock function signature: + // func RunAgentDecisionMockHandler(execution WorkflowExecution, decision AgentDecision) ([]byte, string, string, error) + // + // Inputs needed: + // - execution: Full execution context (ExecutionId, Authorization, Workflow, etc) + // - decision: The decision to execute (Tool, Action, Fields, etc) + // + // Returns: (rawResponse []byte, debugUrl string, appname string, error) + // - rawResponse: The mock tool result (what Singul would return) + // - debugUrl: Debug URL (can be empty in tests) + // - appname: The app name (decision.Tool) + // - error: Any error that occurred + // + // The mock function should: + // 1. Load stored result based on decision.Tool + decision.Action + // 2. Return it in the same format as real Singul + // 3. 
The caller (RunAgentDecisionAction) will handle posting to /streams + + return RunAgentDecisionMockHandler(execution, decision) + } + baseUrl := "https://shuffler.io" if os.Getenv("BASE_URL") != "" { baseUrl = os.Getenv("BASE_URL") @@ -2148,8 +2174,9 @@ func RunAgentDecisionSingulActionHandler(execution WorkflowExecution, decision A } parsedAction := CategoryAction{ - AppName: decision.Tool, - Label: decision.Action, + AppName: decision.Tool, + Label: decision.Action, + Query: decision.Reason, // Add the reason field for LLM context Fields: oldFields, diff --git a/shared.go b/shared.go index 28845bf5..1dfa0fcd 100755 --- a/shared.go +++ b/shared.go @@ -16278,8 +16278,28 @@ func handleAgentDecisionStreamResult(workflowExecution WorkflowExecution, action } if foundActionResultIndex < 0 { - log.Printf("[ERROR][%s] Action '%s' was NOT found with any result in the execution (yet)", workflowExecution.ExecutionId, actionResult.Action.ID) - return &workflowExecution, false, errors.New(fmt.Sprintf("ActionResultIndex: Agent node ID for decision ID %s not found", decisionId)) + // In test mode, Singul doesn't create sub-executions, so we need to handle this gracefully + if os.Getenv("AGENT_TEST_MODE") == "true" { + log.Printf("[DEBUG][%s] AGENT_TEST_MODE: Action '%s' not found in results, creating placeholder", workflowExecution.ExecutionId, actionResult.Action.ID) + + // Create a placeholder result for the agent action + placeholderResult := ActionResult{ + Action: actionResult.Action, + ExecutionId: workflowExecution.ExecutionId, + Result: `{"status":"RUNNING","decisions":[]}`, + StartedAt: time.Now().Unix(), + CompletedAt: 0, + Status: "EXECUTING", + } + + workflowExecution.Results = append(workflowExecution.Results, placeholderResult) + foundActionResultIndex = len(workflowExecution.Results) - 1 + + log.Printf("[DEBUG][%s] Created placeholder result at index %d", workflowExecution.ExecutionId, foundActionResultIndex) + } else { + log.Printf("[ERROR][%s] Action '%s' was NOT found with any result in the execution (yet)", workflowExecution.ExecutionId, actionResult.Action.ID) + return &workflowExecution, false, errors.New(fmt.Sprintf("ActionResultIndex: Agent node ID for decision ID %s not found", decisionId)) + } } mappedResult := AgentOutput{} @@ -16291,6 +16311,28 @@ func handleAgentDecisionStreamResult(workflowExecution WorkflowExecution, action return &workflowExecution, false, err } + // In test mode, if the placeholder has no decisions, we need to add the incoming decision + if os.Getenv("AGENT_TEST_MODE") == "true" && len(mappedResult.Decisions) == 0 { + log.Printf("[DEBUG][%s] AGENT_TEST_MODE: Placeholder has no decisions, parsing incoming decision", workflowExecution.ExecutionId) + + // Parse the incoming decision from actionResult + incomingDecision := AgentDecision{} + err = json.Unmarshal([]byte(actionResult.Result), &incomingDecision) + if err != nil { + log.Printf("[ERROR][%s] Failed unmarshalling incoming decision: %s", workflowExecution.ExecutionId, err) + } else { + // Add the decision to the mapped result + mappedResult.Decisions = append(mappedResult.Decisions, incomingDecision) + mappedResult.Status = "RUNNING" + + // Update the workflow execution result with the new decision + updatedResult, _ := json.Marshal(mappedResult) + workflowExecution.Results[foundActionResultIndex].Result = string(updatedResult) + + log.Printf("[DEBUG][%s] Added decision %s to placeholder (total decisions: %d)", workflowExecution.ExecutionId, incomingDecision.RunDetails.Id, len(mappedResult.Decisions)) 
+ } + } + // FIXME: Need to check the current value from the workflowexecution here, instead of using the currently sent in decision // 1. Get the current result for the action
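// End-to-end, a minimal sketch of driving the new test runner in-process. The
// HTTP route and auth wiring are not part of this diff, so both are assumed
// here; RunAgentTests itself toggles AGENT_TEST_MODE and AGENT_TEST_USE_CASE:
//
//	os.Setenv("AGENT_TEST_DATA_PATH", "./agent_test_data") // directory holding get_weather_kakinada.json
//	rec := httptest.NewRecorder()
//	req := httptest.NewRequest("POST", "/api/v1/agent_tests/run", nil) // hypothetical route
//	RunAgentTests(rec, req)
//	log.Printf("agent tests: status=%d body=%s", rec.Code, rec.Body.String())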