diff --git a/api/pyproject.toml b/api/pyproject.toml index 83f2016dcb..74b74fb5d7 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "api" -version = "0.76.0" +version = "0.77.0" description = "Agenta API" authors = [ { name = "Mahmoud Mabrouk", email = "mahmoud@agenta.ai" }, diff --git a/docs/blog/entries/json-multi-field-match.mdx b/docs/blog/entries/json-multi-field-match.mdx new file mode 100644 index 0000000000..2c532de763 --- /dev/null +++ b/docs/blog/entries/json-multi-field-match.mdx @@ -0,0 +1,124 @@ +--- +title: "JSON Multi-Field Match Evaluator" +slug: json-multi-field-match +date: 2025-12-31 +tags: [v0.73.0] +description: "Compare multiple fields between JSON objects with the new JSON Multi-Field Match evaluator. Ideal for entity extraction validation with per-field scoring and support for nested paths." +--- + +```mdx-code-block +import Image from "@theme/IdealImage"; +``` + +The JSON Multi-Field Match evaluator lets you validate multiple fields in JSON outputs simultaneously. This makes it ideal for entity extraction tasks where you need to check if your model correctly extracted name, email, address, and other structured fields. + +## What is JSON Multi-Field Match? + +This evaluator compares specific fields between your model's JSON output and the expected JSON values from your test set. Unlike the old JSON Field Match evaluator (which only checked one field), this evaluator handles any number of fields at once. + +For each field you configure, the evaluator produces a separate score (either 1 for a match or 0 for no match). It also calculates an aggregate score showing the percentage of fields that matched correctly. + +## Key Features + +### Multiple Field Comparison + +Configure as many fields as you need to validate. The evaluator checks each field independently and reports results for all of them. + +If you're extracting user information, you might configure fields like `name`, `email`, `phone`, and `address.city`. Each field gets its own score, so you can see exactly which extractions succeeded and which failed. + +### Three Path Format Options + +The evaluator supports three different ways to specify field paths: + +**Dot notation** (recommended for most cases): +- Simple fields: `name`, `email` +- Nested fields: `user.address.city` +- Array indices: `items.0.name` + +**JSON Path** (standard JSON Path syntax): +- Simple fields: `$.name`, `$.email` +- Nested fields: `$.user.address.city` +- Array indices: `$.items[0].name` + +**JSON Pointer** (RFC 6901): +- Simple fields: `/name`, `/email` +- Nested fields: `/user/address/city` +- Array indices: `/items/0/name` + +All three formats work the same way. Use whichever matches your existing tooling or personal preference. + +### Nested Field and Array Support + +Access deeply nested fields and array elements without restrictions. The evaluator handles any level of nesting. + +### Per-Field Scoring + +See individual scores for each configured field in the evaluation results. This granular view helps you identify which specific extractions are working well and which need improvement. + +### Aggregate Score + +The aggregate score shows the percentage of matching fields. If you configure five fields and three match, the aggregate score is 0.6 (or 60%). + +## Example + +Suppose you're building an entity extraction model that pulls contact information from text. Your ground truth looks like this: + +```json +{ + "name": "John Doe", + "email": "john@example.com", + "phone": "555-1234", + "address": { + "city": "New York", + "zip": "10001" + } +} +``` + +Your model produces this output: + +```json +{ + "name": "John Doe", + "email": "jane@example.com", + "phone": "555-1234", + "address": { + "city": "New York", + "zip": "10002" + } +} +``` + +You configure these fields: `["name", "email", "phone", "address.city", "address.zip"]` + +The evaluator returns: + +| Field | Score | +| ----- | ----- | +| `name` | 1.0 | +| `email` | 0.0 | +| `phone` | 1.0 | +| `address.city` | 1.0 | +| `address.zip` | 0.0 | +| `aggregate_score` | 0.6 | + +You can see immediately that the model got the email and zip code wrong but correctly extracted the name, phone, and city. + +## Auto-Detection in the UI + +When you configure the evaluator in the web interface, Agenta automatically detects available fields from your test set data. Click to add or remove fields using a tag-based interface. This makes setup fast and reduces configuration errors. + +## Migration from JSON Field Match + +The old JSON Field Match evaluator only supported checking a single field. If you're using it, consider migrating to JSON Multi-Field Match to gain: + +- Support for multiple fields in one evaluator +- Per-field scoring for detailed analysis +- Aggregate scoring for overall performance tracking +- Nested field and array support + +Existing JSON Field Match configurations continue to work. We recommend migrating to JSON Multi-Field Match for new evaluations. + +## Next Steps + +Learn more about configuring and using the JSON Multi-Field Match evaluator in the [Classification and Entity Extraction Evaluators](/evaluation/configure-evaluators/classification-entity-extraction#json-multi-field-match) documentation. diff --git a/docs/blog/main.mdx b/docs/blog/main.mdx index 59e48437e9..e3d60f566e 100644 --- a/docs/blog/main.mdx +++ b/docs/blog/main.mdx @@ -11,6 +11,7 @@ import Image from "@theme/IdealImage";
+ ### [Chat Sessions in Observability](/changelog/chat-sessions-observability) _9 January 2026_ @@ -28,6 +29,16 @@ The new session browser shows key metrics like total cost, latency, and token us --- +### [JSON Multi-Field Match Evaluator](/changelog/json-multi-field-match) + +_31 December 2025_ + +**v0.73.0** + +The new JSON Multi-Field Match evaluator validates multiple fields between JSON objects. Configure any number of field paths using dot notation, JSON Path, or JSON Pointer formats. Each field gets its own score (0 or 1), and an aggregate score shows the percentage of matching fields. This evaluator is ideal for entity extraction tasks like validating extracted names, emails, and addresses. The UI automatically detects fields from your test data for quick setup. This replaces the old JSON Field Match evaluator, which only supported single fields. + +--- + ### [PDF Support in the Playground](/changelog/pdf-support-in-playground) _17 December 2025_ diff --git a/docs/docs/evaluation/configure-evaluators/01-overview.mdx b/docs/docs/evaluation/configure-evaluators/01-overview.mdx index 7cb1292849..8c5bae0d49 100644 --- a/docs/docs/evaluation/configure-evaluators/01-overview.mdx +++ b/docs/docs/evaluation/configure-evaluators/01-overview.mdx @@ -25,7 +25,7 @@ Agenta offers a growing list of pre-built evaluators suitable for most use cases | [Exact Match](/evaluation/configure-evaluators/classification-entity-extraction#exact-match) | Classification/Entity Extraction | Pattern Matching | Checks if the output exactly matches the expected result. | | [Contains JSON](/evaluation/configure-evaluators/classification-entity-extraction#contains-json) | Classification/Entity Extraction | Pattern Matching | Ensures the output contains valid JSON. | | [Regex Test](/evaluation/configure-evaluators/regex-evaluator) | Classification/Entity Extraction | Pattern Matching | Checks if the output matches a given regex pattern. | -| [JSON Field Match](/evaluation/configure-evaluators/classification-entity-extraction#json-field-match) | Classification/Entity Extraction | Pattern Matching | Compares specific fields within JSON data. | +| [JSON Multi-Field Match](/evaluation/configure-evaluators/classification-entity-extraction#json-multi-field-match) | Classification/Entity Extraction | Pattern Matching | Compares multiple fields within JSON objects and reports per-field scores. | | [JSON Diff Match](/evaluation/configure-evaluators/classification-entity-extraction#json-diff-match) | Classification/Entity Extraction | Similarity Metrics | Compares generated JSON with a ground truth JSON based on schema or values. | | [Similarity Match](/evaluation/configure-evaluators/semantic-similarity#similarity-match) | Text Generation / Chatbot | Similarity Metrics | Compares generated output with expected using Jaccard similarity. | | [Semantic Similarity Match](/evaluation/configure-evaluators/semantic-similarity#semantic-similarity-match) | Text Generation / Chatbot | Semantic Analysis | Compares the meaning of the generated output with the expected result. | diff --git a/docs/docs/evaluation/configure-evaluators/02-classification-entity-extraction.mdx b/docs/docs/evaluation/configure-evaluators/02-classification-entity-extraction.mdx index 5c222dd90f..029ce6e9ef 100644 --- a/docs/docs/evaluation/configure-evaluators/02-classification-entity-extraction.mdx +++ b/docs/docs/evaluation/configure-evaluators/02-classification-entity-extraction.mdx @@ -26,20 +26,84 @@ The Contains JSON evaluator checks if the model's output contains a valid JSON s This evaluator attempts to parse the output as JSON. It returns `true` if a valid JSON structure is found within the output, and `false` otherwise. -## JSON Field Match +## JSON Field Match (Deprecated) {#json-field-match} -The JSON Field Match evaluator compares specific fields within JSON data. +:::warning Deprecated +The JSON Field Match evaluator has been replaced by [JSON Multi-Field Match](#json-multi-field-match). The new evaluator supports multiple fields, nested paths, and provides per-field scoring. Existing configurations will continue to work, but we recommend migrating to the new evaluator. +::: + +## JSON Multi-Field Match + +The JSON Multi-Field Match evaluator compares multiple fields between two JSON objects and reports a score for each field. This evaluator is ideal for entity extraction tasks where you need to validate that specific fields (like name, email, or address) match the expected values. ### How It Works -This evaluator attempts to parse the output as JSON and extract a specified field. It then compares this field value to the correct answer. The evaluator returns `true` if the field value matches the correct answer, and `false` otherwise. Note that the value in the `correct_answer` column should be a string, not JSON. +The evaluator parses both the model output and the ground truth as JSON. It then compares each configured field path and produces: + +1. A score for each field (1 if matched, 0 if not matched) +2. An aggregate score showing the percentage of fields that matched + +For example, if you configure fields `["name", "email", "phone"]` and the model gets name and email correct but phone wrong, you will see: +- `name`: 1.0 +- `email`: 1.0 +- `phone`: 0.0 +- `aggregate_score`: 0.67 + +### Path Formats + +You can specify field paths in three formats: + +| Format | Example | Description | +| ------ | ------- | ----------- | +| Dot notation | `user.address.city` | Simple nested access. Use numeric indices for arrays: `items.0.name` | +| JSON Path | `$.user.address.city` | Standard JSON Path syntax. Supports array indexing: `$.items[0].name` | +| JSON Pointer | `/user/address/city` | RFC 6901 standard. Use numeric segments for arrays: `/items/0/name` | + +Dot notation is recommended for most cases. JSON Path and JSON Pointer are useful when you need compatibility with other tools. ### Configuration -| Parameter | Type | Description | -| -------------------- | ------ | ------------------------------------------------------------- | -| `json_field` | String | The name of the field in the JSON output to evaluate | -| `correct_answer_key` | String | The column name in the test set containing the correct answer | +| Parameter | Type | Description | +| -------------------- | -------- | ------------------------------------------------------------------ | +| `fields` | String[] | List of field paths to compare (e.g., `["name", "user.email"]`) | +| `correct_answer_key` | String | The column name in the test set containing the expected JSON | + +### Example + +**Ground truth** (in the `correct_answer` column): +```json +{ + "name": "John Doe", + "email": "john@example.com", + "address": { + "city": "New York", + "zip": "10001" + } +} +``` + +**Model output**: +```json +{ + "name": "John Doe", + "email": "jane@example.com", + "address": { + "city": "New York", + "zip": "10002" + } +} +``` + +**Configured fields**: `["name", "email", "address.city", "address.zip"]` + +**Results**: +| Field | Score | +| ----- | ----- | +| `name` | 1.0 | +| `email` | 0.0 | +| `address.city` | 1.0 | +| `address.zip` | 0.0 | +| `aggregate_score` | 0.5 | ## JSON Diff Match diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index d6e1c588e3..4b447f402b 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "agenta" -version = "0.76.0" +version = "0.77.0" description = "The SDK for agenta is an open-source LLMOps platform." readme = "README.md" authors = [ diff --git a/web/ee/package.json b/web/ee/package.json index d5b52fc6c6..1f3c1b000e 100644 --- a/web/ee/package.json +++ b/web/ee/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/ee", - "version": "0.76.0", + "version": "0.77.0", "private": true, "engines": { "node": ">=18" diff --git a/web/oss/package.json b/web/oss/package.json index 17de9745cf..b675f4536a 100644 --- a/web/oss/package.json +++ b/web/oss/package.json @@ -1,6 +1,6 @@ { "name": "@agenta/oss", - "version": "0.76.0", + "version": "0.77.0", "private": true, "engines": { "node": ">=18" diff --git a/web/oss/src/components/DrillInView/DrillInBreadcrumb.tsx b/web/oss/src/components/DrillInView/DrillInBreadcrumb.tsx index 3daf9a427b..2c288876d7 100644 --- a/web/oss/src/components/DrillInView/DrillInBreadcrumb.tsx +++ b/web/oss/src/components/DrillInView/DrillInBreadcrumb.tsx @@ -104,7 +104,7 @@ const DrillInBreadcrumb = memo( ) return ( -
+
{/* Fixed prefix (span navigation) - doesn't scroll */} {prefix &&
{prefix}
} diff --git a/web/oss/src/components/DrillInView/DrillInContent.tsx b/web/oss/src/components/DrillInView/DrillInContent.tsx index 3fb57b5cd0..4df7c8e47b 100644 --- a/web/oss/src/components/DrillInView/DrillInContent.tsx +++ b/web/oss/src/components/DrillInView/DrillInContent.tsx @@ -704,7 +704,7 @@ export function DrillInContent({ {/* Field content - collapsible */} {!isCollapsed && ( -
+
{renderFieldContent({ item, stringValue, diff --git a/web/oss/src/components/Editor/DiffView.tsx b/web/oss/src/components/Editor/DiffView.tsx index 60bd32c50d..e039612815 100644 --- a/web/oss/src/components/Editor/DiffView.tsx +++ b/web/oss/src/components/Editor/DiffView.tsx @@ -322,7 +322,6 @@ const DiffView: React.FC = ({ key={diffKey} initialValue="" language={processedContent.language} - validationSchema={{}} additionalCodePlugins={[ { +export const tryParseJson = (value: unknown): unknown => { if (typeof value !== "string") return value try { return JSON.parse(value) @@ -42,7 +42,7 @@ const isChatEntry = (entry: any): boolean => { return false } -const extractMessageArray = (value: any): any[] | null => { +export const extractMessageArray = (value: any): any[] | null => { if (!value) return null if (Array.isArray(value)) return value if (typeof value !== "object") return null @@ -67,7 +67,9 @@ const extractMessageArray = (value: any): any[] | null => { return null } -const normalizeMessages = (messages: any[]): {role: string; content: any; tool_calls?: any[]}[] => { +export const normalizeMessages = ( + messages: any[], +): {role: string; content: any; tool_calls?: any[]}[] => { return messages .map((entry) => { if (!entry) return null diff --git a/web/oss/src/components/GenericDrawer/index.tsx b/web/oss/src/components/GenericDrawer/index.tsx index 7c9e206ca0..f024228f82 100644 --- a/web/oss/src/components/GenericDrawer/index.tsx +++ b/web/oss/src/components/GenericDrawer/index.tsx @@ -27,6 +27,7 @@ const GenericDrawer = ({ onClick={() => props.onClose?.({} as any)} type="text" icon={} + {...props.closeButtonProps} /> {props.expandable && ( @@ -46,6 +47,7 @@ const GenericDrawer = ({ ) } + {...props.expandButtonProps} /> )} diff --git a/web/oss/src/components/GenericDrawer/types.d.ts b/web/oss/src/components/GenericDrawer/types.d.ts index a2e0fe6add..a0b1d9843e 100644 --- a/web/oss/src/components/GenericDrawer/types.d.ts +++ b/web/oss/src/components/GenericDrawer/types.d.ts @@ -1,9 +1,10 @@ import {ReactNode} from "react" -import {DrawerProps} from "antd" +import {ButtonProps, DrawerProps} from "antd" export interface GenericDrawerProps extends DrawerProps { expandable?: boolean + expandButtonProps?: ButtonProps headerExtra?: ReactNode mainContent: ReactNode extraContent?: ReactNode @@ -14,4 +15,5 @@ export interface GenericDrawerProps extends DrawerProps { mainContentDefaultSize?: number extraContentDefaultSize?: number closeOnLayoutClick?: boolean + closeButtonProps?: ButtonProps } diff --git a/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/components/FocusDrawerContent.tsx b/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/components/FocusDrawerContent.tsx new file mode 100644 index 0000000000..1e477cc735 --- /dev/null +++ b/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/components/FocusDrawerContent.tsx @@ -0,0 +1,352 @@ +import {useMemo} from "react" + +import {Collapse} from "antd" +import clsx from "clsx" +import {useAtomValue} from "jotai" + +import SimpleSharedEditor from "@/oss/components/EditorViews/SimpleSharedEditor" +import { + extractMessageArray, + normalizeMessages, + tryParseJson, +} from "@/oss/components/EvalRunDetails/utils/chatMessages" +import {currentAppContextAtom} from "@/oss/state/app/selectors/app" +import {inputRowsByIdFamilyAtom} from "@/oss/state/generation/entities" +import { + PlaygroundTestResult, + responseByRowRevisionAtomFamily, +} from "@/oss/state/newPlayground/generation/runtime" +import {playgroundFocusDrawerAtom} from "@/oss/state/playgroundFocusDrawerAtom" + +import GenerationResultUtils from "../../../PlaygroundGenerations/assets/GenerationResultUtils" + +const getOutputContent = ( + rep: PlaygroundTestResult, + index: number, +): { + type: "error" | "chat" | "text" + content: string | React.ReactNode | {role: string; content: any; tool_calls?: any[]}[] +} => { + if (!rep) return {type: "text", content: ""} + const error = rep.error || rep.response?.error + + if (error) { + const errorContent = typeof error === "string" ? error : JSON.stringify(error) + return {type: "error", content: errorContent} + } + + try { + const potentialChatValue = + rep.response?.choices || rep.response?.output || rep.response?.data || rep.response + + const chatValueString = + typeof potentialChatValue === "string" + ? potentialChatValue + : JSON.stringify(potentialChatValue) + + const parsed = tryParseJson(chatValueString) + const messageArray = extractMessageArray(parsed) + + if (messageArray) { + const normalized = normalizeMessages(messageArray) + if (normalized.length > 0) { + return {type: "chat", content: normalized} + } + } + } catch (e) { + console.error("Error rendering output:", e) + return {type: "error", content: "Error rendering output"} + } + + const simpleContent = + rep.response?.choices?.[0]?.message?.content || + rep.response?.output || + rep.response?.data || + (typeof rep.response === "string" ? rep.response : "") || + "" + + return {type: "text", content: String(simpleContent)} +} + +const getLastUserMessage = (repetitions: PlaygroundTestResult[]) => { + if (!repetitions?.length) return null + + try { + const firstRep = repetitions[0] + const nodes = firstRep.response?.tree?.nodes + const node = Array.isArray(nodes) ? nodes[0] : nodes ? Object.values(nodes)[0] : null + + if (!node) return null + + // Support both direct data access and attributes.ag.data structure + const data = node.data || node.attributes?.ag?.data + const messages = data?.inputs?.messages + + if (Array.isArray(messages)) { + return ( + messages + .slice() + .reverse() + .find((m: any) => m.role === "user") || null + ) + } + } catch (error) { + console.error("Error getting user message", error) + } + return null +} + +const getChatInputs = (repetitions: PlaygroundTestResult[]) => { + if (!repetitions?.length) return [] + try { + const firstRep = repetitions[0] + const nodes = firstRep.response?.tree?.nodes + const node = Array.isArray(nodes) ? nodes[0] : nodes ? Object.values(nodes)[0] : null + + if (!node) return [] + + const data = node.data || node.attributes?.ag?.data + let inputs = data?.inputs || {} + if (typeof inputs !== "object" || inputs === null) return [] + + if (inputs.inputs) { + inputs = inputs.inputs + } + + return Object.entries(inputs) + .filter(([key]) => key !== "messages") + .map(([key, value]) => ({key, value})) + } catch (error) { + console.error("Error getting chat inputs", error) + return [] + } +} + +const FocusDrawerContent = () => { + const {appType} = useAtomValue(currentAppContextAtom) + const {rowId, variantId} = useAtomValue(playgroundFocusDrawerAtom) + + const rowData = useAtomValue(inputRowsByIdFamilyAtom(rowId || "")) + const responseData = useAtomValue( + responseByRowRevisionAtomFamily({ + rowId: rowId || "", + revisionId: variantId || "", + }), + ) + + const repetitions = useMemo(() => { + if (!responseData) return [] + if (Array.isArray(responseData)) return responseData + return [responseData] + }, [responseData]) + + const lastUserMessage = useMemo(() => { + return getLastUserMessage(repetitions) + }, [repetitions]) + + const inputsToRender = useMemo(() => { + if (appType === "chat") { + return getChatInputs(repetitions) + } + return rowData?.variables || [] + }, [appType, repetitions, rowData?.variables]) + + return ( +
+ {/* Header Actions - Custom rendered inside Drawer title or here if we prefer custom header. + EnhancedDrawer uses Antd Drawer, so we can pass 'extra' prop for buttons. + But let's assume we render content here. + */} + + + {inputsToRender?.map((v: any, index: number) => { + if (!v) return null + const key = v.key || `Repeats ${index + 1}` + const value = v.value ?? v.content?.value ?? "" + + return ( + + ) + })} + {(!inputsToRender || inputsToRender.length === 0) && ( +
No inputs available
+ )} +
+ ), + }, + ]} + /> + + {/* Messages Section */} + {lastUserMessage && ( + + +
+ ), + }, + ]} + /> + )} + + {/* Output Section */} + +
+ {repetitions.map((rep: PlaygroundTestResult, index: number) => { + const {type, content} = getOutputContent(rep, index) + let contentToRender: React.ReactNode = null + const isError = type === "error" + const header = ( +
+ + {isError + ? `Repeat ${index + 1} (Error)` + : `Repeat ${index + 1}`} + + +
+ ) + + if (type === "chat" && Array.isArray(content)) { + contentToRender = ( +
+ {content.map((msg: any, msgIndex: number) => { + const role = msg.role + ? msg.role?.charAt(0).toUpperCase() + + msg.role?.slice(1) + : "Unknown" + const value = + typeof msg.content === "string" + ? msg.content + : JSON.stringify( + msg.content, + null, + 2, + ) + return ( + + ) + })} +
+ ) + } else { + contentToRender = ( + + ) + } + + return ( +
+ {header} + {contentToRender} +
+ ) + })} + {repetitions.length === 0 && ( +
+ No outputs available +
+ )} +
+
+ ), + }, + ]} + /> +
+ ) +} + +export default FocusDrawerContent diff --git a/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/index.tsx b/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/index.tsx new file mode 100644 index 0000000000..1aead1e8c0 --- /dev/null +++ b/web/oss/src/components/Playground/Components/Drawers/FocusDrawer/index.tsx @@ -0,0 +1,80 @@ +import {useMemo} from "react" + +import {CaretDoubleRight, CaretDown, CaretUp} from "@phosphor-icons/react" +import {Button, Typography} from "antd" +import {useAtom, useAtomValue, useSetAtom} from "jotai" +import dynamic from "next/dynamic" + +import GenericDrawer from "@/oss/components/GenericDrawer" +import {generationRowIdsAtom} from "@/oss/components/Playground/state/atoms/generationProperties" +import { + closePlaygroundFocusDrawerAtom, + playgroundFocusDrawerAtom, +} from "@/oss/state/playgroundFocusDrawerAtom" + +const FocusDrawerContent = dynamic(() => import("./components/FocusDrawerContent"), {ssr: false}) + +const {Text} = Typography + +const PlaygroundFocusDrawer = () => { + const [{isOpen, rowId}, setDrawerState] = useAtom(playgroundFocusDrawerAtom) + const closeDrawer = useSetAtom(closePlaygroundFocusDrawerAtom) + const rowIds = useAtomValue(generationRowIdsAtom) + + const currentRowIndex = useMemo(() => { + return rowIds.indexOf(rowId || "") + }, [rowIds, rowId]) + + const handleNext = () => { + if (currentRowIndex < rowIds.length - 1) { + setDrawerState((prev) => ({...prev, rowId: rowIds[currentRowIndex + 1]})) + } + } + + const handlePrev = () => { + if (currentRowIndex > 0) { + setDrawerState((prev) => ({...prev, rowId: rowIds[currentRowIndex - 1]})) + } + } + + return ( + , + size: "small", + }} + expandable + expandButtonProps={{ + size: "small", + }} + initialWidth={800} + headerExtra={ +
+
+
+ Test case {currentRowIndex + 1} +
+ } + mainContent={} + className="[&_.ant-drawer-body]:!p-0" + /> + ) +} + +export default PlaygroundFocusDrawer diff --git a/web/oss/src/components/Playground/Components/MainLayout/index.tsx b/web/oss/src/components/Playground/Components/MainLayout/index.tsx index 9538352dc4..3d1cad8baf 100644 --- a/web/oss/src/components/Playground/Components/MainLayout/index.tsx +++ b/web/oss/src/components/Playground/Components/MainLayout/index.tsx @@ -1,9 +1,9 @@ -import React from "react" import {memo, useCallback, useEffect, useRef} from "react" -import {Typography, Button, Splitter} from "antd" +import {Button, Splitter, Typography} from "antd" import clsx from "clsx" import {useAtomValue, useSetAtom} from "jotai" +import dynamic from "next/dynamic" import {generationInputRowIdsAtom} from "@/oss/components/Playground/state/atoms/generationProperties" import {chatTurnIdsAtom} from "@/oss/state/generation/entities" @@ -14,9 +14,9 @@ import {revisionListAtom} from "@/oss/state/variant/selectors/variant" import {usePlaygroundScrollSync} from "../../hooks/usePlaygroundScrollSync" import { + appChatModeAtom, displayedVariantsAtom, isComparisonViewAtom, - appChatModeAtom, selectedVariantsAtom, } from "../../state/atoms" import {GenerationComparisonOutput} from "../PlaygroundGenerationComparisonView" @@ -27,6 +27,9 @@ import PlaygroundGenerations from "../PlaygroundGenerations" import PromptComparisonVariantNavigation from "../PlaygroundPromptComparisonView/PromptComparisonVariantNavigation" import PlaygroundVariantConfig from "../PlaygroundVariantConfig" import type {BaseContainerProps} from "../types" +const PlaygroundFocusDrawer = dynamic(() => import("../Drawers/FocusDrawer"), { + ssr: false, +}) import ComparisonVariantConfigSkeleton from "./assets/ComparisonVariantConfigSkeleton" import ComparisonVariantNavigationSkeleton from "./assets/ComparisonVariantNavigationSkeleton" @@ -275,6 +278,7 @@ const PlaygroundMainView = ({className, isLoading = false, ...divProps}: MainLay
+ ) diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerationComparisonView/GenerationComparisonCompletionOutput/index.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerationComparisonView/GenerationComparisonCompletionOutput/index.tsx index ea08672106..2b958c40f4 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundGenerationComparisonView/GenerationComparisonCompletionOutput/index.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerationComparisonView/GenerationComparisonCompletionOutput/index.tsx @@ -3,6 +3,7 @@ import {useMemo} from "react" import clsx from "clsx" import {useAtomValue} from "jotai" +import {useRepetitionResult} from "@/oss/components/Playground/hooks/useRepetitionResult" import {generationResultAtomFamily} from "@/oss/components/Playground/state/atoms" import {getResponseLazy} from "@/oss/lib/hooks/useStatelessVariants/state" @@ -33,6 +34,11 @@ const GenerationComparisonCompletionOutput = ({ [inlineResult, resultHash], ) + const {currentResult, repetitionProps} = useRepetitionResult({ + rowId, + variantId, + result, + }) return ( <> {variantIndex === 0 ? ( @@ -62,11 +68,16 @@ const GenerationComparisonCompletionOutput = ({
{isRunning ? ( - ) : result ? ( - result.error ? ( - + ) : currentResult ? ( + currentResult.error ? ( + ) : ( - + ) ) : ( diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationChatTurnNormalized/index.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationChatTurnNormalized/index.tsx index 8217746705..68942d6be4 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationChatTurnNormalized/index.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationChatTurnNormalized/index.tsx @@ -1,6 +1,5 @@ import {useCallback, useMemo} from "react" -// antd imports not needed here import clsx from "clsx" import {useAtomValue, useSetAtom} from "jotai" import dynamic from "next/dynamic" @@ -12,20 +11,25 @@ import {ClickRunPlaceholder} from "@/oss/components/Playground/Components/Playgr import {useAssistantDisplayValue} from "@/oss/components/Playground/hooks/chat/useAssistant" import useEffectiveRevisionId from "@/oss/components/Playground/hooks/chat/useEffectiveRevisionId" import useHasAssistantContent from "@/oss/components/Playground/hooks/chat/useHasAssistantContent" +import {useRepetitionResult} from "@/oss/components/Playground/hooks/useRepetitionResult" import {displayedVariantsAtom} from "@/oss/components/Playground/state/atoms" import {resolvedGenerationResultAtomFamily} from "@/oss/components/Playground/state/atoms/generationProperties" +import {messageSchemaMetadataAtom} from "@/oss/state/generation/entities" import {assistantMessageAtomFamily, chatTurnAtomFamily} from "@/oss/state/generation/selectors" import { addChatTurnAtom, - runChatTurnAtom, cancelChatTurnAtom, + runChatTurnAtom, } from "@/oss/state/newPlayground/chat/actions" +import {buildAssistantMessage} from "@/oss/state/newPlayground/helpers/messageFactory" interface Props { turnId: string variantId?: string withControls?: boolean className?: string + hideUserMessage?: boolean + messageProps?: any } const GenerationResultUtils = dynamic(() => import("../GenerationResultUtils"), {ssr: false}) @@ -58,6 +62,21 @@ const GenerationChatTurnNormalized = ({ const {isRunning, result: inlineResult} = useAtomValue(genResultAtom) as any const result = inlineResult + const {currentResult, repetitionIndex, repetitionProps} = useRepetitionResult({ + rowId: resolvedTurnId || turnId, + variantId: variantId as string, + result, + }) + + const messageSchema = useAtomValue(messageSchemaMetadataAtom) + + const messageOverride = useMemo(() => { + if (Array.isArray(result) && result.length > 0) { + return buildAssistantMessage(messageSchema, currentResult) + } + return undefined + }, [result, currentResult, messageSchema]) + const onRun = useCallback(() => { runTurn({turnId, variantId: variantId as string | undefined}) }, [runTurn, turnId, variantId, effectiveRevisionId, resolvedTurnId]) @@ -88,7 +107,10 @@ const GenerationChatTurnNormalized = ({ ), ) as any - const displayAssistantValue = useAssistantDisplayValue(assistantMsg, result) + const displayAssistantValue = useAssistantDisplayValue( + messageOverride || assistantMsg, + currentResult, + ) const turnState = useAtomValue(useMemo(() => chatTurnAtomFamily(sessionRowId), [sessionRowId])) @@ -99,7 +121,7 @@ const GenerationChatTurnNormalized = ({ }, [turnState, variantId]) const hasAssistantContent = useHasAssistantContent( - assistantMsg as any, + (messageOverride || assistantMsg) as any, displayAssistantValue, toolMessages.length > 0, ) @@ -132,13 +154,24 @@ const GenerationChatTurnNormalized = ({ ) : hasAssistantContent ? ( <> : null} + footer={ +
+ {currentResult ? ( + + ) : ( +
+ )} +
+ } messageProps={messageProps} + messageOverride={messageOverride} + repetitionProps={repetitionProps} /> {variantId ? toolMessages.map((_, index) => ( diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/GenerationResponsePanel.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/GenerationResponsePanel.tsx index 3b54e0547f..edfec5fb31 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/GenerationResponsePanel.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/GenerationResponsePanel.tsx @@ -1,46 +1,76 @@ import {useMemo} from "react" +import {Typography} from "antd" +import {useAtomValue} from "jotai" import dynamic from "next/dynamic" import ToolCallView from "@/oss/components/Playground/Components/ToolCallView" +import {isComparisonViewAtom} from "@/oss/components/Playground/state/atoms" import {deriveToolViewModelFromResult} from "@/oss/state/newPlayground/chat/parsers" import SharedEditor from "../../../SharedEditor" +import RepetitionNavigation from "../RepetitionNavigation" const GenerationResultUtils = dynamic(() => import("../GenerationResultUtils"), {ssr: false}) interface Props { result: any + repetitionProps?: { + current: number + total: number + onNext: () => void + onPrev: () => void + } + rowId: string + variantId?: string } -export default function GenerationResponsePanel({result}: Props) { +export default function GenerationResponsePanel({ + result, + repetitionProps, + rowId, + variantId, +}: Props) { const {toolData, isJSON, displayValue} = useMemo( () => deriveToolViewModelFromResult(result), [result], ) + const footerNode = ( +
+ +
+ ) + + const isComparisonView = useAtomValue(isComparisonViewAtom) + if (toolData) { - return ( - } - /> - ) + return } return ( - } - handleChange={() => undefined} - /> +
+ {repetitionProps && !isComparisonView && ( +
+ + Total repeats + + +
+ )} + + undefined} + /> +
) } diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/SingleView.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/SingleView.tsx index fb2662718b..3ae93a2704 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/SingleView.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationCompletionRow/SingleView.tsx @@ -1,6 +1,7 @@ import {useCallback, useEffect, useMemo, useRef, useState} from "react" import { + ArrowsOutLineHorizontal, CaretDownIcon, CaretLineDownIcon, CaretLineUpIcon, @@ -18,11 +19,13 @@ import RunButton from "@/oss/components/Playground/assets/RunButton" import TypingIndicator from "@/oss/components/Playground/assets/TypingIndicator" import TestsetDrawerButton from "@/oss/components/Playground/Components/Drawers/TestsetDrawer" import {allGenerationsCollapsedAtom} from "@/oss/components/Playground/Components/PlaygroundGenerations/assets/GenerationHeader/store" +import {useRepetitionResult} from "@/oss/components/Playground/hooks/useRepetitionResult" import {generationInputRowIdsAtom} from "@/oss/components/Playground/state/atoms/generationProperties" import {deleteGenerationInputRowMutationAtom} from "@/oss/components/Playground/state/atoms/mutations/input/deleteInputRow" import {duplicateGenerationInputRowMutationAtom} from "@/oss/components/Playground/state/atoms/mutations/input/duplicateInputRow" import {inputRowIdsAtom} from "@/oss/state/generation/entities" import {variableIdsUnifiedAtomFamily} from "@/oss/state/newPlayground/generation/selectors" +import {openPlaygroundFocusDrawerAtom} from "@/oss/state/playgroundFocusDrawerAtom" import {ClickRunPlaceholder} from "../ResultPlaceholder" @@ -56,12 +59,19 @@ const SingleView = ({ cancelRow, containerClassName, }: Props) => { - const variableIds = useAtomValue( + const variableIds = useAtom( useMemo( () => variableIdsUnifiedAtomFamily({rowId, revisionId: variantId}), [rowId, variantId], ), - ) as string[] + )[0] as string[] + + const openFocusDrawer = useSetAtom(openPlaygroundFocusDrawerAtom) + const {currentResult, repetitionProps} = useRepetitionResult({ + rowId, + variantId, + result, + }) const inputRowIds = useAtomValue(generationInputRowIdsAtom) as string[] const allInputRowIds = useAtomValue(inputRowIdsAtom) as string[] @@ -121,6 +131,13 @@ const SingleView = ({ )}
+ } + size="small" + type="text" + onClick={() => openFocusDrawer({rowId, variantId})} + tooltipProps={{title: "Expand results"}} + /> } type="text" @@ -183,6 +200,13 @@ const SingleView = ({ )}
+ } + size="small" + type="text" + onClick={() => openFocusDrawer({rowId, variantId})} + tooltipProps={{title: "View all repeats"}} + /> } type="text" @@ -296,13 +320,23 @@ const SingleView = ({ > {isBusy ? ( - ) : !result ? ( + ) : !currentResult ? ( - ) : result.error ? ( - - ) : result.response ? ( - - ) : null} + ) : ( +
+ {currentResult.error ? ( + + ) : currentResult.response ? ( + + ) : null} +
+ )}
) : null} diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationHeader/index.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationHeader/index.tsx index 740a15f678..2831bb420f 100644 --- a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationHeader/index.tsx +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/GenerationHeader/index.tsx @@ -14,6 +14,7 @@ import {runAllChatAtom} from "@/oss/state/newPlayground/chat/actions" import RunButton from "../../../../assets/RunButton" import {usePlaygroundAtoms} from "../../../../hooks/usePlaygroundAtoms" import {generationHeaderDataAtomFamily, triggerWebWorkerTestAtom} from "../../../../state/atoms" +import RunOptionsPopover from "../RunOptionsPopover" import {allGenerationsCollapsedAtom} from "./store" import {useStyles} from "./styles" @@ -40,6 +41,7 @@ const GenerationHeader = ({variantId}: GenerationHeaderProps) => { const triggerTest = useSetAtom(triggerWebWorkerTestAtom) const runAllChat = useSetAtom(runAllChatAtom) const appType = useAtomValue(appTypeAtom) + const completionRowIds = useAtomValue(generationInputRowIdsAtom) as string[] const [isAllCollapsed, setIsAllCollapsed] = useAtom(allGenerationsCollapsedAtom) @@ -111,14 +113,18 @@ const GenerationHeader = ({variantId}: GenerationHeaderProps) => { /> {!isRunning ? ( - - runTests()} - disabled={isRunning} - /> - +
+ + runTests()} + disabled={isRunning} + style={{borderRadius: "6px 0 0 6px"}} + /> + + +
) : ( void + onPrev: () => void + disabled?: boolean +} + +const RepetitionNavigation = ({ + current, + total, + onNext, + onPrev, + disabled, +}: RepetitionNavigationProps) => { + if (total <= 1) return null + + return ( +
+
+ ) +} + +export default RepetitionNavigation diff --git a/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/RunOptionsPopover/index.tsx b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/RunOptionsPopover/index.tsx new file mode 100644 index 0000000000..4188ea05f5 --- /dev/null +++ b/web/oss/src/components/Playground/Components/PlaygroundGenerations/assets/RunOptionsPopover/index.tsx @@ -0,0 +1,84 @@ +import {useCallback, useRef} from "react" + +import {CaretDown} from "@phosphor-icons/react" +import {Button, InputNumber, Popover, Slider, Typography} from "antd" +import {useAtom} from "jotai" + +import {usePostHogAg} from "@/oss/lib/helpers/analytics/hooks/usePostHogAg" +import {repetitionCountAtom} from "@/oss/state/newPlayground/generation/options" + +const RunOptionsPopover = ({isRunning, variantId}: {isRunning: boolean; variantId: string}) => { + const [repetitionCount, setRepetitionCount] = useAtom(repetitionCountAtom) + const posthog = usePostHogAg() + const initialCountRef = useRef(repetitionCount) + + const handleOpenChange = useCallback( + (open: boolean) => { + if (open) { + initialCountRef.current = repetitionCount + } else { + if (repetitionCount !== initialCountRef.current) { + posthog?.capture("playground_repeats_count_changed", {count: repetitionCount}) + } else if (repetitionCount === 1) { + posthog?.capture("playground_repeats_opened_no_change_default", {count: 1}) + } + } + }, + [posthog, repetitionCount], + ) + + const content = ( +
+
+
+ Repeats + setRepetitionCount(val || 1)} + size="small" + className="w-[60px]" + disabled={isRunning} + /> +
+ + Run the same prompt multiple times to reduce variability in results.{" "} + + setRepetitionCount(val)} + disabled={isRunning} + /> +
+
+ ) + + return ( + +