
Commit 424f280 (1 parent: b9d368f)

Updated Specification and Docs to support Audio Modality.

File tree: 5 files changed (+107 -4 lines)

docs/specification/client/sampling.md
Lines changed: 11 additions & 1 deletion

@@ -8,7 +8,7 @@ weight: 40
 **Protocol Revision**: {{< param protocolRevision >}}
 {{< /callout >}}
 
-The Model Context Protocol (MCP) provides a standardized way for servers to request LLM sampling ("completions" or "generations") from language models via clients. This flow allows clients to maintain control over model access, selection, and permissions while enabling servers to leverage AI capabilities&mdash;with no server API keys necessary. Servers can request text or image-based interactions and optionally include context from MCP servers in their prompts.
+The Model Context Protocol (MCP) provides a standardized way for servers to request LLM sampling ("completions" or "generations") from language models via clients. This flow allows clients to maintain control over model access, selection, and permissions while enabling servers to leverage AI capabilities&mdash;with no server API keys necessary. Servers can request text, audio or image-based interactions and optionally include context from MCP servers in their prompts.
 
 ## User Interaction Model
 

@@ -142,6 +142,16 @@ Sampling messages can contain:
 }
 ```
 
+#### Audio Content
+```json
+{
+  "type": "audio",
+  "data": "base64-encoded-audio-data",
+  "mimeType": "audio/wav"
+}
+```
+
 ### Model Preferences
 
 Model selection in MCP requires careful abstraction since servers and clients may use different AI providers with distinct model offerings. A server cannot simply request a specific model by name since the client may not have access to that exact model or may prefer to use a different provider's equivalent model.
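The audio block added above mirrors the existing image block: raw bytes are base64-encoded and paired with a MIME type. A minimal sketch of how a server might build one, assuming a Node.js environment for `Buffer` (the `makeAudioContent` helper name and sample bytes are illustrative, not part of the spec):

```typescript
// Shape of the new sampling content block, as introduced in the diff above.
interface AudioContent {
  type: "audio";
  data: string;     // base64-encoded audio bytes
  mimeType: string; // e.g. "audio/wav"
}

// Hypothetical helper: wrap raw audio bytes for a sampling request.
function makeAudioContent(bytes: Uint8Array, mimeType: string): AudioContent {
  return {
    type: "audio",
    data: Buffer.from(bytes).toString("base64"), // Node.js base64 encoding
    mimeType,
  };
}

// The first four bytes of a WAV file are the ASCII marker "RIFF".
const clip = makeAudioContent(new Uint8Array([0x52, 0x49, 0x46, 0x46]), "audio/wav");
// clip.data === "UklGRg=="
```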

docs/specification/server/prompts.md
Lines changed: 11 additions & 0 deletions

@@ -189,6 +189,17 @@ Image content allows including visual information in messages:
 ```
 The image data MUST be base64-encoded and include a valid MIME type. This enables multi-modal interactions where visual context is important.
 
+#### Audio Content
+Audio content allows including audio information in messages:
+```json
+{
+  "type": "audio",
+  "data": "base64-encoded-audio-data",
+  "mimeType": "audio/wav"
+}
+```
+The audio data MUST be base64-encoded and include a valid MIME type. This enables multi-modal interactions where audio context is important.
+
 #### Embedded Resources
 Embedded resources allow referencing server-side resources directly in messages:
 ```json
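Because the audio data MUST be base64-encoded, a client consuming a prompt message has to decode it before playback or further processing. A hedged sketch of the receiving side, assuming Node.js for `Buffer` (the `decodeAudio` helper is hypothetical, not part of the spec):

```typescript
interface AudioContent {
  type: "audio";
  data: string;     // base64-encoded audio
  mimeType: string;
}

// Hypothetical helper: recover the raw bytes from a received audio block,
// rejecting content whose MIME type is not an audio type.
function decodeAudio(content: AudioContent): Uint8Array {
  if (!content.mimeType.startsWith("audio/")) {
    throw new Error(`unexpected MIME type: ${content.mimeType}`);
  }
  return new Uint8Array(Buffer.from(content.data, "base64"));
}

const bytes = decodeAudio({
  type: "audio",
  data: "UklGRg==", // base64 for the ASCII bytes "RIFF"
  mimeType: "audio/wav",
});
// bytes → Uint8Array [0x52, 0x49, 0x46, 0x46]
```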

docs/specification/server/tools.md
Lines changed: 9 additions & 0 deletions

@@ -188,6 +188,15 @@ Tool results can contain multiple content items of different types:
 }
 ```
 
+#### Audio Content
+```json
+{
+  "type": "audio",
+  "data": "base64-encoded-audio-data",
+  "mimeType": "audio/wav"
+}
+```
+
 #### Embedded Resources
 
 [Resources]({{< ref "/specification/server/resources" >}}) **MAY** be embedded, to provide additional context or data, behind a URI that can be subscribed to or fetched again by the client later:
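With audio in the mix, a tool result's content array can carry four content types, distinguished by the `type` field. A sketch of how a client might dispatch on that discriminant (the type aliases are trimmed-down stand-ins for the schema's interfaces, and `summarize` is a hypothetical helper):

```typescript
// Trimmed stand-ins for the schema's content interfaces.
type TextContent = { type: "text"; text: string };
type ImageContent = { type: "image"; data: string; mimeType: string };
type AudioContent = { type: "audio"; data: string; mimeType: string };
type EmbeddedResource = { type: "resource"; resource: { uri: string } };
type ToolContent = TextContent | ImageContent | AudioContent | EmbeddedResource;

// Exhaustive switch on the "type" discriminant; TypeScript narrows each case.
function summarize(item: ToolContent): string {
  switch (item.type) {
    case "text":
      return `text (${item.text.length} chars)`;
    case "image":
      return `image (${item.mimeType})`;
    case "audio":
      return `audio (${item.mimeType})`;
    case "resource":
      return `resource (${item.resource.uri})`;
  }
}

const summaries = [
  { type: "text", text: "done" } as ToolContent,
  { type: "audio", data: "UklGRg==", mimeType: "audio/wav" } as ToolContent,
].map(summarize);
// summaries → ["text (4 chars)", "audio (audio/wav)"]
```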

schema/schema.json
Lines changed: 54 additions & 0 deletions

@@ -25,6 +25,48 @@
       },
       "type": "object"
     },
+    "AudioContent": {
+      "description": "Audio provided to or from an LLM.",
+      "properties": {
+        "annotations": {
+          "properties": {
+            "audience": {
+              "description": "Describes who the intended customer of this object or data is.\n\nIt can include multiple entries to indicate content useful for multiple audiences (e.g., `[\"user\", \"assistant\"]`).",
+              "items": {
+                "$ref": "#/definitions/Role"
+              },
+              "type": "array"
+            },
+            "priority": {
+              "description": "Describes how important this data is for operating the server.\n\nA value of 1 means \"most important,\" and indicates that the data is\neffectively required, while 0 means \"least important,\" and indicates that\nthe data is entirely optional.",
+              "maximum": 1,
+              "minimum": 0,
+              "type": "number"
+            }
+          },
+          "type": "object"
+        },
+        "data": {
+          "description": "The base64-encoded audio data.",
+          "format": "byte",
+          "type": "string"
+        },
+        "mimeType": {
+          "description": "The MIME type of the audio. Different providers may support different audio types.",
+          "type": "string"
+        },
+        "type": {
+          "const": "audio",
+          "type": "string"
+        }
+      },
+      "required": [
+        "data",
+        "mimeType",
+        "type"
+      ],
+      "type": "object"
+    },
     "BlobResourceContents": {
       "properties": {
         "blob": {

@@ -94,6 +136,9 @@
         {
           "$ref": "#/definitions/ImageContent"
         },
+        {
+          "$ref": "#/definitions/AudioContent"
+        },
         {
           "$ref": "#/definitions/EmbeddedResource"
         }

@@ -409,6 +454,9 @@
         },
         {
           "$ref": "#/definitions/ImageContent"
+        },
+        {
+          "$ref": "#/definitions/AudioContent"
         }
       ]
     },

@@ -1349,6 +1397,9 @@
         {
           "$ref": "#/definitions/ImageContent"
         },
+        {
+          "$ref": "#/definitions/AudioContent"
+        },
         {
           "$ref": "#/definitions/EmbeddedResource"
         }

@@ -1718,6 +1769,9 @@
         },
         {
           "$ref": "#/definitions/ImageContent"
+        },
+        {
+          "$ref": "#/definitions/AudioContent"
         }
       ]
     },
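The schema above marks `data`, `mimeType`, and `type` as required and pins `type` to the constant `"audio"`. A minimal structural check mirroring just those constraints (a sketch, not a full JSON Schema validator; the `looksLikeAudioContent` name is illustrative):

```typescript
// Checks only the constraints visible in the AudioContent definition:
// required "data", "mimeType", "type" members and the "audio" const.
function looksLikeAudioContent(value: unknown): boolean {
  if (typeof value !== "object" || value === null) return false;
  const v = value as Record<string, unknown>;
  return (
    v.type === "audio" &&
    typeof v.data === "string" &&
    typeof v.mimeType === "string"
  );
}

const ok = looksLikeAudioContent({
  type: "audio",
  data: "UklGRg==",
  mimeType: "audio/wav",
});
const missingMime = looksLikeAudioContent({ type: "audio", data: "UklGRg==" });
// ok → true, missingMime → false
```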

schema/schema.ts
Lines changed: 22 additions & 3 deletions

@@ -600,7 +600,7 @@ export type Role = "user" | "assistant";
  */
 export interface PromptMessage {
   role: Role;
-  content: TextContent | ImageContent | EmbeddedResource;
+  content: TextContent | ImageContent | AudioContent | EmbeddedResource;
 }
 
 /**

@@ -649,7 +649,7 @@ export interface ListToolsResult extends PaginatedResult {
  * should be reported as an MCP error response.
  */
 export interface CallToolResult extends Result {
-  content: (TextContent | ImageContent | EmbeddedResource)[];
+  content: (TextContent | ImageContent | AudioContent | EmbeddedResource)[];
 
   /**
    * Whether the tool call ended in an error.

@@ -804,7 +804,7 @@ export interface CreateMessageResult extends Result, SamplingMessage {
  */
 export interface SamplingMessage {
   role: Role;
-  content: TextContent | ImageContent;
+  content: TextContent | ImageContent | AudioContent;
 }
 
 /**

@@ -862,6 +862,25 @@ export interface ImageContent extends Annotated {
   mimeType: string;
 }
 
+
+/**
+ * Audio provided to or from an LLM.
+ */
+export interface AudioContent extends Annotated {
+  type: "audio";
+  /**
+   * The base64-encoded audio data.
+   *
+   * @format byte
+   */
+  data: string;
+  /**
+   * The MIME type of the audio. Different providers may support different audio types.
+   */
+  mimeType: string;
+}
+
+
 /**
  * The server's preferences for model selection, requested of the client during sampling.
  *
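Because `AudioContent extends Annotated`, it also inherits the optional `annotations` object (audience and priority) seen in the schema.json hunk. A sketch of a fully annotated value under those definitions (the `Annotated` interface here is a trimmed local copy for illustration, not an import from the schema):

```typescript
type Role = "user" | "assistant";

// Trimmed-down copy of the Annotated base, for illustration only.
interface Annotated {
  annotations?: {
    audience?: Role[]; // who the content is intended for
    priority?: number; // 0 = entirely optional .. 1 = effectively required
  };
}

interface AudioContent extends Annotated {
  type: "audio";
  data: string;     // base64-encoded audio (@format byte)
  mimeType: string; // providers may support different audio types
}

const clip: AudioContent = {
  type: "audio",
  data: "UklGRg==",
  mimeType: "audio/wav",
  annotations: { audience: ["user"], priority: 0.5 },
};
```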
