modelpack · aftersnow · Aug 27, 2025 · Jul 30, 2025 · Jul 30, 2025 · Jul 31, 2025
diff --git a/docs/config.md b/docs/config.md
@@ -149,11 +149,13 @@ The following terms are used in this section:
 
   - **inputTypes** _array of string_, OPTIONAL
 
-    Input types that the model supports, such as "text", "image", "audio", "video", etc.
+    An array of strings specifying the data types that the model can accept as input.
+    The allowed values are: "text", "image", "audio", "video", or "embedding". For input types that are not explicitly defined, the value "other" should be used.
 
   - **outputTypes** _array of string_, OPTIONAL
 
-    Output types that the model supports, such as "text", "image", "audio", "video", etc.
+    An array of strings specifying the data types that the model can produce as output.
+    The allowed values are: "text", "image", "audio", "video", or "embedding". For output types that are not explicitly defined, the value "other" should be used.
 
   - **knowledgeCutoff** _string_, OPTIONAL
 
@@ -167,10 +169,6 @@ The following terms are used in this section:
 
     Whether the model can use external tools or APIs to perform tasks.
 
-  - **embedding** _boolean_, OPTIONAL
-
-    Whether the model can perform embedding tasks.
-
   - **reward** _boolean_, OPTIONAL
 
     Whether the model is a reward model.
@@ -220,7 +218,6 @@ Here is an example model artifact configuration JSON document:
       "knowledgeCutoff": "2024-05-21T00:00:00Z",
       "reasoning": true,
       "toolUsage": false,
-      "embedding": false,
       "reward": false,
       "languages": ["en", "zh"]
     }

diff --git a/specs-go/v1/config.go b/specs-go/v1/config.go
@@ -98,12 +98,34 @@ type ModelDescriptor struct {
 type Modality string
 
 const (
-	TextModality      Modality = "text"
-	ImageModality     Modality = "image"
-	AudioModality     Modality = "audio"
-	VideoModality     Modality = "video"
+	// TextModality indicates that the model can process or generate text.
+	// If present in InputTypes, the model accepts text as input.
+	// If present in OutputTypes, the model produces text as output.
+	TextModality Modality = "text"
+
+	// ImageModality indicates that the model can process or generate images.
+	// If present in InputTypes, the model accepts images as input.
+	// If present in OutputTypes, the model produces images as output.
+	ImageModality Modality = "image"
+
+	// AudioModality indicates that the model can process or generate audio.
+	// If present in InputTypes, the model accepts audio as input.
+	// If present in OutputTypes, the model produces audio as output.
+	AudioModality Modality = "audio"
+
+	// VideoModality indicates that the model can process or generate video.
+	// If present in InputTypes, the model accepts video as input.
+	// If present in OutputTypes, the model produces video as output.
+	VideoModality Modality = "video"
+
+	// EmbeddingModality indicates that the model can process or generate embeddings.
+	// If present in InputTypes, the model accepts embeddings as input.
+	// If present in OutputTypes, the model produces embeddings as output.
+	// For a dedicated "embedding model", EmbeddingModality should be present in its OutputTypes.
 	EmbeddingModality Modality = "embedding"
-	OtherModality     Modality = "other"
+
+	// OtherModality indicates that the model supports a modality not explicitly listed.
+	OtherModality Modality = "other"
 )
 
 // ModelCapabilities defines the special capabilities that the model supports
@@ -124,9 +146,6 @@ type ModelCapabilities struct {
 	// such as a calculator, a search engine, etc.
 	ToolUsage *bool `json:"toolUsage,omitempty"`
 
-	// Embedding indicates whether the model can perform embedding tasks
-	Embedding *bool `json:"embedding,omitempty"`
-
 	// Reward indicates whether the model is a reward model
 	Reward *bool `json:"reward,omitempty"`