
Commit 839c1ce

♻️ [Tasks] Use snake_case for parameters (+ misc format fixes) (#470)
# TL;DR

- Use `snake_case` for parameters in the JSON schema
- Miscellaneous schema fixes:
  - Some output types were arrays where they should have been objects
  - Fix a typo in a 'description' key that prevented the JSDoc from being generated
  - Use the factorized classification output type for the audio_classification task as well
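For consumers of the generated TypeScript, the rename is purely mechanical: property names in the emitted interfaces now match the JSON schema byte-for-byte. A minimal sketch of the new shape (the `@huggingface/tasks` import path and the re-export of the generated interfaces are assumptions for illustration):

```ts
// Sketch only; import path assumed, not confirmed by this commit.
import type { AudioClassificationParameters } from "@huggingface/tasks";

// Previously generated as `functionToApply` / `topK`:
const params: AudioClassificationParameters = {
	function_to_apply: "softmax", // assumed to be a valid ClassificationOutputTransform value
	top_k: 3,
};
```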
1 parent: 1bbf68a · commit: 839c1ce


55 files changed (+342, -327 lines)


packages/tasks/scripts/inference-codegen.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -57,7 +57,7 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 	indentation: "\t",
 	rendererOptions: {
 		"just-types": true,
-		"nice-property-names": true,
+		"nice-property-names": false,
 		"prefer-unions": true,
 		"prefer-const-values": true,
 		"prefer-unknown": true,
```

packages/tasks/src/tasks/audio-classification/inference.ts

Lines changed: 3 additions & 3 deletions
```diff
@@ -23,11 +23,11 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	functionToApply?: ClassificationOutputTransform;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
@@ -40,7 +40,7 @@ export type AudioClassificationOutput = AudioClassificationOutputElement[];
  */
 export interface AudioClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
```
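Because the parameter names now match the wire format directly, a request body can be written against the generated type with no camelCase-to-snake_case mapping layer. A hypothetical payload sketch (the `parameters` envelope mirrors how the input schemas are structured; everything else is an assumption for illustration):

```ts
// Hypothetical payload sketch; only the parameter names come from this commit.
const body = {
	parameters: {
		function_to_apply: "softmax", // was `functionToApply`
		top_k: 5, // was `topK`
	},
};
```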

packages/tasks/src/tasks/audio-classification/spec/input.json

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,11 +19,11 @@
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
-				"functionToApply": {
+				"function_to_apply": {
 					"title": "AudioClassificationOutputTransform",
 					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
-				"topK": {
+				"top_k": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
 				}
```

packages/tasks/src/tasks/audio-classification/spec/output.json

Lines changed: 1 addition & 12 deletions
```diff
@@ -5,17 +5,6 @@
 	"description": "Outputs for Audio Classification inference",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
```
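With the inline object replaced by a `$ref`, audio classification output elements now share the common `ClassificationOutput` definition (`label` plus `score`) used by the other classification tasks. A sketch of the resulting shape (import path assumed):

```ts
// Sketch; import path assumed.
import type { AudioClassificationOutput } from "@huggingface/tasks";

const output: AudioClassificationOutput = [
	{ label: "dog bark", score: 0.91 },
	{ label: "siren", score: 0.05 },
];
```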

packages/tasks/src/tasks/automatic-speech-recognition/inference.ts

Lines changed: 34 additions & 29 deletions
```diff
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
@@ -17,6 +18,7 @@ export interface AutomaticSpeechRecognitionInput {
 	parameters?: AutomaticSpeechRecognitionParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -30,9 +32,10 @@ export interface AutomaticSpeechRecognitionParameters {
 	/**
 	 * Whether to output corresponding timestamps with the generated text
 	 */
-	returnTimestamps?: boolean;
+	return_timestamps?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Parametrization of the text generation process
  *
@@ -42,18 +45,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-	earlyStopping?: EarlyStoppingUnion;
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-	epsilonCutoff?: number;
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -63,84 +66,74 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-	etaCutoff?: number;
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-	maxLength?: number;
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-	minLength?: number;
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	minNewTokens?: number;
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-	numBeamGroups?: number;
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-	numBeams?: number;
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-	penaltyAlpha?: number;
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
 	 * given the partial text already generated. If set to float < 1, the smallest set of the
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-	useCache?: boolean;
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-export interface AutomaticSpeechRecognitionOutputChunk {
-	/**
-	 * A chunk of text identified by the model
-	 */
-	text: string;
-	/**
-	 * The start and end timestamps corresponding with the text
-	 */
-	timestamps: number[];
-	[property: string]: unknown;
-}
-export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
+
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
-export interface AutomaticSpeechRecognitionOutputElement {
+export interface AutomaticSpeechRecognitionOutput {
 	/**
 	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
 	 * the model.
@@ -152,3 +145,15 @@ export interface AutomaticSpeechRecognitionOutputElement {
 	text: string;
 	[property: string]: unknown;
 }
+
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * A chunk of text identified by the model
+	 */
+	text: string;
+	/**
+	 * The start and end timestamps corresponding with the text
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
```
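The generated type reflects one of the TL;DR fixes: the ASR output spec is now a single object, so `AutomaticSpeechRecognitionOutputElement[]` collapses into `AutomaticSpeechRecognitionOutput`. A sketch of a value conforming to the new type (import path assumed):

```ts
// Sketch; import path assumed.
import type { AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";

const output: AutomaticSpeechRecognitionOutput = {
	text: "hello world",
	chunks: [
		{ text: "hello", timestamps: [0.0, 0.42] },
		{ text: "world", timestamps: [0.48, 0.9] },
	],
};
```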

packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
 			"properties": {
-				"returnTimestamps": {
+				"return_timestamps": {
 					"type": "boolean",
 					"description": "Whether to output corresponding timestamps with the generated text"
 				},
```

packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json

Lines changed: 30 additions & 28 deletions
```diff
@@ -3,34 +3,36 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
 	"title": "AutomaticSpeechRecognitionOutput",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"text": {
-				"type": "string",
-				"description": "The recognized text."
-			},
-			"chunks": {
-				"type": "array",
-				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
-				"items": {
-					"type": "object",
-					"title": "AutomaticSpeechRecognitionOutputChunk",
-					"properties": {
-						"text": { "type": "string", "description": "A chunk of text identified by the model" },
-						"timestamps": {
-							"type": "array",
-							"description": "The start and end timestamps corresponding with the text",
-							"items": { "type": "number" },
-							"minLength": 2,
-							"maxLength": 2
-						}
+	"type": "object",
+	"properties": {
+		"text": {
+			"type": "string",
+			"description": "The recognized text."
+		},
+		"chunks": {
+			"type": "array",
+			"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+			"items": {
+				"type": "object",
+				"title": "AutomaticSpeechRecognitionOutputChunk",
+				"properties": {
+					"text": {
+						"type": "string",
+						"description": "A chunk of text identified by the model"
 					},
-					"required": ["text", "timestamps"]
-				}
+					"timestamps": {
+						"type": "array",
+						"description": "The start and end timestamps corresponding with the text",
+						"items": {
+							"type": "number"
+						},
+						"minLength": 2,
+						"maxLength": 2
+					}
+				},
+				"required": ["text", "timestamps"]
 			}
-		},
-		"required": ["text"]
-	}
+		}
+	},
+	"required": ["text"]
 }
```

packages/tasks/src/tasks/common-definitions.json

Lines changed: 25 additions & 17 deletions
```diff
@@ -43,63 +43,71 @@
 				"type": "number",
 				"description": "The value used to modulate the next token probabilities."
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
 			},
-			"topP": {
+			"top_p": {
 				"type": "number",
 				"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
 			},
-			"typicalP": {
+			"typical_p": {
 				"type": "number",
 				"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
 			},
-			"epsilonCutoff": {
+			"epsilon_cutoff": {
 				"type": "number",
 				"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"etaCutoff": {
+			"eta_cutoff": {
 				"type": "number",
 				"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"maxLength": {
+			"max_length": {
 				"type": "integer",
 				"description": "The maximum length (in tokens) of the generated text, including the input."
 			},
-			"maxNewTokens": {
+			"max_new_tokens": {
 				"type": "integer",
 				"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"minLength": {
+			"min_length": {
 				"type": "integer",
 				"description": "The minimum length (in tokens) of the generated text, including the input."
 			},
-			"minNewTokens": {
+			"min_new_tokens": {
 				"type": "integer",
 				"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"doSample": {
+			"do_sample": {
 				"type": "boolean",
 				"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
 			},
-			"earlyStopping": {
+			"early_stopping": {
 				"description": "Controls the stopping condition for beam-based methods.",
-				"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
-			},
-			"numBeams": {
+				"oneOf": [
+					{
+						"type": "boolean"
+					},
+					{
+						"const": "never",
+						"type": "string"
+					}
+				]
+			},
+			"num_beams": {
 				"type": "integer",
 				"description": "Number of beams to use for beam search."
 			},
-			"numBeamGroups": {
+			"num_beam_groups": {
 				"type": "integer",
 				"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
 			},
-			"penaltyAlpha": {
+			"penalty_alpha": {
 				"type": "number",
 				"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
 			},
-			"useCache": {
+			"use_cache": {
 				"type": "boolean",
 				"description": "Whether the model should use the past last key/values attentions to speed up decoding"
 			}
```
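Since these shared definitions feed the text-generating task schemas, a parameters literal now matches the schema keys one-to-one (and, conveniently, the snake_case kwargs familiar from Python `transformers`). A sketch using the `GenerationParameters` interface generated above (import path assumed):

```ts
// Sketch; import path assumed.
import type { GenerationParameters } from "@huggingface/tasks";

const generation: GenerationParameters = {
	do_sample: true,
	early_stopping: "never", // EarlyStoppingUnion: boolean | "never"
	max_new_tokens: 128,
	num_beams: 4,
	top_k: 50,
	top_p: 0.95,
};
```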
