
Commit 839c1ce

♻️ [Tasks] Use snake_case for parameters (+ misc format fixes) (#470)
# TL;DR

- Use `snake_case` for parameters in the JSON schema
- Miscellaneous schema fixes:
  - Some output types were arrays where they should have been objects
  - Fix a typo in a 'description' key that prevented the JSDoc from being generated
  - Use the factorized classification output type for the audio_classification task as well
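For consumers of the generated TypeScript, the rename is purely mechanical: property names in the emitted interfaces now match the JSON schema byte-for-byte. A minimal sketch of the new shape (the `@huggingface/tasks` import path and the re-export of the generated interfaces are assumptions for illustration):

```ts
// Sketch only; import path assumed, not confirmed by this commit.
import type { AudioClassificationParameters } from "@huggingface/tasks";

// Previously generated as `functionToApply` / `topK`:
const params: AudioClassificationParameters = {
	function_to_apply: "softmax", // assumed to be a valid ClassificationOutputTransform value
	top_k: 3,
};
```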
1 parent: 1bbf68a · commit: 839c1ce


55 files changed (+342, -327 lines)


packages/tasks/scripts/inference-codegen.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -57,7 +57,7 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 	indentation: "\t",
 	rendererOptions: {
 		"just-types": true,
-		"nice-property-names": true,
+		"nice-property-names": false,
 		"prefer-unions": true,
 		"prefer-const-values": true,
 		"prefer-unknown": true,
```

packages/tasks/src/tasks/audio-classification/inference.ts

Lines changed: 3 additions & 3 deletions
```diff
@@ -23,11 +23,11 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	functionToApply?: ClassificationOutputTransform;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
@@ -40,7 +40,7 @@ export type AudioClassificationOutput = AudioClassificationOutputElement[];
  */
 export interface AudioClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
```
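Because the parameter names now match the wire format directly, a request body can be written against the generated type with no camelCase-to-snake_case mapping layer. A hypothetical payload sketch (the `parameters` envelope mirrors how the input schemas are structured; everything else is an assumption for illustration):

```ts
// Hypothetical payload sketch; only the parameter names come from this commit.
const body = {
	parameters: {
		function_to_apply: "softmax", // was `functionToApply`
		top_k: 5, // was `topK`
	},
};
```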

packages/tasks/src/tasks/audio-classification/spec/input.json

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,11 +19,11 @@
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
-				"functionToApply": {
+				"function_to_apply": {
 					"title": "AudioClassificationOutputTransform",
 					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
-				"topK": {
+				"top_k": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
 				}
```

packages/tasks/src/tasks/audio-classification/spec/output.json

Lines changed: 1 addition & 12 deletions
```diff
@@ -5,17 +5,6 @@
 	"description": "Outputs for Audio Classification inference",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
```
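With the inline object replaced by a `$ref`, audio classification output elements now share the common `ClassificationOutput` definition (`label` plus `score`) used by the other classification tasks. A sketch of the resulting shape (import path assumed):

```ts
// Sketch; import path assumed.
import type { AudioClassificationOutput } from "@huggingface/tasks";

const output: AudioClassificationOutput = [
	{ label: "dog bark", score: 0.91 },
	{ label: "siren", score: 0.05 },
];
```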

packages/tasks/src/tasks/automatic-speech-recognition/inference.ts

Lines changed: 34 additions & 29 deletions
```diff
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
@@ -17,6 +18,7 @@ export interface AutomaticSpeechRecognitionInput {
 	parameters?: AutomaticSpeechRecognitionParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -30,9 +32,10 @@ export interface AutomaticSpeechRecognitionParameters {
 	/**
 	 * Whether to output corresponding timestamps with the generated text
 	 */
-	returnTimestamps?: boolean;
+	return_timestamps?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Parametrization of the text generation process
  *
@@ -42,18 +45,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-	earlyStopping?: EarlyStoppingUnion;
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-	epsilonCutoff?: number;
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -63,84 +66,74 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-	etaCutoff?: number;
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-	maxLength?: number;
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-	minLength?: number;
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	minNewTokens?: number;
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-	numBeamGroups?: number;
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-	numBeams?: number;
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-	penaltyAlpha?: number;
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
 	 * given the partial text already generated. If set to float < 1, the smallest set of the
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-	useCache?: boolean;
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-export interface AutomaticSpeechRecognitionOutputChunk {
-	/**
-	 * A chunk of text identified by the model
-	 */
-	text: string;
-	/**
-	 * The start and end timestamps corresponding with the text
-	 */
-	timestamps: number[];
-	[property: string]: unknown;
-}
-export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
+
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
-export interface AutomaticSpeechRecognitionOutputElement {
+export interface AutomaticSpeechRecognitionOutput {
 	/**
 	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
 	 * the model.
@@ -152,3 +145,15 @@ export interface AutomaticSpeechRecognitionOutputElement {
 	text: string;
 	[property: string]: unknown;
 }
+
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * A chunk of text identified by the model
+	 */
+	text: string;
+	/**
+	 * The start and end timestamps corresponding with the text
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
```
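The generated type reflects one of the TL;DR fixes: the ASR output spec is now a single object, so `AutomaticSpeechRecognitionOutputElement[]` collapses into `AutomaticSpeechRecognitionOutput`. A sketch of a value conforming to the new type (import path assumed):

```ts
// Sketch; import path assumed.
import type { AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";

const output: AutomaticSpeechRecognitionOutput = {
	text: "hello world",
	chunks: [
		{ text: "hello", timestamps: [0.0, 0.42] },
		{ text: "world", timestamps: [0.48, 0.9] },
	],
};
```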

packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
 			"properties": {
-				"returnTimestamps": {
+				"return_timestamps": {
 					"type": "boolean",
 					"description": "Whether to output corresponding timestamps with the generated text"
 				},
```

packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json

Lines changed: 30 additions & 28 deletions
```diff
@@ -3,34 +3,36 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
 	"title": "AutomaticSpeechRecognitionOutput",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"text": {
-				"type": "string",
-				"description": "The recognized text."
-			},
-			"chunks": {
-				"type": "array",
-				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
-				"items": {
-					"type": "object",
-					"title": "AutomaticSpeechRecognitionOutputChunk",
-					"properties": {
-						"text": { "type": "string", "description": "A chunk of text identified by the model" },
-						"timestamps": {
-							"type": "array",
-							"description": "The start and end timestamps corresponding with the text",
-							"items": { "type": "number" },
-							"minLength": 2,
-							"maxLength": 2
-						}
+	"type": "object",
+	"properties": {
+		"text": {
+			"type": "string",
+			"description": "The recognized text."
+		},
+		"chunks": {
+			"type": "array",
+			"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+			"items": {
+				"type": "object",
+				"title": "AutomaticSpeechRecognitionOutputChunk",
+				"properties": {
+					"text": {
+						"type": "string",
+						"description": "A chunk of text identified by the model"
 					},
-					"required": ["text", "timestamps"]
-				}
+					"timestamps": {
+						"type": "array",
+						"description": "The start and end timestamps corresponding with the text",
+						"items": {
+							"type": "number"
+						},
+						"minLength": 2,
+						"maxLength": 2
+					}
+				},
+				"required": ["text", "timestamps"]
 			}
-		},
-		"required": ["text"]
-	}
+		}
+	},
+	"required": ["text"]
 }
```

packages/tasks/src/tasks/common-definitions.json

Lines changed: 25 additions & 17 deletions
```diff
@@ -43,63 +43,71 @@
 				"type": "number",
 				"description": "The value used to modulate the next token probabilities."
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
 			},
-			"topP": {
+			"top_p": {
 				"type": "number",
 				"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
 			},
-			"typicalP": {
+			"typical_p": {
 				"type": "number",
 				"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
 			},
-			"epsilonCutoff": {
+			"epsilon_cutoff": {
 				"type": "number",
 				"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"etaCutoff": {
+			"eta_cutoff": {
 				"type": "number",
 				"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"maxLength": {
+			"max_length": {
 				"type": "integer",
 				"description": "The maximum length (in tokens) of the generated text, including the input."
 			},
-			"maxNewTokens": {
+			"max_new_tokens": {
 				"type": "integer",
 				"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"minLength": {
+			"min_length": {
 				"type": "integer",
 				"description": "The minimum length (in tokens) of the generated text, including the input."
 			},
-			"minNewTokens": {
+			"min_new_tokens": {
 				"type": "integer",
 				"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"doSample": {
+			"do_sample": {
 				"type": "boolean",
 				"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
 			},
-			"earlyStopping": {
+			"early_stopping": {
 				"description": "Controls the stopping condition for beam-based methods.",
-				"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
-			},
-			"numBeams": {
+				"oneOf": [
+					{
+						"type": "boolean"
+					},
+					{
+						"const": "never",
+						"type": "string"
+					}
+				]
+			},
+			"num_beams": {
 				"type": "integer",
 				"description": "Number of beams to use for beam search."
 			},
-			"numBeamGroups": {
+			"num_beam_groups": {
 				"type": "integer",
 				"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
 			},
-			"penaltyAlpha": {
+			"penalty_alpha": {
 				"type": "number",
 				"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
 			},
-			"useCache": {
+			"use_cache": {
 				"type": "boolean",
 				"description": "Whether the model should use the past last key/values attentions to speed up decoding"
 			}
```
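Since these shared definitions feed the text-generating task schemas, a parameters literal now matches the schema keys one-to-one (and, conveniently, the snake_case kwargs familiar from Python `transformers`). A sketch using the `GenerationParameters` interface generated above (import path assumed):

```ts
// Sketch; import path assumed.
import type { GenerationParameters } from "@huggingface/tasks";

const generation: GenerationParameters = {
	do_sample: true,
	early_stopping: "never", // EarlyStoppingUnion: boolean | "never"
	max_new_tokens: 128,
	num_beams: 4,
	top_k: 50,
	top_p: 0.95,
};
```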
