Skip to content

Commit 5061334

Browse files
committed
Allow users to define task groups in LMEvalJob
Add a new field, TaskGroups, under TaskList to support custom task groups. Users can define a custom task group and specify a list of aggregate metrics for it. In the result JSON, task groups have a dedicated section for their results. Signed-off-by: Yihong Wang <yh.wang@ibm.com>
1 parent 10c0fc3 commit 5061334

File tree

8 files changed

+654
-21
lines changed

8 files changed

+654
-21
lines changed

api/lmes/v1alpha1/lmevaljob_types.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,9 @@ func (c *CustomArtifacts) GetTasks() []CustomArtifact {
181181
// Find details of the Unitxt Recipe here:
182182
// https://www.unitxt.ai/en/latest/unitxt.standard.html#unitxt.standard.StandardRecipe
183183
type TaskRecipe struct {
184+
// The name of the TaskRecipe
185+
// +optional
186+
Name *string `json:"name,omitempty"`
184187
// The Unitxt dataset card
185188
Card Card `json:"card"`
186189
// The Unitxt template
@@ -236,11 +239,39 @@ type CustomTasks struct {
236239
Source CustomTaskSource `json:"source,omitempty"`
237240
}
238241

242+
// Define an aggregate metric using 'mean' aggregation.
243+
type AggregateMetric struct {
244+
// The name of the metric to aggregate
245+
MetricName string `json:"metricName"`
246+
// Weight by size or not. Default value is True
247+
// +optional
248+
// +kubebuilder:default=true
249+
WeightBySize *bool `json:"weightBySize,omitempty"`
250+
}
251+
252+
// +kubebuilder:validation:XValidation:rule="has(self.taskNames) || has(self.taskRecipes)", message="One of taskNames or taskRecipes must be defined"
253+
type TaskGroup struct {
254+
// The name of the task group
255+
Name string `json:"name"`
256+
// TaskNames from lm-eval's task list and/or from custom tasks if CustomTasks is defined
257+
// +optional
258+
TaskNames []string `json:"taskNames,omitempty"`
259+
// Task Recipes specifically for the Unitxt tasks
260+
// +optional
261+
TaskRecipes []TaskRecipe `json:"taskRecipes,omitempty"`
262+
// A list of aggregate metrics to calculate for the task group
263+
// +optional
264+
AggregateMetrics []AggregateMetric `json:"aggregateMetrics,omitempty"`
265+
}
266+
267+
// +kubebuilder:validation:XValidation:rule="has(self.taskNames) || has(self.taskRecipes) || has(self.taskGroups)", message="One of taskNames, taskRecipes, or taskGroups must be defined"
268+
239269
type TaskList struct {
240270
// TaskNames from lm-eval's task list and/or from custom tasks if CustomTasks is defined
241271
TaskNames []string `json:"taskNames,omitempty"`
242272
// Task Recipes specifically for Unitxt
243273
TaskRecipes []TaskRecipe `json:"taskRecipes,omitempty"`
274+
TaskGroups []TaskGroup `json:"taskGroups,omitempty"`
244275
// Custom Unitxt artifacts that can be used in a TaskRecipe
245276
CustomArtifacts *CustomArtifacts `json:"custom,omitempty"`
246277
// CustomTasks is a list of external tasks
@@ -340,6 +371,9 @@ func (t *TaskRecipe) String() string {
340371
if t.DemosPoolSize != nil {
341372
b.WriteString(fmt.Sprintf(",demos_pool_size=%d", *t.DemosPoolSize))
342373
}
374+
if t.Name != nil && *t.Name != "" {
375+
b.WriteString(fmt.Sprintf("|%s", *t.Name))
376+
}
343377
return b.String()
344378
}
345379

api/lmes/v1alpha1/zz_generated.deepcopy.go

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/lmes_driver/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ func (t *strArrayArg) String() string {
5151

5252
var (
5353
taskRecipes strArrayArg
54+
taskGroups strArrayArg
5455
customArtifactArgs strArrayArg
5556
taskNames strArrayArg
5657
copy = flag.String("copy", "", "copy this binary to specified destination path")
@@ -70,6 +71,7 @@ var (
7071

7172
func init() {
7273
flag.Var(&taskRecipes, "task-recipe", "task recipe")
74+
flag.Var(&taskGroups, "task-group", "task group")
7375
flag.Var(&customArtifactArgs, "custom-artifact", "A string contains an artifact's type, name and value. Use | as separator")
7476
flag.Var(&taskNames, "task-name", "A task name for custom tasks")
7577
}
@@ -125,6 +127,7 @@ func main() {
125127
DetectDevice: *detectDevice,
126128
Logger: driverLog,
127129
TaskRecipes: taskRecipes,
130+
TaskGroups: taskGroups,
128131
CustomArtifacts: customArtifacts,
129132
Args: args,
130133
CommPort: *commPort,

config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4819,6 +4819,142 @@ spec:
48194819
type: object
48204820
type: object
48214821
type: object
4822+
taskGroups:
4823+
items:
4824+
properties:
4825+
aggregateMetrics:
4826+
description: A list of aggregate metrics to calculate for
4827+
the task group
4828+
items:
4829+
description: Define an aggregate metric using 'mean' aggregation.
4830+
properties:
4831+
metricName:
4832+
description: The name of the metric to aggregate
4833+
type: string
4834+
weightBySize:
4835+
default: true
4836+
description: Weight by size or not. Default value
4837+
is True
4838+
type: boolean
4839+
required:
4840+
- metricName
4841+
type: object
4842+
type: array
4843+
name:
4844+
description: The name of the task group
4845+
type: string
4846+
taskNames:
4847+
description: TaskNames from lm-eval's task list and/or from
4848+
custom tasks if CustomTasks is defined
4849+
items:
4850+
type: string
4851+
type: array
4852+
taskRecipes:
4853+
description: Task Recipes specifically for the Unitxt tasks
4854+
items:
4855+
description: |-
4856+
Use a task recipe to form a custom task. It maps to the Unitxt Recipe
4857+
Find details of the Unitxt Recipe here:
4858+
https://www.unitxt.ai/en/latest/unitxt.standard.html#unitxt.standard.StandardRecipe
4859+
properties:
4860+
card:
4861+
description: The Unitxt dataset card
4862+
properties:
4863+
custom:
4864+
description: |-
4865+
A JSON string for a custom unitxt card which contains the custom dataset.
4866+
Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_dataset.html#adding-to-the-catalog
4867+
to compose a custom card, store it as a JSON file, and use the JSON content as the value here.
4868+
type: string
4869+
name:
4870+
description: Unitxt card's ID
4871+
type: string
4872+
type: object
4873+
demosPoolSize:
4874+
description: The pool size for the fewshot
4875+
type: integer
4876+
format:
4877+
description: The Unitxt format
4878+
type: string
4879+
loaderLimit:
4880+
description: A limit number of records to load
4881+
type: integer
4882+
metrics:
4883+
description: Metrics
4884+
items:
4885+
properties:
4886+
name:
4887+
description: Unitxt metric id
4888+
type: string
4889+
ref:
4890+
description: |-
4891+
The name of the custom metric in the custom field. Its value is a JSON string
4892+
for a custom Unitxt metric. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_metric.html#adding-a-new-instance-metric
4893+
to compose a custom metric, store it as a JSON file by calling the
4894+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4895+
and use the JSON content as the value here.
4896+
type: string
4897+
type: object
4898+
type: array
4899+
name:
4900+
description: The name of the TaskRecipe
4901+
type: string
4902+
numDemos:
4903+
description: Number of fewshot
4904+
type: integer
4905+
systemPrompt:
4906+
description: The Unitxt System Prompt
4907+
properties:
4908+
name:
4909+
description: Unitxt System Prompt id
4910+
type: string
4911+
ref:
4912+
description: The name of the custom systemPrompt
4913+
in the custom field. Its value is a custom system
4914+
prompt string
4915+
type: string
4916+
type: object
4917+
task:
4918+
description: The Unitxt Task
4919+
properties:
4920+
name:
4921+
description: Unitxt task id
4922+
type: string
4923+
ref:
4924+
description: |-
4925+
The name of the custom task in the custom field. Its value is a JSON string
4926+
for a custom Unitxt task. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_task.html
4927+
to compose a custom task, store it as a JSON file by calling the
4928+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4929+
and use the JSON content as the value here.
4930+
type: string
4931+
type: object
4932+
template:
4933+
description: The Unitxt template
4934+
properties:
4935+
name:
4936+
description: Unitxt template ID
4937+
type: string
4938+
ref:
4939+
description: |-
4940+
The name of the custom template in the custom field. Its value is a JSON string
4941+
for a custom Unitxt template. Use the documentation here: https://www.unitxt.ai/en/latest/docs/adding_template.html
4942+
to compose a custom template, store it as a JSON file by calling the
4943+
add_to_catalog API: https://www.unitxt.ai/en/latest/docs/saving_and_loading_from_catalog.html#adding-assets-to-the-catalog,
4944+
and use the JSON content as the value here.
4945+
type: string
4946+
type: object
4947+
required:
4948+
- card
4949+
type: object
4950+
type: array
4951+
required:
4952+
- name
4953+
type: object
4954+
x-kubernetes-validations:
4955+
- message: One of taskNames or taskRecipes must be defined
4956+
rule: has(self.taskNames) || has(self.taskRecipes)
4957+
type: array
48224958
taskNames:
48234959
description: TaskNames from lm-eval's task list and/or from custom
48244960
tasks if CustomTasks is defined
@@ -4872,6 +5008,9 @@ spec:
48725008
type: string
48735009
type: object
48745010
type: array
5011+
name:
5012+
description: The name of the TaskRecipe
5013+
type: string
48755014
numDemos:
48765015
description: Number of fewshot
48775016
type: integer
@@ -4922,6 +5061,9 @@ spec:
49225061
type: object
49235062
type: array
49245063
type: object
5064+
x-kubernetes-validations:
5065+
- message: One of taskNames, taskRecipes, or taskGroups must be defined
5066+
rule: has(self.taskNames) || has(self.taskRecipes) || has(self.taskGroups)
49255067
required:
49265068
- model
49275069
- taskList

0 commit comments

Comments
 (0)