diff --git a/api/evalhub/v1alpha1/evalhub_types.go b/api/evalhub/v1alpha1/evalhub_types.go index ed5f7c96b..6974d4f47 100644 --- a/api/evalhub/v1alpha1/evalhub_types.go +++ b/api/evalhub/v1alpha1/evalhub_types.go @@ -45,6 +45,12 @@ type EvalHubSpec struct { // +optional Env []corev1.EnvVar `json:"env,omitempty"` + // Providers is the list of OOTB provider names to mount into the deployment. + // Each name must match a provider-name label on a ConfigMap in the operator namespace. + // +kubebuilder:default:={"garak","guidellm","lighteval","lm-evaluation-harness"} + // +optional + Providers []string `json:"providers,omitempty"` + // Database configuration for persistent storage. // When set, the operator configures PostgreSQL via the referenced secret. // When omitted, the service uses its default (in-memory SQLite). diff --git a/api/evalhub/v1alpha1/zz_generated.deepcopy.go b/api/evalhub/v1alpha1/zz_generated.deepcopy.go index 896e60b62..2697d522f 100644 --- a/api/evalhub/v1alpha1/zz_generated.deepcopy.go +++ b/api/evalhub/v1alpha1/zz_generated.deepcopy.go @@ -115,6 +115,11 @@ func (in *EvalHubSpec) DeepCopyInto(out *EvalHubSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.Providers != nil { + in, out := &in.Providers, &out.Providers + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Database != nil { in, out := &in.Database, &out.Database *out = new(DatabaseSpec) diff --git a/config/base/kustomization.yaml b/config/base/kustomization.yaml index 549c25dce..db0f8179e 100644 --- a/config/base/kustomization.yaml +++ b/config/base/kustomization.yaml @@ -6,6 +6,7 @@ resources: - ../rbac - ../manager - ../prometheus + - ../configmaps commonLabels: app.kubernetes.io/part-of: trustyai @@ -56,3 +57,31 @@ vars: apiVersion: v1 fieldref: fieldpath: data.evalHubImage + - name: evalhub-provider-garak-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-garak-image + - name: evalhub-provider-guidellm-image + 
objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-guidellm-image + - name: evalhub-provider-lighteval-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-lighteval-image + - name: evalhub-provider-lm-evaluation-harness-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-lm-evaluation-harness-image diff --git a/config/base/params.env b/config/base/params.env index eef60e4b6..74bc8fef8 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -16,4 +16,8 @@ guardrails-orchestrator-image=quay.io/trustyai/ta-guardrails-orchestrator:latest guardrails-built-in-detector-image=quay.io/trustyai/guardrails-detector-built-in:latest guardrails-sidecar-gateway-image=quay.io/trustyai/guardrails-sidecar-gateway:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest -nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest \ No newline at end of file +nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/base/params.yaml b/config/base/params.yaml index 190afea08..fb2998ffd 100644 --- a/config/base/params.yaml +++ b/config/base/params.yaml @@ -2,3 +2,5 @@ varReference: - kind: Deployment path: spec/template/spec/containers[]/image + - kind: ConfigMap + path: data diff --git a/config/configmaps/evalhub/kustomization.yaml b/config/configmaps/evalhub/kustomization.yaml new file mode 100644 index 000000000..807e0ac8c --- /dev/null +++ b/config/configmaps/evalhub/kustomization.yaml @@ -0,0 +1,7 @@ 
+resources: + - provider-garak.yaml + - provider-guidellm.yaml + - provider-lighteval.yaml + - provider-lm-evaluation-harness.yaml + +namespace: system diff --git a/config/configmaps/evalhub/provider-garak.yaml b/config/configmaps/evalhub/provider-garak.yaml new file mode 100644 index 000000000..cdd97aa3f --- /dev/null +++ b/config/configmaps/evalhub/provider-garak.yaml @@ -0,0 +1,80 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-garak + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: garak +data: + garak.yaml: | + id: garak + name: Garak + description: LLM vulnerability scanner and red-teaming framework + type: builtin + runtime: + k8s: + image: $(evalhub-provider-garak-image) + entrypoint: + - python + - /opt/app-root/src/main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + memory_limit: 1Gi + env: + - name: VAR_NAME + value: VALUE + local: null + benchmarks: + - id: toxicity + name: Toxicity Detection + description: Tests model's tendency to generate toxic content + category: safety + metrics: + - toxicity_rate + - severity_score + num_few_shot: 0 + dataset_size: 500 + tags: + - safety + - toxicity + - red_team + - id: bias_detection + name: Bias Detection + description: Evaluates model for various forms of bias + category: fairness + metrics: + - bias_score + - demographic_parity + num_few_shot: 0 + dataset_size: 1000 + tags: + - fairness + - bias + - demographic + - id: pii_leakage + name: PII Leakage + description: Tests for personally identifiable information leakage + category: privacy + metrics: + - pii_leak_rate + - sensitivity_score + num_few_shot: 0 + dataset_size: 300 + tags: + - privacy + - pii + - security + - id: prompt_injection + name: Prompt Injection + description: Tests resilience against prompt injection attacks + category: security + metrics: + - injection_success_rate + - defense_effectiveness + num_few_shot: 0 + dataset_size: 200 + 
tags: + - security + - injection + - adversarial diff --git a/config/configmaps/evalhub/provider-guidellm.yaml b/config/configmaps/evalhub/provider-guidellm.yaml new file mode 100644 index 000000000..e86f5fa85 --- /dev/null +++ b/config/configmaps/evalhub/provider-guidellm.yaml @@ -0,0 +1,130 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-guidellm + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: guidellm +data: + guidellm.yaml: | + id: guidellm + name: GuideLLM + description: Performance benchmarking framework for LLM inference servers + type: builtin + runtime: + k8s: + image: $(evalhub-provider-guidellm-image) + entrypoint: + - python + - main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 1000m + memory_limit: 2Gi + local: null + benchmarks: + - id: sweep + name: Sweep Profile + description: Automatically finds optimal request rate by sweeping from low to high + concurrency + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - throughput + - latency + - guidellm + - auto + - id: throughput + name: Throughput Profile + description: Measures maximum throughput by gradually increasing request rate until + saturation + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - throughput + - guidellm + - saturation + - id: concurrent + name: Concurrent Profile + description: Tests performance with fixed number of concurrent requests + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - concurrency + - guidellm + - id: constant + name: Constant Rate Profile + description: Maintains constant request rate 
throughout benchmark duration + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - constant_rate + - guidellm + - id: poisson + name: Poisson Profile + description: Sends requests following Poisson distribution for realistic production + simulation + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - poisson + - realistic + - guidellm + - id: quick_perf_test + name: Quick Performance Test + description: Fast performance evaluation with sweep profile and limited samples + category: performance + metrics: + - requests_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - quick + - guidellm + - suite + - id: comprehensive_perf_test + name: Comprehensive Performance Test + description: Thorough performance evaluation across all profiles + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - comprehensive + - guidellm + - suite diff --git a/config/configmaps/evalhub/provider-lighteval.yaml b/config/configmaps/evalhub/provider-lighteval.yaml new file mode 100644 index 000000000..c4475a60c --- /dev/null +++ b/config/configmaps/evalhub/provider-lighteval.yaml @@ -0,0 +1,307 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-lighteval + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: lighteval +data: + lighteval.yaml: | + id: lighteval + name: Lighteval + description: Lightweight LLM evaluation framework from Hugging Face + type: builtin + runtime: + k8s: + image: $(evalhub-provider-lighteval-image) + entrypoint: + - python + - main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + 
memory_limit: 1Gi + local: null + benchmarks: + - id: commonsense_reasoning + name: Commonsense Reasoning Suite + description: Suite of commonsense reasoning benchmarks (hellaswag, winogrande, openbookqa, + arc:easy) + category: reasoning + metrics: + - accuracy + - acc_norm + tags: + - reasoning + - commonsense + - lighteval + - suite + - id: scientific_reasoning + name: Scientific Reasoning Suite + description: Scientific reasoning benchmarks (arc:easy, arc:challenge) + category: reasoning + metrics: + - accuracy + - acc_norm + tags: + - reasoning + - science + - lighteval + - suite + - id: physical_commonsense + name: Physical Commonsense Suite + description: Physical commonsense reasoning (piqa) + category: reasoning + metrics: + - accuracy + tags: + - reasoning + - physical + - lighteval + - suite + - id: truthfulness + name: Truthfulness Suite + description: Truthfulness and hallucination benchmarks (truthfulqa:mc, truthfulqa:generation) + category: safety + metrics: + - mc1 + - mc2 + tags: + - safety + - truthfulness + - lighteval + - suite + - id: math + name: Math Suite + description: Mathematical reasoning benchmarks (gsm8k, math:algebra, math:counting_and_probability) + category: math + metrics: + - exact_match + - accuracy + tags: + - math + - reasoning + - lighteval + - suite + - id: knowledge + name: Knowledge Suite + description: Knowledge benchmarks (mmlu, triviaqa) + category: knowledge + metrics: + - accuracy + - acc_norm + tags: + - knowledge + - lighteval + - suite + - id: language_understanding + name: Language Understanding Suite + description: GLUE language understanding tasks (glue:cola, glue:sst2, glue:mrpc) + category: language_understanding + metrics: + - accuracy + - matthews_correlation + - f1 + tags: + - language_understanding + - glue + - lighteval + - suite + - id: hellaswag + name: HellaSwag + description: Commonsense reasoning around everyday activities + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + 
dataset_size: 10042 + tags: + - reasoning + - commonsense + - lighteval + - id: winogrande + name: Winogrande + description: Commonsense reasoning with pronoun resolution + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1267 + tags: + - reasoning + - commonsense + - lighteval + - id: openbookqa + name: OpenBookQA + description: Question answering with open book knowledge + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 500 + tags: + - knowledge + - qa + - lighteval + - id: arc:easy + name: ARC Easy + description: AI2 Reasoning Challenge - Easy subset + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 2376 + tags: + - reasoning + - science + - lighteval + - id: arc:challenge + name: ARC Challenge + description: AI2 Reasoning Challenge - Challenge subset + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 1172 + tags: + - reasoning + - science + - lighteval + - id: piqa + name: PIQA + description: Physical Interaction QA - physical commonsense reasoning + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - physical + - lighteval + - id: truthfulqa:mc + name: TruthfulQA MC + description: Measures truthfulness with multiple choice format + category: safety + metrics: + - mc1 + - mc2 + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - truthfulness + - lighteval + - id: truthfulqa:generation + name: TruthfulQA Generation + description: Measures truthfulness with generation format + category: safety + metrics: + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - truthfulness + - lighteval + - id: gsm8k + name: GSM8K + description: Grade School Math 8K - arithmetic reasoning + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 8 + dataset_size: 1319 + tags: + - math + - reasoning + - lighteval + - id: 
math:algebra + name: MATH Algebra + description: Mathematical reasoning - Algebra subset + category: math + metrics: + - accuracy + num_few_shot: 0 + tags: + - math + - algebra + - lighteval + - id: math:counting_and_probability + name: MATH Counting & Probability + description: Mathematical reasoning - Counting and Probability subset + category: math + metrics: + - accuracy + num_few_shot: 0 + tags: + - math + - probability + - lighteval + - id: mmlu + name: MMLU + description: Massive Multitask Language Understanding - 57 subjects + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 5 + dataset_size: 15908 + tags: + - knowledge + - multitask + - lighteval + - id: triviaqa + name: TriviaQA + description: Large-scale question answering dataset + category: knowledge + metrics: + - accuracy + - exact_match + num_few_shot: 0 + tags: + - knowledge + - qa + - lighteval + - id: glue:cola + name: GLUE CoLA + description: Corpus of Linguistic Acceptability + category: language_understanding + metrics: + - matthews_correlation + num_few_shot: 0 + tags: + - language_understanding + - glue + - lighteval + - id: glue:sst2 + name: GLUE SST-2 + description: Stanford Sentiment Treebank + category: language_understanding + metrics: + - accuracy + num_few_shot: 0 + tags: + - language_understanding + - glue + - sentiment + - lighteval + - id: glue:mrpc + name: GLUE MRPC + description: Microsoft Research Paraphrase Corpus + category: language_understanding + metrics: + - accuracy + - f1 + num_few_shot: 0 + tags: + - language_understanding + - glue + - paraphrase + - lighteval diff --git a/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml b/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml new file mode 100644 index 000000000..68d86d397 --- /dev/null +++ b/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml @@ -0,0 +1,2027 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-lm-evaluation-harness + labels: + 
trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: lm-evaluation-harness +data: + lm_evaluation_harness.yaml: | + id: lm_evaluation_harness + name: LM Evaluation Harness + description: Comprehensive evaluation framework for language models with 167 benchmarks + type: builtin + runtime: + k8s: + image: $(evalhub-provider-lm-evaluation-harness-image) + entrypoint: + - /opt/app-root/bin/python + - /opt/app-root/src/main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + memory_limit: 1Gi + env: + - name: VAR_NAME + value: VALUE + local: null + benchmarks: + - id: arc_easy + name: ARC Easy + description: ARC Easy evaluation benchmark - AI2 Reasoning Challenge (Easy) + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 2376 + tags: + - reasoning + - science + - lm_eval + - id: AraDiCE_boolq_lev + name: Aradice Boolq Lev + description: Aradice Boolq Lev evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - general + - lm_eval + - id: blimp + name: Blimp + description: Blimp evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_anaphor_gender_agreement + name: Blimp Anaphor Gender Agreement + description: Blimp Anaphor Gender Agreement evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_animate_subject_trans + name: Blimp Animate Subject Trans + description: Blimp Animate Subject Trans evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_coordinate_structure_constraint_complex_left_branch + name: Blimp Coordinate Structure Constraint Complex Left Branch + description: Blimp Coordinate Structure Constraint Complex Left 
Branch evaluation + benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_2 + name: Blimp Determiner Noun Agreement 2 + description: Blimp Determiner Noun Agreement 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_with_adj_2 + name: Blimp Determiner Noun Agreement With Adj 2 + description: Blimp Determiner Noun Agreement With Adj 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_with_adjective_1 + name: Blimp Determiner Noun Agreement With Adjective 1 + description: Blimp Determiner Noun Agreement With Adjective 1 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_existential_there_object_raising + name: Blimp Existential There Object Raising + description: Blimp Existential There Object Raising evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_existential_there_subject_raising + name: Blimp Existential There Subject Raising + description: Blimp Existential There Subject Raising evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_intransitive + name: Blimp Intransitive + description: Blimp Intransitive evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_irregular_plural_subject_verb_agreement_1 + name: Blimp Irregular Plural Subject Verb Agreement 1 + description: Blimp Irregular Plural Subject Verb Agreement 1 
evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_left_branch_island_simple_question + name: Blimp Left Branch Island Simple Question + description: Blimp Left Branch Island Simple Question evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_npi_present_2 + name: Blimp Npi Present 2 + description: Blimp Npi Present 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_passive_1 + name: Blimp Passive 1 + description: Blimp Passive 1 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: AraDiCE_ArabicMMLU_egy + name: Aradice Arabicmmlu Egy + description: Aradice Arabicmmlu Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_humanities_history_lev + name: Aradice Arabicmmlu High Humanities History Lev + description: Aradice Arabicmmlu High Humanities History Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_humanities_philosophy_egy + name: Aradice Arabicmmlu High Humanities Philosophy Egy + description: Aradice Arabicmmlu High Humanities Philosophy Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_language_arabic-language_lev + name: Aradice Arabicmmlu High Language Arabic-Language Lev + description: Aradice Arabicmmlu High Language Arabic-Language Lev evaluation benchmark + 
category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_lev + name: Aradice Arabicmmlu Lev + description: Aradice Arabicmmlu Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Middle Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Middle Humanities Islamic-Studies Egy evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_language_arabic-language_lev + name: Aradice Arabicmmlu Middle Language Arabic-Language Lev + description: Aradice Arabicmmlu Middle Language Arabic-Language Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Na Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Na Humanities Islamic-Studies Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_language_arabic-language-general_lev + name: Aradice Arabicmmlu Na Language Arabic-Language-General Lev + description: Aradice Arabicmmlu Na Language Arabic-Language-General Lev evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_other_driving-test_egy + name: Aradice Arabicmmlu Na Other Driving-Test Egy + description: Aradice Arabicmmlu Na Other Driving-Test Egy evaluation benchmark + category: 
knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_other_general-knowledge_lev + name: Aradice Arabicmmlu Na Other General-Knowledge Lev + description: Aradice Arabicmmlu Na Other General-Knowledge Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Primary Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Primary Humanities Islamic-Studies Egy evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_language_arabic-language_lev + name: Aradice Arabicmmlu Primary Language Arabic-Language Lev + description: Aradice Arabicmmlu Primary Language Arabic-Language Lev evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_other_management_egy + name: Aradice Arabicmmlu Univ Other Management Egy + description: Aradice Arabicmmlu Univ Other Management Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_openbookqa_eng + name: Aradice Openbookqa Eng + description: Aradice Openbookqa Eng evaluation benchmark + category: knowledge + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 500 + tags: + - knowledge + - lm_eval + - id: arabic_leaderboard_arabic_mt_boolq + name: Arabic Leaderboard Arabic Mt Boolq + description: Arabic Leaderboard Arabic Mt Boolq evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + 
tags: + - multilingual + - lm_eval + - id: arabic_leaderboard_arabic_mt_boolq_light + name: Arabic Leaderboard Arabic Mt Boolq Light + description: Arabic Leaderboard Arabic Mt Boolq Light evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - multilingual + - lm_eval + - id: arabic_mt_boolq_light + name: Arabic Mt Boolq Light + description: Arabic Mt Boolq Light evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - multilingual + - lm_eval + - id: leaderboard_bbh_salient_translation_error_detection + name: Leaderboard Bbh Salient Translation Error Detection + description: Leaderboard Bbh Salient Translation Error Detection evaluation benchmark + category: multilingual + metrics: + - bleu + - chrf + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: aclue_ancient_chinese_culture + name: Aclue Ancient Chinese Culture + description: Aclue Ancient Chinese Culture evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: african_flores + name: African Flores + description: African Flores evaluation benchmark + category: multilingual + metrics: + - bleu + - chrf + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli-irokobench + name: Afrixnli-Irokobench + description: Afrixnli-Irokobench evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_amh_prompt_2 + name: Afrixnli Amh Prompt 2 + description: Afrixnli Amh Prompt 2 evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_amh_prompt_5 + name: Afrixnli Amh Prompt 5 + description: Afrixnli Amh Prompt 5 evaluation 
benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_ewe + name: Afrixnli En Direct Ewe + description: Afrixnli En Direct Ewe evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_ibo + name: Afrixnli En Direct Ibo + description: Afrixnli En Direct Ibo evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_lug + name: Afrixnli En Direct Lug + description: Afrixnli En Direct Lug evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_sot + name: Afrixnli En Direct Sot + description: Afrixnli En Direct Sot evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_wol + name: Afrixnli En Direct Wol + description: Afrixnli En Direct Wol evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_zul + name: Afrixnli En Direct Zul + description: Afrixnli En Direct Zul evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_stem_math_egy + name: Aradice Arabicmmlu Primary Stem Math Egy + description: Aradice Arabicmmlu Primary Stem Math Egy evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_college_mathematics_light + name: Arabic 
Leaderboard Arabic Mmlu College Mathematics Light + description: Arabic Leaderboard Arabic Mmlu College Mathematics Light evaluation + benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_high_school_mathematics + name: Arabic Leaderboard Arabic Mmlu High School Mathematics + description: Arabic Leaderboard Arabic Mmlu High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: cmmlu_college_mathematics + name: Cmmlu College Mathematics + description: Cmmlu College Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: cmmlu_high_school_mathematics + name: Cmmlu High School Mathematics + description: Cmmlu High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_am_high_school_mathematics + name: Global Mmlu Full Am High School Mathematics + description: Global Mmlu Full Am High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_ar_high_school_mathematics + name: Global Mmlu Full Ar High School Mathematics + description: Global Mmlu Full Ar High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_bn_high_school_mathematics + name: Global Mmlu Full Bn High School Mathematics + description: Global Mmlu Full Bn High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy 
+ num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_cs_high_school_mathematics + name: Global Mmlu Full Cs High School Mathematics + description: Global Mmlu Full Cs High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_de_high_school_mathematics + name: Global Mmlu Full De High School Mathematics + description: Global Mmlu Full De High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_el_high_school_mathematics + name: Global Mmlu Full El High School Mathematics + description: Global Mmlu Full El High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_en_high_school_mathematics + name: Global Mmlu Full En High School Mathematics + description: Global Mmlu Full En High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_es_high_school_mathematics + name: Global Mmlu Full Es High School Mathematics + description: Global Mmlu Full Es High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_fa_high_school_mathematics + name: Global Mmlu Full Fa High School Mathematics + description: Global Mmlu Full Fa High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_fil_high_school_mathematics + name: Global 
Mmlu Full Fil High School Mathematics + description: Global Mmlu Full Fil High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: AraDiCE_piqa_lev + name: Aradice Piqa Lev + description: Aradice Piqa Lev evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: AraDiCE_winogrande_eng + name: Aradice Winogrande Eng + description: Aradice Winogrande Eng evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1267 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_copa + name: Arabic Leaderboard Arabic Mt Copa + description: Arabic Leaderboard Arabic Mt Copa evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_copa_light + name: Arabic Leaderboard Arabic Mt Copa Light + description: Arabic Leaderboard Arabic Mt Copa Light evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_hellaswag + name: Arabic Leaderboard Arabic Mt Hellaswag + description: Arabic Leaderboard Arabic Mt Hellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_hellaswag_light + name: Arabic Leaderboard Arabic Mt Hellaswag Light + description: Arabic Leaderboard Arabic Mt Hellaswag Light evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_piqa + name: Arabic Leaderboard Arabic Mt Piqa + 
description: Arabic Leaderboard Arabic Mt Piqa evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_piqa_light + name: Arabic Leaderboard Arabic Mt Piqa Light + description: Arabic Leaderboard Arabic Mt Piqa Light evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: arabic_mt_hellaswag + name: Arabic Mt Hellaswag + description: Arabic Mt Hellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_mt_piqa + name: Arabic Mt Piqa + description: Arabic Mt Piqa evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: copa_ar + name: Copa Ar + description: Copa Ar evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: copal_id_colloquial + name: Copal Id Colloquial + description: Copal Id Colloquial evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: darijahellaswag + name: Darijahellaswag + description: Darijahellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: egyhellaswag + name: Egyhellaswag + description: Egyhellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: hellaswag_ar + name: Hellaswag Ar + description: Hellaswag Ar evaluation benchmark + category: reasoning 
+ metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_race + name: Arabic Leaderboard Arabic Mt Race + description: Arabic Leaderboard Arabic Mt Race evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: arabic_leaderboard_arabic_mt_race_light + name: Arabic Leaderboard Arabic Mt Race Light + description: Arabic Leaderboard Arabic Mt Race Light evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: arabic_mt_race_light + name: Arabic Mt Race Light + description: Arabic Mt Race Light evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: blimp_drop_argument + name: Blimp Drop Argument + description: Blimp Drop Argument evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 9536 + tags: + - reading_comprehension + - lm_eval + - id: bigbench_gre_reading_comprehension_multiple_choice + name: Bigbench Gre Reading Comprehension Multiple Choice + description: Bigbench Gre Reading Comprehension Multiple Choice evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: eus_reading + name: Eus Reading + description: Eus Reading evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: longbench_qasper + name: Longbench Qasper + description: Longbench Qasper evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + 
- rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: qasper_freeform + name: Qasper Freeform + description: Qasper Freeform evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: ruler_qa_squad + name: Ruler Qa Squad + description: Ruler Qa Squad evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: scrolls_qasper + name: Scrolls Qasper + description: Scrolls Qasper evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: AraDiCE_ArabicMMLU_high_social-science_economics_egy + name: Aradice Arabicmmlu High Social-Science Economics Egy + description: Aradice Arabicmmlu High Social-Science Economics Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_social-science_geography_lev + name: Aradice Arabicmmlu High Social-Science Geography Lev + description: Aradice Arabicmmlu High Social-Science Geography Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_stem_computer-science_egy + name: Aradice Arabicmmlu High Stem Computer-Science Egy + description: Aradice Arabicmmlu High Stem Computer-Science Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_stem_physics_lev + name: Aradice Arabicmmlu High Stem Physics Lev + 
description: Aradice Arabicmmlu High Stem Physics Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_civics_egy + name: Aradice Arabicmmlu Middle Social-Science Civics Egy + description: Aradice Arabicmmlu Middle Social-Science Civics Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_economics_lev + name: Aradice Arabicmmlu Middle Social-Science Economics Lev + description: Aradice Arabicmmlu Middle Social-Science Economics Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_social-science_egy + name: Aradice Arabicmmlu Middle Social-Science Social-Science Egy + description: Aradice Arabicmmlu Middle Social-Science Social-Science Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_stem_computer-science_lev + name: Aradice Arabicmmlu Middle Stem Computer-Science Lev + description: Aradice Arabicmmlu Middle Stem Computer-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_social-science_geography_egy + name: Aradice Arabicmmlu Primary Social-Science Geography Egy + description: Aradice Arabicmmlu Primary Social-Science Geography Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: 
AraDiCE_ArabicMMLU_primary_social-science_social-science_lev + name: Aradice Arabicmmlu Primary Social-Science Social-Science Lev + description: Aradice Arabicmmlu Primary Social-Science Social-Science Lev evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_stem_natural-science_lev + name: Aradice Arabicmmlu Primary Stem Natural-Science Lev + description: Aradice Arabicmmlu Primary Stem Natural-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_social-science_accounting_lev + name: Aradice Arabicmmlu Univ Social-Science Accounting Lev + description: Aradice Arabicmmlu Univ Social-Science Accounting Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_social-science_political-science_egy + name: Aradice Arabicmmlu Univ Social-Science Political-Science Egy + description: Aradice Arabicmmlu Univ Social-Science Political-Science Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_stem_computer-science_lev + name: Aradice Arabicmmlu Univ Stem Computer-Science Lev + description: Aradice Arabicmmlu Univ Stem Computer-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_college_biology_light + name: Arabic Leaderboard Arabic Mmlu College Biology Light + description: Arabic Leaderboard Arabic Mmlu College Biology Light evaluation benchmark + category: science + metrics: + - accuracy + - 
acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: agieval_logiqa_zh + name: Agieval Logiqa Zh + description: Agieval Logiqa Zh evaluation benchmark + category: logic_reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 651 + tags: + - logic_reasoning + - lm_eval + - id: bbh + name: Bbh + description: Bbh evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot + name: Bbh Cot Fewshot + description: Bbh Cot Fewshot evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_causal_judgement + name: Bbh Cot Fewshot Causal Judgement + description: Bbh Cot Fewshot Causal Judgement evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_dyck_languages + name: Bbh Cot Fewshot Dyck Languages + description: Bbh Cot Fewshot Dyck Languages evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_hyperbaton + name: Bbh Cot Fewshot Hyperbaton + description: Bbh Cot Fewshot Hyperbaton evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_logical_deduction_three_objects + name: Bbh Cot Fewshot Logical Deduction Three Objects + description: Bbh Cot Fewshot Logical Deduction Three Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_navigate + name: Bbh Cot Fewshot Navigate + description: Bbh 
Cot Fewshot Navigate evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_reasoning_about_colored_objects + name: Bbh Cot Fewshot Reasoning About Colored Objects + description: Bbh Cot Fewshot Reasoning About Colored Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_snarks + name: Bbh Cot Fewshot Snarks + description: Bbh Cot Fewshot Snarks evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_tracking_shuffled_objects_five_objects + name: Bbh Cot Fewshot Tracking Shuffled Objects Five Objects + description: Bbh Cot Fewshot Tracking Shuffled Objects Five Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_web_of_lies + name: Bbh Cot Fewshot Web Of Lies + description: Bbh Cot Fewshot Web Of Lies evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot + name: Bbh Cot Zeroshot + description: Bbh Cot Zeroshot evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot_causal_judgement + name: Bbh Cot Zeroshot Causal Judgement + description: Bbh Cot Zeroshot Causal Judgement evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot_dyck_languages + name: Bbh Cot Zeroshot Dyck Languages + description: Bbh Cot Zeroshot 
Dyck Languages evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_anatomy + name: Arabic Leaderboard Arabic Mmlu Anatomy + description: Arabic Leaderboard Arabic Mmlu Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_clinical_knowledge + name: Arabic Leaderboard Arabic Mmlu Clinical Knowledge + description: Arabic Leaderboard Arabic Mmlu Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_medical_genetics + name: Arabic Leaderboard Arabic Mmlu Medical Genetics + description: Arabic Leaderboard Arabic Mmlu Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_professional_medicine + name: Arabic Leaderboard Arabic Mmlu Professional Medicine + description: Arabic Leaderboard Arabic Mmlu Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: cmmlu_professional_medicine + name: Cmmlu Professional Medicine + description: Cmmlu Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: cmmlu_traditional_chinese_medicine + name: Cmmlu Traditional Chinese Medicine + description: Cmmlu Traditional Chinese Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical 
+ - lm_eval + - id: global_mmlu_full_am_anatomy + name: Global Mmlu Full Am Anatomy + description: Global Mmlu Full Am Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_clinical_knowledge + name: Global Mmlu Full Am Clinical Knowledge + description: Global Mmlu Full Am Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_medical_genetics + name: Global Mmlu Full Am Medical Genetics + description: Global Mmlu Full Am Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_professional_medicine + name: Global Mmlu Full Am Professional Medicine + description: Global Mmlu Full Am Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_anatomy + name: Global Mmlu Full Ar Anatomy + description: Global Mmlu Full Ar Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_clinical_knowledge + name: Global Mmlu Full Ar Clinical Knowledge + description: Global Mmlu Full Ar Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_medical_genetics + name: Global Mmlu Full Ar Medical Genetics + description: Global Mmlu Full Ar Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + 
tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_professional_medicine + name: Global Mmlu Full Ar Professional Medicine + description: Global Mmlu Full Ar Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_bn_anatomy + name: Global Mmlu Full Bn Anatomy + description: Global Mmlu Full Bn Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: lambada_openai + name: Lambada Openai + description: Lambada Openai evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_en + name: Lambada Openai Mt En + description: Lambada Openai Mt En evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_it + name: Lambada Openai Mt It + description: Lambada Openai Mt It evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_stablelm_es + name: Lambada Openai Mt Stablelm Es + description: Lambada Openai Mt Stablelm Es evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_stablelm_nl + name: Lambada Openai Mt Stablelm Nl + description: Lambada Openai Mt Stablelm Nl evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_standard_cloze_yaml 
+ name: Lambada Standard Cloze Yaml + description: Lambada Standard Cloze Yaml evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: paloma_wikitext_103 + name: Paloma Wikitext 103 + description: Paloma Wikitext 103 evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 4358 + tags: + - language_modeling + - lm_eval + - id: pile_arxiv + name: Pile Arxiv + description: Pile Arxiv evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_freelaw + name: Pile Freelaw + description: Pile Freelaw evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_hackernews + name: Pile Hackernews + description: Pile Hackernews evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_openwebtext2 + name: Pile Openwebtext2 + description: Pile Openwebtext2 evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_ubuntu-irc + name: Pile Ubuntu-Irc + description: Pile Ubuntu-Irc evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_youtubesubtitles + name: Pile Youtubesubtitles + description: Pile Youtubesubtitles evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: wikitext + name: Wikitext + description: Wikitext 
evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 4358 + tags: + - language_modeling + - lm_eval + - id: careqa_open_perplexity + name: Careqa Open Perplexity + description: Careqa Open Perplexity evaluation benchmark + category: language_modeling + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 10000 + tags: + - language_modeling + - lm_eval + - id: AraDiCE_truthfulqa_mc1_lev + name: Aradice Truthfulqa Mc1 Lev + description: Aradice Truthfulqa Mc1 Lev evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: metabench_truthfulqa_permute + name: Metabench Truthfulqa Permute + description: Metabench Truthfulqa Permute evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nno_p0 + name: Nortruthfulqa Gen Nno P0 + description: Nortruthfulqa Gen Nno P0 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nno_p3 + name: Nortruthfulqa Gen Nno P3 + description: Nortruthfulqa Gen Nno P3 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nob_p1 + name: Nortruthfulqa Gen Nob P1 + description: Nortruthfulqa Gen Nob P1 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nob_p4 + name: Nortruthfulqa Gen Nob P4 + description: Nortruthfulqa Gen Nob P4 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + 
- lm_eval + - id: nortruthfulqa_mc_nno_p2 + name: Nortruthfulqa Mc Nno P2 + description: Nortruthfulqa Mc Nno P2 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_mc_nob_p0 + name: Nortruthfulqa Mc Nob P0 + description: Nortruthfulqa Mc Nob P0 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_mc_nob_p3 + name: Nortruthfulqa Mc Nob P3 + description: Nortruthfulqa Mc Nob P3 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: tinyTruthfulQA + name: Tinytruthfulqa + description: Tinytruthfulqa evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_gen_ca + name: Truthfulqa-Multi Gen Ca + description: Truthfulqa-Multi Gen Ca evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_gen_eu + name: Truthfulqa-Multi Gen Eu + description: Truthfulqa-Multi Gen Eu evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_mc1_en + name: Truthfulqa-Multi Mc1 En + description: Truthfulqa-Multi Mc1 En evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_mc1_gl + name: Truthfulqa-Multi Mc1 Gl + description: Truthfulqa-Multi Mc1 Gl evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - 
lm_eval + - id: truthfulqa-multi_mc2_es + name: Truthfulqa-Multi Mc2 Es + description: Truthfulqa-Multi Mc2 Es evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: bigbench_code_line_description_multiple_choice + name: Bigbench Code Line Description Multiple Choice + description: Bigbench Code Line Description Multiple Choice evaluation benchmark + category: code + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: ceval-valid_college_programming + name: Ceval-Valid College Programming + description: Ceval-Valid College Programming evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: code2text_javascript + name: Code2Text Javascript + description: Code2Text Javascript evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: code2text_ruby + name: Code2Text Ruby + description: Code2Text Ruby evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: humaneval + name: Humaneval + description: Humaneval evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: humaneval_instruct + name: Humaneval Instruct + description: Humaneval Instruct evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: mbpp + name: MBPP + description: MBPP (Most Basic Python Programming) evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval diff --git a/config/configmaps/kustomization.yaml b/config/configmaps/kustomization.yaml index 424a7733f..8beaf6c86 100644 
--- a/config/configmaps/kustomization.yaml +++ b/config/configmaps/kustomization.yaml @@ -1,4 +1,5 @@ resources: - lmeval.yaml + - evalhub/ namespace: system diff --git a/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml b/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml index 0bd2d4b4f..d3838cf43 100644 --- a/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml +++ b/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml @@ -180,6 +180,18 @@ spec: - name type: object type: array + providers: + default: + - garak + - guidellm + - lighteval + - lm-evaluation-harness + description: |- + Providers is the list of OOTB provider names to mount into the deployment. + Each name must match a provider-name label on a ConfigMap in the operator namespace. + items: + type: string + type: array replicas: default: 1 description: Number of replicas for the eval-hub deployment diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index a25946b6c..5787df1cc 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -18,3 +18,7 @@ guardrails-sidecar-gateway-image=quay.io/opendatahub/vllm-orchestrator-gateway:l ragas-provider-image=quay.io/opendatahub/llama-stack-provider-ragas:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/overlays/rhoai/params.env b/config/overlays/rhoai/params.env index dee252fe0..2f7360c95 100644 --- a/config/overlays/rhoai/params.env +++ b/config/overlays/rhoai/params.env @@ -18,3 +18,7 @@ 
guardrails-sidecar-gateway-image=quay.io/trustyai/guardrails-sidecar-gateway:lat ragas-provider-image=quay.io/trustyai/llama-stack-provider-ragas:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 10e4c9d34..3dc23ef20 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -9,14 +9,14 @@ resources: - auth_proxy_role_binding.yaml - auth_proxy_client_clusterrole.yaml - evalhub/evalhub_auth_reviewer_role.yaml - - evalhub/evalhub_jobs_writer_role.yaml - - evalhub/evalhub_jobs_writer_binding.yaml - - evalhub/evalhub_job_config_role.yaml - evalhub/evalhub_job_config_binding.yaml - - evalhub/evalhub_mlflow_access_role.yaml + - evalhub/evalhub_job_config_role.yaml + - evalhub/evalhub_jobs_writer_binding.yaml + - evalhub/evalhub_jobs_writer_role.yaml - evalhub/evalhub_mlflow_access_binding.yaml - - evalhub/evalhub_mlflow_jobs_role.yaml + - evalhub/evalhub_mlflow_access_role.yaml - evalhub/evalhub_mlflow_jobs_binding.yaml + - evalhub/evalhub_mlflow_jobs_role.yaml - nemoguardrail_editor_role.yaml - nemoguardrail_viewer_role.yaml - trustyaiservice_editor_role.yaml diff --git a/controllers/evalhub/build_test.go b/controllers/evalhub/build_test.go index 711523c77..26e740535 100644 --- a/controllers/evalhub/build_test.go +++ b/controllers/evalhub/build_test.go @@ -68,7 +68,7 @@ func TestBuildDeploymentSpec(t *testing.T) { } t.Run("should build correct deployment spec", func(t *testing.T) { - deploymentSpec, err := 
reconciler.buildDeploymentSpec(ctx, evalHub) + deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHub, nil) require.NoError(t, err) // Check replicas @@ -183,7 +183,7 @@ func TestBuildDeploymentSpec(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - deploymentSpec, err := reconcilerNoConfig.buildDeploymentSpec(ctx, evalHub) + deploymentSpec, err := reconcilerNoConfig.buildDeploymentSpec(ctx, evalHub, nil) require.Error(t, err) // Should return empty deployment spec (zero value) on error assert.Equal(t, appsv1.DeploymentSpec{}, deploymentSpec) @@ -201,7 +201,7 @@ func TestBuildDeploymentSpec(t *testing.T) { }, } - deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHubNoReplicas) + deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHubNoReplicas, nil) require.NoError(t, err) // Should use default replicas (1) diff --git a/controllers/evalhub/configmap.go b/controllers/evalhub/configmap.go index d121e582d..ce26c2e9f 100644 --- a/controllers/evalhub/configmap.go +++ b/controllers/evalhub/configmap.go @@ -16,34 +16,37 @@ import ( "sigs.k8s.io/yaml" ) -// ProviderConfig represents the provider configuration structure -type ProviderConfig struct { - Name string `yaml:"name"` - Type string `yaml:"type"` - Enabled bool `yaml:"enabled"` - Benchmarks []string `yaml:"benchmarks,omitempty"` - Config map[string]string `yaml:"config,omitempty"` +// ServiceConfig represents the service section in config.yaml +type ServiceConfig struct { + Port int `json:"port"` + ReadyFile string `json:"ready_file"` + TerminationFile string `json:"termination_file"` } // DatabaseConfig represents the database configuration in config.yaml type DatabaseConfig struct { - Driver string `yaml:"driver"` - MaxOpenConns int `yaml:"max_open_conns,omitempty"` - MaxIdleConns int `yaml:"max_idle_conns,omitempty"` + Driver string `json:"driver"` + URL string `json:"url,omitempty"` + MaxOpenConns int `json:"max_open_conns,omitempty"` + MaxIdleConns int 
`json:"max_idle_conns,omitempty"` } // SecretsMapping represents the secrets mapping configuration in config.yaml type SecretsMapping struct { - Dir string `yaml:"dir"` - Mappings map[string]string `yaml:"mappings"` + Dir string `json:"dir"` + Mappings map[string]string `json:"mappings"` } +// EnvMappings maps environment variable names to config field paths +type EnvMappings map[string]string + // EvalHubConfig represents the eval-hub configuration structure type EvalHubConfig struct { - Providers []ProviderConfig `yaml:"providers"` - Collections []string `yaml:"collections,omitempty"` - Database *DatabaseConfig `yaml:"database,omitempty"` - Secrets *SecretsMapping `yaml:"secrets,omitempty"` + Service ServiceConfig `json:"service"` + Secrets *SecretsMapping `json:"secrets,omitempty"` + EnvMappings EnvMappings `json:"env_mappings"` + Database *DatabaseConfig `json:"database"` + Prometheus map[string]any `json:"prometheus,omitempty"` } // reconcileConfigMap creates or updates the ConfigMap for EvalHub configuration @@ -91,69 +94,30 @@ func (r *EvalHubReconciler) reconcileConfigMap(ctx context.Context, instance *ev // generateConfigData generates the configuration data for the ConfigMap func (r *EvalHubReconciler) generateConfigData(instance *evalhubv1alpha1.EvalHub) (map[string]string, error) { config := EvalHubConfig{ - Providers: make([]ProviderConfig, 0), - Collections: []string{}, - } - - // Default providers configuration set by the controller - config.Providers = []ProviderConfig{ - { - Name: "lm-eval-harness", - Type: "lm_evaluation_harness", - Enabled: true, - Benchmarks: []string{ - "arc_challenge", "hellaswag", "mmlu", "truthfulqa", - }, - Config: map[string]string{ - "batch_size": "8", - "max_length": "2048", - }, + Service: ServiceConfig{ + Port: containerPort, + ReadyFile: "/tmp/repo-ready", + TerminationFile: "/tmp/termination-log", }, - { - Name: "ragas-provider", - Type: "ragas", - Enabled: true, - Benchmarks: []string{ - "faithfulness", 
"answer_relevancy", "context_precision", "context_recall", - }, - Config: map[string]string{ - "llm_model": "gpt-3.5-turbo", - "embeddings_model": "text-embedding-ada-002", - }, + EnvMappings: EnvMappings{ + "PORT": "service.port", + "DB_URL": "database.url", + "MLFLOW_TRACKING_URI": "mlflow.tracking_uri", + "MLFLOW_CA_CERT_PATH": "mlflow.ca_cert_path", + "MLFLOW_INSECURE_SKIP_VERIFY": "mlflow.insecure_skip_verify", + "MLFLOW_TOKEN_PATH": "mlflow.token_path", + "MLFLOW_WORKSPACE": "mlflow.workspace", }, - { - Name: "garak-security", - Type: "garak", - Enabled: false, - Benchmarks: []string{ - "encoding", "injection", "malware", "prompt_injection", - }, - Config: map[string]string{ - "probe_set": "basic", - }, + Database: &DatabaseConfig{ + Driver: "sqlite", + URL: "file::eval_hub:?mode=memory&cache=shared", }, - { - Name: "trustyai-custom", - Type: "trustyai_custom", - Enabled: true, - Benchmarks: []string{ - "bias_detection", "fairness_metrics", - }, - Config: map[string]string{ - "bias_threshold": "0.1", - }, + Prometheus: map[string]any{ + "enabled": true, }, } - // Default collections - config.Collections = []string{ - "healthcare_safety_v1", - "automotive_safety_v1", - "finance_compliance_v1", - "general_llm_eval_v1", - } - - // Conditionally add database configuration + // Override database configuration when explicitly configured if instance.Spec.IsDatabaseConfigured() { maxOpen, maxIdle := dbDefaultMaxOpen, dbDefaultMaxIdle if instance.Spec.Database.MaxOpenConns > 0 { @@ -179,31 +143,11 @@ func (r *EvalHubReconciler) generateConfigData(instance *evalhubv1alpha1.EvalHub return nil, err } - // Generate providers.yaml content - providersYAML, err := r.generateProvidersYAML(config.Providers) - if err != nil { - return nil, err - } - return map[string]string{ - "config.yaml": string(configYAML), - "providers.yaml": providersYAML, + "config.yaml": string(configYAML), }, nil } -// generateProvidersYAML generates the providers.yaml configuration -func (r 
*EvalHubReconciler) generateProvidersYAML(providers []ProviderConfig) (string, error) { - providersData := make(map[string]interface{}) - providersData["providers"] = providers - - yamlData, err := yaml.Marshal(providersData) - if err != nil { - return "", err - } - - return string(yamlData), nil -} - // getImageFromConfigMap gets a required image value from the operator's ConfigMap // Returns error if ConfigMap is not found, key is missing, or value is empty // This ensures explicit configuration and prevents deployment with unconfigured images @@ -373,6 +317,95 @@ func (r *EvalHubReconciler) generateProxyConfigData(instance *evalhubv1alpha1.Ev } } +// reconcileProviderConfigMaps copies provider ConfigMaps from the operator namespace to the +// EvalHub CR's namespace. Only providers listed in instance.Spec.Providers are copied. +// Each source ConfigMap is discovered by the labels: +// - trustyai.opendatahub.io/evalhub-provider-type=system +// - trustyai.opendatahub.io/evalhub-provider-name=<provider-name> +// +// Returns the list of created ConfigMap names (for building projected volumes).
+func (r *EvalHubReconciler) reconcileProviderConfigMaps(ctx context.Context, instance *evalhubv1alpha1.EvalHub) ([]string, error) { + if len(instance.Spec.Providers) == 0 { + return nil, nil + } + + log := log.FromContext(ctx) + log.Info("Reconciling Provider ConfigMaps", "instance", instance.Name, "providers", instance.Spec.Providers) + + var cmNames []string + for _, providerName := range instance.Spec.Providers { + // Look up the source ConfigMap by both labels + var sourceList corev1.ConfigMapList + if err := r.List(ctx, &sourceList, + client.InNamespace(r.Namespace), + client.MatchingLabels{ + providerLabel: "system", + providerNameLabel: providerName, + }); err != nil { + return nil, fmt.Errorf("failed to list provider ConfigMaps for %q in namespace %s: %w", providerName, r.Namespace, err) + } + if len(sourceList.Items) == 0 { + return nil, fmt.Errorf("provider %q not found: no ConfigMap with label %s=%s in namespace %s", + providerName, providerNameLabel, providerName, r.Namespace) + } + + src := &sourceList.Items[0] + targetName := instance.Name + "-provider-" + providerName + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: targetName, + Namespace: instance.Namespace, + }, + } + + // Check if ConfigMap already exists + getErr := r.Get(ctx, client.ObjectKeyFromObject(configMap), configMap) + if getErr != nil && !errors.IsNotFound(getErr) { + return nil, getErr + } + + if errors.IsNotFound(getErr) { + configMap.Data = src.Data + if instance.UID != "" { + if err := controllerutil.SetControllerReference(instance, configMap, r.Scheme); err != nil { + return nil, err + } + } + log.Info("Creating Provider ConfigMap", "name", targetName, "provider", providerName) + if err := r.Create(ctx, configMap); err != nil { + return nil, err + } + } else { + configMap.Data = src.Data + log.Info("Updating Provider ConfigMap", "name", targetName, "provider", providerName) + if err := r.Update(ctx, configMap); err != nil { + return nil, err + } + } + 
+ cmNames = append(cmNames, targetName) + } + + return cmNames, nil +} + +// providerVolumeProjections builds VolumeProjection entries for mounting provider ConfigMaps +// into a single projected volume. +func providerVolumeProjections(cmNames []string) []corev1.VolumeProjection { + var projections []corev1.VolumeProjection + for _, name := range cmNames { + projections = append(projections, corev1.VolumeProjection{ + ConfigMap: &corev1.ConfigMapProjection{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: name, + }, + }, + }) + } + return projections +} + // reconcileServiceCAConfigMap creates or updates the ConfigMap for service CA certificate injection // This ConfigMap is used by jobs to mount the service CA certificate for TLS verification func (r *EvalHubReconciler) reconcileServiceCAConfigMap(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { diff --git a/controllers/evalhub/configmap_test.go b/controllers/evalhub/configmap_test.go index 43d4faac5..fc8d146cc 100644 --- a/controllers/evalhub/configmap_test.go +++ b/controllers/evalhub/configmap_test.go @@ -88,7 +88,6 @@ var _ = Describe("EvalHub ConfigMap", func() { By("Checking required keys exist") Expect(configMap.Data).To(HaveKey("config.yaml")) - Expect(configMap.Data).To(HaveKey("providers.yaml")) }) It("should have valid YAML configuration", func() { @@ -103,173 +102,6 @@ var _ = Describe("EvalHub ConfigMap", func() { var config EvalHubConfig err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - - By("Checking default providers are present") - Expect(config.Providers).To(HaveLen(4)) - - var providerNames []string - for _, provider := range config.Providers { - providerNames = append(providerNames, provider.Name) - } - Expect(providerNames).To(ContainElements( - "lm-eval-harness", "ragas-provider", "garak-security", "trustyai-custom", - )) - - By("Checking default collections are present") - 
Expect(config.Collections).To(ContainElements( - "healthcare_safety_v1", "automotive_safety_v1", - "finance_compliance_v1", "general_llm_eval_v1", - )) - }) - - It("should have valid providers.yaml", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing providers.yaml") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configMap.Data["providers.yaml"]), &providersData) - Expect(err).NotTo(HaveOccurred()) - - By("Checking providers structure") - Expect(providersData).To(HaveKey("providers")) - providers, ok := providersData["providers"].([]interface{}) - Expect(ok).To(BeTrue()) - Expect(providers).To(HaveLen(4)) - }) - - It("should configure lm-eval-harness provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding lm-eval-harness provider") - var lmEvalProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "lm-eval-harness" { - lmEvalProvider = &provider - break - } - } - Expect(lmEvalProvider).NotTo(BeNil()) - - By("Checking lm-eval-harness configuration") - Expect(lmEvalProvider.Type).To(Equal("lm_evaluation_harness")) - Expect(lmEvalProvider.Enabled).To(BeTrue()) - Expect(lmEvalProvider.Benchmarks).To(ContainElements( - "arc_challenge", "hellaswag", "mmlu", "truthfulqa", - )) - Expect(lmEvalProvider.Config["batch_size"]).To(Equal("8")) - Expect(lmEvalProvider.Config["max_length"]).To(Equal("2048")) - }) - - It("should configure ragas provider correctly", 
func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding ragas provider") - var ragasProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "ragas-provider" { - ragasProvider = &provider - break - } - } - Expect(ragasProvider).NotTo(BeNil()) - - By("Checking ragas configuration") - Expect(ragasProvider.Type).To(Equal("ragas")) - Expect(ragasProvider.Enabled).To(BeTrue()) - Expect(ragasProvider.Benchmarks).To(ContainElements( - "faithfulness", "answer_relevancy", "context_precision", "context_recall", - )) - Expect(ragasProvider.Config["llm_model"]).To(Equal("gpt-3.5-turbo")) - Expect(ragasProvider.Config["embeddings_model"]).To(Equal("text-embedding-ada-002")) - }) - - It("should configure garak security provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding garak provider") - var garakProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "garak-security" { - garakProvider = &provider - break - } - } - Expect(garakProvider).NotTo(BeNil()) - - By("Checking garak configuration") - Expect(garakProvider.Type).To(Equal("garak")) - Expect(garakProvider.Enabled).To(BeFalse()) // Disabled by default - Expect(garakProvider.Benchmarks).To(ContainElements( - "encoding", "injection", "malware", 
"prompt_injection", - )) - Expect(garakProvider.Config["probe_set"]).To(Equal("basic")) - }) - - It("should configure trustyai custom provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding trustyai provider") - var trustyaiProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "trustyai-custom" { - trustyaiProvider = &provider - break - } - } - Expect(trustyaiProvider).NotTo(BeNil()) - - By("Checking trustyai configuration") - Expect(trustyaiProvider.Type).To(Equal("trustyai_custom")) - Expect(trustyaiProvider.Enabled).To(BeTrue()) - Expect(trustyaiProvider.Benchmarks).To(ContainElements( - "bias_detection", "fairness_metrics", - )) - Expect(trustyaiProvider.Config["bias_threshold"]).To(Equal("0.1")) }) It("should update existing configmap", func() { @@ -381,7 +213,7 @@ var _ = Describe("EvalHub ConfigMap", func() { Expect(config.Secrets.Mappings).To(HaveKeyWithValue("db-url", "database.url")) }) - It("should omit database and secrets sections when database is not configured", func() { + It("should default to sqlite when database is not explicitly configured", func() { By("Reconciling configmap for standard EvalHub (no DB)") err := reconciler.reconcileConfigMap(ctx, evalHub) Expect(err).NotTo(HaveOccurred()) @@ -399,8 +231,9 @@ var _ = Describe("EvalHub ConfigMap", func() { err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - By("Checking database and secrets are absent") - Expect(config.Database).To(BeNil()) + By("Checking default sqlite database is set and secrets are absent") + 
Expect(config.Database).NotTo(BeNil()) + Expect(config.Database.Driver).To(Equal("sqlite")) Expect(config.Secrets).To(BeNil()) }) }) @@ -413,44 +246,11 @@ var _ = Describe("EvalHub ConfigMap", func() { By("Checking required keys are present") Expect(configData).To(HaveKey("config.yaml")) - Expect(configData).To(HaveKey("providers.yaml")) By("Validating config.yaml content") var config EvalHubConfig err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - Expect(config.Providers).To(HaveLen(4)) - Expect(config.Collections).To(HaveLen(4)) - - By("Validating providers.yaml content") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configData["providers.yaml"]), &providersData) - Expect(err).NotTo(HaveOccurred()) - Expect(providersData).To(HaveKey("providers")) - }) - - It("should generate providers YAML correctly", func() { - By("Generating configuration data") - configData, err := reconciler.generateConfigData(evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Parsing config.yaml to get providers") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Generating providers YAML") - providersYAML, err := reconciler.generateProvidersYAML(config.Providers) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying providers YAML matches configmap data") - Expect(providersYAML).To(Equal(configData["providers.yaml"])) - - By("Verifying providers YAML is valid") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(providersYAML), &providersData) - Expect(err).NotTo(HaveOccurred()) - Expect(providersData).To(HaveKey("providers")) }) }) }) diff --git a/controllers/evalhub/constants.go b/controllers/evalhub/constants.go index 42109092f..e61809760 100644 --- a/controllers/evalhub/constants.go +++ b/controllers/evalhub/constants.go @@ -49,6 +49,15 @@ const ( mlflowTokenMountPath = "/var/run/secrets/mlflow" 
mlflowTokenFile = "token" mlflowTokenExpiration = 3600 // seconds + + // EvalHub config directory (contains config.yaml and providers/ subdir) + configDirPath = "/etc/evalhub/config" + + // Provider ConfigMap configuration + providerLabel = "trustyai.opendatahub.io/evalhub-provider-type" + providerNameLabel = "trustyai.opendatahub.io/evalhub-provider-name" + providersVolumeName = "evalhub-providers" + providersMountPath = configDirPath + "/providers" ) var ( diff --git a/controllers/evalhub/deployment.go b/controllers/evalhub/deployment.go index 9d63877eb..517b7fb8a 100644 --- a/controllers/evalhub/deployment.go +++ b/controllers/evalhub/deployment.go @@ -18,7 +18,7 @@ import ( ) // reconcileDeployment creates or updates the Deployment for EvalHub -func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { +func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *evalhubv1alpha1.EvalHub, providerCMNames []string) error { log := log.FromContext(ctx) log.Info("Reconciling Deployment", "name", instance.Name) @@ -36,7 +36,7 @@ func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *e } // Define the desired deployment spec - desiredSpec, err := r.buildDeploymentSpec(ctx, instance) + desiredSpec, err := r.buildDeploymentSpec(ctx, instance, providerCMNames) if err != nil { return err } @@ -61,7 +61,7 @@ func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *e } // buildDeploymentSpec builds the deployment specification for EvalHub -func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *evalhubv1alpha1.EvalHub) (appsv1.DeploymentSpec, error) { +func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *evalhubv1alpha1.EvalHub, providerCMNames []string) (appsv1.DeploymentSpec, error) { labels := map[string]string{ "app": "eval-hub", "instance": instance.Name, @@ -104,12 +104,8 @@ func (r *EvalHubReconciler) 
buildDeploymentSpec(ctx context.Context, instance *e Value: "3", }, { - Name: "CONFIG_PATH", - Value: "/etc/evalhub/config.yaml", - }, - { - Name: "PROVIDERS_CONFIG_PATH", - Value: "/etc/evalhub/providers.yaml", + Name: "EVAL_HUB_CONFIG_DIR", + Value: configDirPath, }, { Name: "SERVICE_URL", @@ -140,7 +136,7 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e volumeMounts := []corev1.VolumeMount{ { Name: "evalhub-config", - MountPath: "/etc/evalhub", + MountPath: configDirPath, ReadOnly: true, }, { @@ -154,6 +150,13 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e ReadOnly: true, }, } + if len(providerCMNames) > 0 { + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: providersVolumeName, + MountPath: providersMountPath, + ReadOnly: true, + }) + } if instance.Spec.IsDatabaseConfigured() { volumeMounts = append(volumeMounts, corev1.VolumeMount{ Name: dbSecretVolumeName, @@ -343,6 +346,16 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e }, }, } + if len(providerCMNames) > 0 { + volumes = append(volumes, corev1.Volume{ + Name: providersVolumeName, + VolumeSource: corev1.VolumeSource{ + Projected: &corev1.ProjectedVolumeSource{ + Sources: providerVolumeProjections(providerCMNames), + }, + }, + }) + } if instance.Spec.IsDatabaseConfigured() { volumes = append(volumes, corev1.Volume{ Name: dbSecretVolumeName, diff --git a/controllers/evalhub/deployment_test.go b/controllers/evalhub/deployment_test.go index 2c90e67c9..e905cfa7b 100644 --- a/controllers/evalhub/deployment_test.go +++ b/controllers/evalhub/deployment_test.go @@ -75,7 +75,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should create deployment with correct specifications", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Verifying deployment 
exists") @@ -105,7 +105,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure evalhub container correctly", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -135,7 +135,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should set default environment variables", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -171,7 +171,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should include custom environment variables", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -200,7 +200,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure resource requirements", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -234,7 +234,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure health probes", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -273,7 +273,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure security contexts", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -302,7 +302,7 @@ var _ = Describe("EvalHub 
Deployment", func() { It("should update existing deployment", func() { By("Creating initial deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Modifying EvalHub replicas") @@ -312,7 +312,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(err).NotTo(HaveOccurred()) By("Reconciling deployment again") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Verifying deployment is updated") @@ -331,14 +331,14 @@ var _ = Describe("EvalHub Deployment", func() { Expect(err).NotTo(HaveOccurred()) By("Reconciling deployment without config map") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("kube-rbac-proxy configuration error")) }) It("should configure rolling update strategy", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -360,7 +360,7 @@ var _ = Describe("EvalHub Deployment", func() { defer k8sClient.Delete(ctx, dbEvalHub) By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, dbEvalHub) + err := reconciler.reconcileDeployment(ctx, dbEvalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -420,7 +420,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -456,7 +456,7 @@ var _ = Describe("EvalHub Deployment", func() { } By("Attempting to reconcile deployment") - err := 
reconciler.reconcileDeployment(ctx, nonExistentEvalHub) + err := reconciler.reconcileDeployment(ctx, nonExistentEvalHub, nil) Expect(err).To(HaveOccurred()) }) @@ -482,7 +482,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) By("Attempting to reconcile deployment") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("kube-rbac-proxy configuration error")) @@ -500,7 +500,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should include kube-rbac-proxy sidecar container", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -540,7 +540,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy resource requirements", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -573,7 +573,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy security context", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -599,7 +599,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy volume mounts", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -638,7 +638,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure deployment volumes for 
proxy", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -676,7 +676,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure service account for API", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") diff --git a/controllers/evalhub/evalhub_controller.go b/controllers/evalhub/evalhub_controller.go index 3f098d900..9643b9b2b 100644 --- a/controllers/evalhub/evalhub_controller.go +++ b/controllers/evalhub/evalhub_controller.go @@ -145,8 +145,18 @@ func (r *EvalHubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return RequeueWithError(err) } + // Reconcile Provider ConfigMaps (copy from operator namespace to instance namespace) + providerCMNames, err := r.reconcileProviderConfigMaps(ctx, instance) + if err != nil { + log.Error(err, "Failed to reconcile Provider ConfigMaps") + instance.SetStatus("Ready", "Error", fmt.Sprintf("Failed to reconcile Provider ConfigMaps: %v", err), corev1.ConditionFalse) + r.Status().Update(ctx, instance) + return RequeueWithError(err) + } + instance.Status.ActiveProviders = instance.Spec.Providers + // Reconcile Deployment - if err := r.reconcileDeployment(ctx, instance); err != nil { + if err := r.reconcileDeployment(ctx, instance, providerCMNames); err != nil { log.Error(err, "Failed to reconcile Deployment") instance.SetStatus("Ready", "Error", fmt.Sprintf("Failed to reconcile Deployment: %v", err), corev1.ConditionFalse) r.Status().Update(ctx, instance) diff --git a/controllers/evalhub/evalhub_controller_test.go b/controllers/evalhub/evalhub_controller_test.go index a27b0f74c..97a9d46b6 100644 --- a/controllers/evalhub/evalhub_controller_test.go +++ 
b/controllers/evalhub/evalhub_controller_test.go @@ -42,6 +42,11 @@ var _ = Describe("EvalHub Controller", func() { configMap = createConfigMap(configMapName, testNamespace) Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) + // Create source provider ConfigMaps (needed because the CRD default populates providers) + for _, cm := range createDefaultProviderConfigMaps(testNamespace) { + Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) + } + // Create EvalHub instance evalHub = createEvalHubInstance(evalHubName, testNamespace) Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) @@ -180,6 +185,11 @@ var _ = Describe("EvalHub Lifecycle Integration", func() { configMap = createConfigMap(configMapName, testNamespace) Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) + // Create source provider ConfigMaps (needed because the CRD default populates providers) + for _, cm := range createDefaultProviderConfigMaps(testNamespace) { + Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) + } + // Create EvalHub instance evalHub = createEvalHubInstance(evalHubName, testNamespace) Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) @@ -219,7 +229,6 @@ var _ = Describe("EvalHub Lifecycle Integration", func() { By("Checking that ConfigMap is created") configMapCreated := waitForConfigMap(evalHubName+"-config", testNamespace) Expect(configMapCreated.Data).To(HaveKey("config.yaml")) - Expect(configMapCreated.Data).To(HaveKey("providers.yaml")) By("Checking that Deployment is created") deployment := waitForDeployment(evalHubName, testNamespace) diff --git a/controllers/evalhub/suite_test.go b/controllers/evalhub/suite_test.go index 95de06484..459c9e860 100644 --- a/controllers/evalhub/suite_test.go +++ b/controllers/evalhub/suite_test.go @@ -94,7 +94,8 @@ func createEvalHubInstance(name, namespace string) *evalhubv1alpha1.EvalHub { Namespace: namespace, }, Spec: evalhubv1alpha1.EvalHubSpec{ - Replicas: &replicas, + Replicas: &replicas, + Providers: 
[]string{}, Env: []corev1.EnvVar{ { Name: "TEST_ENV", @@ -128,6 +129,30 @@ func createConfigMap(name, namespace string) *corev1.ConfigMap { } } +// createDefaultProviderConfigMaps creates source provider ConfigMaps in the given namespace +// to satisfy the CRD default providers list during integration tests. +func createDefaultProviderConfigMaps(namespace string) []*corev1.ConfigMap { + defaultProviders := []string{"garak", "guidellm", "lighteval", "lm-evaluation-harness"} + var cms []*corev1.ConfigMap + for _, id := range defaultProviders { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "evalhub-provider-" + id, + Namespace: namespace, + Labels: map[string]string{ + providerLabel: "system", + providerNameLabel: id, + }, + }, + Data: map[string]string{ + id + ".yaml": "id: " + id + "\nname: " + id + "\n", + }, + } + cms = append(cms, cm) + } + return cms +} + // setupReconciler creates and returns a EvalHubReconciler for testing func setupReconciler(namespace string) (*EvalHubReconciler, context.Context) { eventRecorder := record.NewFakeRecorder(100) diff --git a/controllers/evalhub/unit_test.go b/controllers/evalhub/unit_test.go index 6d1fbeb3b..c8ed7832c 100644 --- a/controllers/evalhub/unit_test.go +++ b/controllers/evalhub/unit_test.go @@ -73,7 +73,7 @@ func TestEvalHubReconciler_reconcileDeployment(t *testing.T) { } t.Run("should create deployment with correct spec", func(t *testing.T) { - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) require.NoError(t, err) // Verify deployment was created @@ -154,7 +154,7 @@ func TestEvalHubReconciler_reconcileDeployment(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - err := reconcilerNoConfig.reconcileDeployment(ctx, evalHub) + err := reconcilerNoConfig.reconcileDeployment(ctx, evalHub, nil) require.Error(t, err) assert.Contains(t, err.Error(), "kube-rbac-proxy configuration error") }) @@ -266,33 +266,11 @@ func 
TestEvalHubReconciler_reconcileConfigMap(t *testing.T) { // Check data keys exist assert.Contains(t, configMap.Data, "config.yaml") - assert.Contains(t, configMap.Data, "providers.yaml") // Parse and validate config.yaml var config EvalHubConfig err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) require.NoError(t, err) - - // Check providers - assert.Len(t, config.Providers, 4) - providerNames := make([]string, len(config.Providers)) - for i, provider := range config.Providers { - providerNames[i] = provider.Name - } - assert.Contains(t, providerNames, "lm-eval-harness") - assert.Contains(t, providerNames, "ragas-provider") - assert.Contains(t, providerNames, "garak-security") - assert.Contains(t, providerNames, "trustyai-custom") - - // Check collections - assert.Contains(t, config.Collections, "healthcare_safety_v1") - assert.Contains(t, config.Collections, "automotive_safety_v1") - - // Parse and validate providers.yaml - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configMap.Data["providers.yaml"]), &providersData) - require.NoError(t, err) - assert.Contains(t, providersData, "providers") }) } @@ -428,35 +406,11 @@ func TestGenerateConfigData(t *testing.T) { // Check keys exist assert.Contains(t, configData, "config.yaml") - assert.Contains(t, configData, "providers.yaml") // Parse config.yaml var config EvalHubConfig err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) require.NoError(t, err) - - // Verify default providers - assert.Len(t, config.Providers, 4) - - // Find lm-eval-harness provider - var lmEvalProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "lm-eval-harness" { - lmEvalProvider = &provider - break - } - } - require.NotNil(t, lmEvalProvider) - assert.Equal(t, "lm_evaluation_harness", lmEvalProvider.Type) - assert.True(t, lmEvalProvider.Enabled) - assert.Contains(t, lmEvalProvider.Benchmarks, "arc_challenge") - assert.Equal(t, "8", 
lmEvalProvider.Config["batch_size"]) - - // Verify collections - assert.Contains(t, config.Collections, "healthcare_safety_v1") - assert.Contains(t, config.Collections, "automotive_safety_v1") - assert.Contains(t, config.Collections, "finance_compliance_v1") - assert.Contains(t, config.Collections, "general_llm_eval_v1") }) } @@ -1188,7 +1142,7 @@ func TestGenerateConfigData_WithDatabase(t *testing.T) { assert.Equal(t, 10, config.Database.MaxIdleConns) }) - t.Run("should omit database and secrets sections when DB not configured", func(t *testing.T) { + t.Run("should default to sqlite when DB not explicitly configured", func(t *testing.T) { evalHub := &evalhubv1alpha1.EvalHub{ ObjectMeta: metav1.ObjectMeta{ Name: "test-evalhub", @@ -1204,7 +1158,8 @@ func TestGenerateConfigData_WithDatabase(t *testing.T) { err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) require.NoError(t, err) - assert.Nil(t, config.Database) + assert.NotNil(t, config.Database) + assert.Equal(t, "sqlite", config.Database.Driver) assert.Nil(t, config.Secrets) }) } @@ -1256,7 +1211,7 @@ func TestEvalHubReconciler_reconcileDeployment_WithDB(t *testing.T) { } t.Run("should add DB secret volume and mount when database configured", func(t *testing.T) { - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) require.NoError(t, err) deployment := &appsv1.Deployment{} @@ -1330,3 +1285,212 @@ func TestEvalHubHelperMethods_IsDatabaseConfigured(t *testing.T) { assert.True(t, spec.IsDatabaseConfigured()) }) } + +func TestEvalHubReconciler_reconcileProviderConfigMaps(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, appsv1.AddToScheme(scheme)) + require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) + + ctx := context.Background() + operatorNamespace := "operator-ns" + instanceNamespace := "instance-ns" + evalHubName := "test-evalhub" + + // Source provider ConfigMap in 
the operator namespace (simulates what kustomize deploys) + sourceProvider := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "trustyai-service-operator-evalhub-provider-testprovider", + Namespace: operatorNamespace, + Labels: map[string]string{ + providerLabel: "system", + providerNameLabel: "testprovider", + }, + }, + Data: map[string]string{ + "testprovider.yaml": "id: testprovider\nname: Test Provider\nruntime:\n k8s:\n image: quay.io/test/provider:latest\n", + }, + } + + t.Run("should copy provider ConfigMap to instance namespace", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"testprovider"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub, sourceProvider). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + + // Should return the target ConfigMap name + require.Len(t, cmNames, 1) + assert.Equal(t, evalHubName+"-provider-testprovider", cmNames[0]) + + // Verify the ConfigMap was created in the instance namespace with correct data + copiedCM := &corev1.ConfigMap{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-provider-testprovider", + Namespace: instanceNamespace, + }, copiedCM) + require.NoError(t, err) + assert.Equal(t, sourceProvider.Data["testprovider.yaml"], copiedCM.Data["testprovider.yaml"]) + }) + + t.Run("should return nil when no providers specified", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{}, + } + + fakeClient := fake.NewClientBuilder(). 
+ WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + assert.Nil(t, cmNames) + }) + + t.Run("should error when provider not found", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"nonexistent"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.Error(t, err) + assert.Nil(t, cmNames) + assert.Contains(t, err.Error(), "not found") + }) + + t.Run("should mount providers as projected volume in deployment", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"testprovider"}, + }, + } + + operatorConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: operatorNamespace, + }, + Data: map[string]string{ + configMapEvalHubImageKey: "quay.io/test/eval-hub:latest", + configMapKubeRBACProxyImageKey: "gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub, sourceProvider, operatorConfigMap). 
+ Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + // First reconcile provider ConfigMaps + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + require.Len(t, cmNames, 1) + + // Then reconcile deployment with the provider ConfigMap names + err = reconciler.reconcileDeployment(ctx, evalHub, cmNames) + require.NoError(t, err) + + // Verify the deployment has the projected volume + deployment := &appsv1.Deployment{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: evalHubName, + Namespace: instanceNamespace, + }, deployment) + require.NoError(t, err) + + // Find the evalhub-providers projected volume + var providersVolume *corev1.Volume + for i, v := range deployment.Spec.Template.Spec.Volumes { + if v.Name == providersVolumeName { + providersVolume = &deployment.Spec.Template.Spec.Volumes[i] + break + } + } + require.NotNil(t, providersVolume, "evalhub-providers volume should be present") + require.NotNil(t, providersVolume.VolumeSource.Projected) + require.Len(t, providersVolume.VolumeSource.Projected.Sources, 1) + assert.Equal(t, evalHubName+"-provider-testprovider", + providersVolume.VolumeSource.Projected.Sources[0].ConfigMap.Name) + + // Find the providers volume mount on the evalhub container + var evalHubContainer *corev1.Container + for i, c := range deployment.Spec.Template.Spec.Containers { + if c.Name == containerName { + evalHubContainer = &deployment.Spec.Template.Spec.Containers[i] + break + } + } + require.NotNil(t, evalHubContainer) + + var providersMount *corev1.VolumeMount + for i, m := range evalHubContainer.VolumeMounts { + if m.Name == providersVolumeName { + providersMount = &evalHubContainer.VolumeMounts[i] + break + } + } + require.NotNil(t, providersMount, "providers volume mount should be present") + assert.Equal(t, providersMountPath, providersMount.MountPath) + 
assert.True(t, providersMount.ReadOnly) + }) +} diff --git a/hack/sync-evalhub-providers.py b/hack/sync-evalhub-providers.py new file mode 100755 index 000000000..b45cc7cde --- /dev/null +++ b/hack/sync-evalhub-providers.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Fetches provider YAML files from the eval-hub upstream repository and generates +Kubernetes ConfigMap manifests for the operator to deploy. + +Usage: + hack/sync-evalhub-providers.py [branch] + +Arguments: + branch Git branch to fetch from (default: main) +""" + +import json +import sys +import textwrap +import urllib.request +from pathlib import Path + +import yaml + +REPO = "eval-hub/eval-hub" +UPSTREAM_DIR = "config/providers" +OUTPUT_DIR = Path("config/configmaps/evalhub") + +PROVIDER_TYPE_LABEL = "trustyai.opendatahub.io/evalhub-provider-type" +PROVIDER_NAME_LABEL = "trustyai.opendatahub.io/evalhub-provider-name" + +# Files to exclude from the upstream repository (by filename) +EXCLUDE_FILES = { + "ragas.yaml", +} + + +def fetch_json(url: str): + with urllib.request.urlopen(url) as resp: + return json.load(resp) + + +def fetch_text(url: str) -> str: + with urllib.request.urlopen(url) as resp: + return resp.read().decode() + + +def list_yaml_files(branch: str) -> list[str]: + api_url = f"https://api.github.com/repos/{REPO}/contents/{UPSTREAM_DIR}?ref={branch}" + print(f"Fetching provider list from {api_url}") + entries = fetch_json(api_url) + return [e["name"] for e in entries if e["name"].endswith((".yaml", ".yml"))] + + +def process_provider(filename: str, branch: str) -> tuple[str, str] | None: + """Download a provider YAML, replace the image with a kustomize placeholder, + and generate a ConfigMap manifest. 
Returns (cm_filename, provider_id) or None.""" + raw_url = f"https://raw.githubusercontent.com/{REPO}/{branch}/{UPSTREAM_DIR}/{filename}" + content = fetch_text(raw_url) + data = yaml.safe_load(content) + + provider_id = data.get("id") + if not provider_id: + print(f" SKIP: no 'id' field found in {filename}", file=sys.stderr) + return None + + # Sanitize for K8s resource names (RFC 1123: only lowercase alphanumeric and hyphens) + safe_id = provider_id.replace("_", "-") + + cm_file = f"provider-{safe_id}.yaml" + cm_name = f"evalhub-provider-{safe_id}" + var_name = f"evalhub-provider-{safe_id}-image" + + print(f" id={provider_id} -> {cm_file}") + + # Replace runtime.k8s.image with kustomize placeholder + if "runtime" in data and "k8s" in data["runtime"]: + data["runtime"]["k8s"]["image"] = f"$({var_name})" + + provider_yaml = yaml.dump(data, default_flow_style=False, sort_keys=False) + + cm = textwrap.dedent(f"""\ + apiVersion: v1 + kind: ConfigMap + metadata: + name: {cm_name} + labels: + {PROVIDER_TYPE_LABEL}: system + {PROVIDER_NAME_LABEL}: {safe_id} + data: + {filename}: | + """) + indented = textwrap.indent(provider_yaml, " ") + + (OUTPUT_DIR / cm_file).write_text(cm + indented) + return cm_file, safe_id + + +def write_kustomization(cm_files: list[str]): + lines = ["resources:"] + for f in cm_files: + lines.append(f" - {f}") + lines.append("") + lines.append("namespace: system") + lines.append("") + (OUTPUT_DIR / "kustomization.yaml").write_text("\n".join(lines)) + + +def main(): + branch = sys.argv[1] if len(sys.argv) > 1 else "main" + + filenames = list_yaml_files(branch) + if not filenames: + print(f"ERROR: No YAML files found in {UPSTREAM_DIR}", file=sys.stderr) + sys.exit(1) + + # Clean existing provider ConfigMap files + for old in OUTPUT_DIR.glob("provider-*.yaml"): + old.unlink() + + cm_files = [] + provider_ids = [] + + for filename in filenames: + if filename in EXCLUDE_FILES: + print(f"Skipping {filename} (excluded)") + continue + print(f"Processing 
{filename}...") + result = process_provider(filename, branch) + if result: + cm_file, provider_id = result + cm_files.append(cm_file) + provider_ids.append(provider_id) + + write_kustomization(cm_files) + + print(f"\nGenerated {len(cm_files)} provider ConfigMaps in {OUTPUT_DIR}/") + print(f"Provider IDs: {', '.join(provider_ids)}") + + +if __name__ == "__main__": + main()