diff --git a/api/evalhub/v1alpha1/evalhub_types.go b/api/evalhub/v1alpha1/evalhub_types.go index ed5f7c96b..6974d4f47 100644 --- a/api/evalhub/v1alpha1/evalhub_types.go +++ b/api/evalhub/v1alpha1/evalhub_types.go @@ -45,6 +45,12 @@ type EvalHubSpec struct { // +optional Env []corev1.EnvVar `json:"env,omitempty"` + // Providers is the list of OOTB provider names to mount into the deployment. + // Each name must match a provider-name label on a ConfigMap in the operator namespace. + // +kubebuilder:default:={"garak","guidellm","lighteval","lm-evaluation-harness"} + // +optional + Providers []string `json:"providers,omitempty"` + // Database configuration for persistent storage. // When set, the operator configures PostgreSQL via the referenced secret. // When omitted, the service uses its default (in-memory SQLite). diff --git a/api/evalhub/v1alpha1/zz_generated.deepcopy.go b/api/evalhub/v1alpha1/zz_generated.deepcopy.go index 896e60b62..2697d522f 100644 --- a/api/evalhub/v1alpha1/zz_generated.deepcopy.go +++ b/api/evalhub/v1alpha1/zz_generated.deepcopy.go @@ -115,6 +115,11 @@ func (in *EvalHubSpec) DeepCopyInto(out *EvalHubSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.Providers != nil { + in, out := &in.Providers, &out.Providers + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Database != nil { in, out := &in.Database, &out.Database *out = new(DatabaseSpec) diff --git a/config/base/kustomization.yaml b/config/base/kustomization.yaml index 549c25dce..db0f8179e 100644 --- a/config/base/kustomization.yaml +++ b/config/base/kustomization.yaml @@ -6,6 +6,7 @@ resources: - ../rbac - ../manager - ../prometheus + - ../configmaps commonLabels: app.kubernetes.io/part-of: trustyai @@ -56,3 +57,31 @@ vars: apiVersion: v1 fieldref: fieldpath: data.evalHubImage + - name: evalhub-provider-garak-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-garak-image + - name: evalhub-provider-guidellm-image + 
objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-guidellm-image + - name: evalhub-provider-lighteval-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-lighteval-image + - name: evalhub-provider-lm-evaluation-harness-image + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.evalhub-provider-lm-evaluation-harness-image diff --git a/config/base/params.env b/config/base/params.env index eef60e4b6..74bc8fef8 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -16,4 +16,8 @@ guardrails-orchestrator-image=quay.io/trustyai/ta-guardrails-orchestrator:latest guardrails-built-in-detector-image=quay.io/trustyai/guardrails-detector-built-in:latest guardrails-sidecar-gateway-image=quay.io/trustyai/guardrails-sidecar-gateway:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest -nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest \ No newline at end of file +nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/base/params.yaml b/config/base/params.yaml index 190afea08..fb2998ffd 100644 --- a/config/base/params.yaml +++ b/config/base/params.yaml @@ -2,3 +2,5 @@ varReference: - kind: Deployment path: spec/template/spec/containers[]/image + - kind: ConfigMap + path: data diff --git a/config/configmaps/evalhub/kustomization.yaml b/config/configmaps/evalhub/kustomization.yaml new file mode 100644 index 000000000..807e0ac8c --- /dev/null +++ b/config/configmaps/evalhub/kustomization.yaml @@ -0,0 +1,7 @@ 
+resources: + - provider-garak.yaml + - provider-guidellm.yaml + - provider-lighteval.yaml + - provider-lm-evaluation-harness.yaml + +namespace: system diff --git a/config/configmaps/evalhub/provider-garak.yaml b/config/configmaps/evalhub/provider-garak.yaml new file mode 100644 index 000000000..cdd97aa3f --- /dev/null +++ b/config/configmaps/evalhub/provider-garak.yaml @@ -0,0 +1,80 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-garak + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: garak +data: + garak.yaml: | + id: garak + name: Garak + description: LLM vulnerability scanner and red-teaming framework + type: builtin + runtime: + k8s: + image: $(evalhub-provider-garak-image) + entrypoint: + - python + - /opt/app-root/src/main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + memory_limit: 1Gi + env: + - name: VAR_NAME + value: VALUE + local: null + benchmarks: + - id: toxicity + name: Toxicity Detection + description: Tests model's tendency to generate toxic content + category: safety + metrics: + - toxicity_rate + - severity_score + num_few_shot: 0 + dataset_size: 500 + tags: + - safety + - toxicity + - red_team + - id: bias_detection + name: Bias Detection + description: Evaluates model for various forms of bias + category: fairness + metrics: + - bias_score + - demographic_parity + num_few_shot: 0 + dataset_size: 1000 + tags: + - fairness + - bias + - demographic + - id: pii_leakage + name: PII Leakage + description: Tests for personally identifiable information leakage + category: privacy + metrics: + - pii_leak_rate + - sensitivity_score + num_few_shot: 0 + dataset_size: 300 + tags: + - privacy + - pii + - security + - id: prompt_injection + name: Prompt Injection + description: Tests resilience against prompt injection attacks + category: security + metrics: + - injection_success_rate + - defense_effectiveness + num_few_shot: 0 + dataset_size: 200 + 
tags: + - security + - injection + - adversarial diff --git a/config/configmaps/evalhub/provider-guidellm.yaml b/config/configmaps/evalhub/provider-guidellm.yaml new file mode 100644 index 000000000..e86f5fa85 --- /dev/null +++ b/config/configmaps/evalhub/provider-guidellm.yaml @@ -0,0 +1,130 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-guidellm + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: guidellm +data: + guidellm.yaml: | + id: guidellm + name: GuideLLM + description: Performance benchmarking framework for LLM inference servers + type: builtin + runtime: + k8s: + image: $(evalhub-provider-guidellm-image) + entrypoint: + - python + - main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 1000m + memory_limit: 2Gi + local: null + benchmarks: + - id: sweep + name: Sweep Profile + description: Automatically finds optimal request rate by sweeping from low to high + concurrency + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - throughput + - latency + - guidellm + - auto + - id: throughput + name: Throughput Profile + description: Measures maximum throughput by gradually increasing request rate until + saturation + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - throughput + - guidellm + - saturation + - id: concurrent + name: Concurrent Profile + description: Tests performance with fixed number of concurrent requests + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - concurrency + - guidellm + - id: constant + name: Constant Rate Profile + description: Maintains constant request rate 
throughout benchmark duration + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - constant_rate + - guidellm + - id: poisson + name: Poisson Profile + description: Sends requests following Poisson distribution for realistic production + simulation + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - poisson + - realistic + - guidellm + - id: quick_perf_test + name: Quick Performance Test + description: Fast performance evaluation with sweep profile and limited samples + category: performance + metrics: + - requests_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - quick + - guidellm + - suite + - id: comprehensive_perf_test + name: Comprehensive Performance Test + description: Thorough performance evaluation across all profiles + category: performance + metrics: + - requests_per_second + - prompt_tokens_per_second + - output_tokens_per_second + - mean_ttft_ms + - mean_itl_ms + tags: + - performance + - comprehensive + - guidellm + - suite diff --git a/config/configmaps/evalhub/provider-lighteval.yaml b/config/configmaps/evalhub/provider-lighteval.yaml new file mode 100644 index 000000000..c4475a60c --- /dev/null +++ b/config/configmaps/evalhub/provider-lighteval.yaml @@ -0,0 +1,307 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-lighteval + labels: + trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: lighteval +data: + lighteval.yaml: | + id: lighteval + name: Lighteval + description: Lightweight LLM evaluation framework from Hugging Face + type: builtin + runtime: + k8s: + image: $(evalhub-provider-lighteval-image) + entrypoint: + - python + - main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + 
memory_limit: 1Gi + local: null + benchmarks: + - id: commonsense_reasoning + name: Commonsense Reasoning Suite + description: Suite of commonsense reasoning benchmarks (hellaswag, winogrande, openbookqa, + arc:easy) + category: reasoning + metrics: + - accuracy + - acc_norm + tags: + - reasoning + - commonsense + - lighteval + - suite + - id: scientific_reasoning + name: Scientific Reasoning Suite + description: Scientific reasoning benchmarks (arc:easy, arc:challenge) + category: reasoning + metrics: + - accuracy + - acc_norm + tags: + - reasoning + - science + - lighteval + - suite + - id: physical_commonsense + name: Physical Commonsense Suite + description: Physical commonsense reasoning (piqa) + category: reasoning + metrics: + - accuracy + tags: + - reasoning + - physical + - lighteval + - suite + - id: truthfulness + name: Truthfulness Suite + description: Truthfulness and hallucination benchmarks (truthfulqa:mc, truthfulqa:generation) + category: safety + metrics: + - mc1 + - mc2 + tags: + - safety + - truthfulness + - lighteval + - suite + - id: math + name: Math Suite + description: Mathematical reasoning benchmarks (gsm8k, math:algebra, math:counting_and_probability) + category: math + metrics: + - exact_match + - accuracy + tags: + - math + - reasoning + - lighteval + - suite + - id: knowledge + name: Knowledge Suite + description: Knowledge benchmarks (mmlu, triviaqa) + category: knowledge + metrics: + - accuracy + - acc_norm + tags: + - knowledge + - lighteval + - suite + - id: language_understanding + name: Language Understanding Suite + description: GLUE language understanding tasks (glue:cola, glue:sst2, glue:mrpc) + category: language_understanding + metrics: + - accuracy + - matthews_correlation + - f1 + tags: + - language_understanding + - glue + - lighteval + - suite + - id: hellaswag + name: HellaSwag + description: Commonsense reasoning around everyday activities + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + 
dataset_size: 10042 + tags: + - reasoning + - commonsense + - lighteval + - id: winogrande + name: Winogrande + description: Commonsense reasoning with pronoun resolution + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1267 + tags: + - reasoning + - commonsense + - lighteval + - id: openbookqa + name: OpenBookQA + description: Question answering with open book knowledge + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 500 + tags: + - knowledge + - qa + - lighteval + - id: arc:easy + name: ARC Easy + description: AI2 Reasoning Challenge - Easy subset + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 2376 + tags: + - reasoning + - science + - lighteval + - id: arc:challenge + name: ARC Challenge + description: AI2 Reasoning Challenge - Challenge subset + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 1172 + tags: + - reasoning + - science + - lighteval + - id: piqa + name: PIQA + description: Physical Interaction QA - physical commonsense reasoning + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - physical + - lighteval + - id: truthfulqa:mc + name: TruthfulQA MC + description: Measures truthfulness with multiple choice format + category: safety + metrics: + - mc1 + - mc2 + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - truthfulness + - lighteval + - id: truthfulqa:generation + name: TruthfulQA Generation + description: Measures truthfulness with generation format + category: safety + metrics: + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - truthfulness + - lighteval + - id: gsm8k + name: GSM8K + description: Grade School Math 8K - arithmetic reasoning + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 8 + dataset_size: 1319 + tags: + - math + - reasoning + - lighteval + - id: 
math:algebra + name: MATH Algebra + description: Mathematical reasoning - Algebra subset + category: math + metrics: + - accuracy + num_few_shot: 0 + tags: + - math + - algebra + - lighteval + - id: math:counting_and_probability + name: MATH Counting & Probability + description: Mathematical reasoning - Counting and Probability subset + category: math + metrics: + - accuracy + num_few_shot: 0 + tags: + - math + - probability + - lighteval + - id: mmlu + name: MMLU + description: Massive Multitask Language Understanding - 57 subjects + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 5 + dataset_size: 15908 + tags: + - knowledge + - multitask + - lighteval + - id: triviaqa + name: TriviaQA + description: Large-scale question answering dataset + category: knowledge + metrics: + - accuracy + - exact_match + num_few_shot: 0 + tags: + - knowledge + - qa + - lighteval + - id: glue:cola + name: GLUE CoLA + description: Corpus of Linguistic Acceptability + category: language_understanding + metrics: + - matthews_correlation + num_few_shot: 0 + tags: + - language_understanding + - glue + - lighteval + - id: glue:sst2 + name: GLUE SST-2 + description: Stanford Sentiment Treebank + category: language_understanding + metrics: + - accuracy + num_few_shot: 0 + tags: + - language_understanding + - glue + - sentiment + - lighteval + - id: glue:mrpc + name: GLUE MRPC + description: Microsoft Research Paraphrase Corpus + category: language_understanding + metrics: + - accuracy + - f1 + num_few_shot: 0 + tags: + - language_understanding + - glue + - paraphrase + - lighteval diff --git a/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml b/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml new file mode 100644 index 000000000..68d86d397 --- /dev/null +++ b/config/configmaps/evalhub/provider-lm-evaluation-harness.yaml @@ -0,0 +1,2027 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evalhub-provider-lm-evaluation-harness + labels: + 
trustyai.opendatahub.io/evalhub-provider-type: system + trustyai.opendatahub.io/evalhub-provider-name: lm-evaluation-harness +data: + lm_evaluation_harness.yaml: | + id: lm_evaluation_harness + name: LM Evaluation Harness + description: Comprehensive evaluation framework for language models with 167 benchmarks + type: builtin + runtime: + k8s: + image: $(evalhub-provider-lm-evaluation-harness-image) + entrypoint: + - /opt/app-root/bin/python + - /opt/app-root/src/main.py + cpu_request: 100m + memory_request: 128Mi + cpu_limit: 500m + memory_limit: 1Gi + env: + - name: VAR_NAME + value: VALUE + local: null + benchmarks: + - id: arc_easy + name: ARC Easy + description: ARC Easy evaluation benchmark - AI2 Reasoning Challenge (Easy) + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 2376 + tags: + - reasoning + - science + - lm_eval + - id: AraDiCE_boolq_lev + name: Aradice Boolq Lev + description: Aradice Boolq Lev evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - general + - lm_eval + - id: blimp + name: Blimp + description: Blimp evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_anaphor_gender_agreement + name: Blimp Anaphor Gender Agreement + description: Blimp Anaphor Gender Agreement evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_animate_subject_trans + name: Blimp Animate Subject Trans + description: Blimp Animate Subject Trans evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_coordinate_structure_constraint_complex_left_branch + name: Blimp Coordinate Structure Constraint Complex Left Branch + description: Blimp Coordinate Structure Constraint Complex Left 
Branch evaluation + benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_2 + name: Blimp Determiner Noun Agreement 2 + description: Blimp Determiner Noun Agreement 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_with_adj_2 + name: Blimp Determiner Noun Agreement With Adj 2 + description: Blimp Determiner Noun Agreement With Adj 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_determiner_noun_agreement_with_adjective_1 + name: Blimp Determiner Noun Agreement With Adjective 1 + description: Blimp Determiner Noun Agreement With Adjective 1 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_existential_there_object_raising + name: Blimp Existential There Object Raising + description: Blimp Existential There Object Raising evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_existential_there_subject_raising + name: Blimp Existential There Subject Raising + description: Blimp Existential There Subject Raising evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_intransitive + name: Blimp Intransitive + description: Blimp Intransitive evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_irregular_plural_subject_verb_agreement_1 + name: Blimp Irregular Plural Subject Verb Agreement 1 + description: Blimp Irregular Plural Subject Verb Agreement 1 
evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_left_branch_island_simple_question + name: Blimp Left Branch Island Simple Question + description: Blimp Left Branch Island Simple Question evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_npi_present_2 + name: Blimp Npi Present 2 + description: Blimp Npi Present 2 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: blimp_passive_1 + name: Blimp Passive 1 + description: Blimp Passive 1 evaluation benchmark + category: general + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - general + - lm_eval + - id: AraDiCE_ArabicMMLU_egy + name: Aradice Arabicmmlu Egy + description: Aradice Arabicmmlu Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_humanities_history_lev + name: Aradice Arabicmmlu High Humanities History Lev + description: Aradice Arabicmmlu High Humanities History Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_humanities_philosophy_egy + name: Aradice Arabicmmlu High Humanities Philosophy Egy + description: Aradice Arabicmmlu High Humanities Philosophy Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_high_language_arabic-language_lev + name: Aradice Arabicmmlu High Language Arabic-Language Lev + description: Aradice Arabicmmlu High Language Arabic-Language Lev evaluation benchmark + 
category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_lev + name: Aradice Arabicmmlu Lev + description: Aradice Arabicmmlu Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Middle Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Middle Humanities Islamic-Studies Egy evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_language_arabic-language_lev + name: Aradice Arabicmmlu Middle Language Arabic-Language Lev + description: Aradice Arabicmmlu Middle Language Arabic-Language Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Na Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Na Humanities Islamic-Studies Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_language_arabic-language-general_lev + name: Aradice Arabicmmlu Na Language Arabic-Language-General Lev + description: Aradice Arabicmmlu Na Language Arabic-Language-General Lev evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_other_driving-test_egy + name: Aradice Arabicmmlu Na Other Driving-Test Egy + description: Aradice Arabicmmlu Na Other Driving-Test Egy evaluation benchmark + category: 
knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_na_other_general-knowledge_lev + name: Aradice Arabicmmlu Na Other General-Knowledge Lev + description: Aradice Arabicmmlu Na Other General-Knowledge Lev evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_humanities_islamic-studies_egy + name: Aradice Arabicmmlu Primary Humanities Islamic-Studies Egy + description: Aradice Arabicmmlu Primary Humanities Islamic-Studies Egy evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_language_arabic-language_lev + name: Aradice Arabicmmlu Primary Language Arabic-Language Lev + description: Aradice Arabicmmlu Primary Language Arabic-Language Lev evaluation + benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_other_management_egy + name: Aradice Arabicmmlu Univ Other Management Egy + description: Aradice Arabicmmlu Univ Other Management Egy evaluation benchmark + category: knowledge + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - knowledge + - lm_eval + - id: AraDiCE_openbookqa_eng + name: Aradice Openbookqa Eng + description: Aradice Openbookqa Eng evaluation benchmark + category: knowledge + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 500 + tags: + - knowledge + - lm_eval + - id: arabic_leaderboard_arabic_mt_boolq + name: Arabic Leaderboard Arabic Mt Boolq + description: Arabic Leaderboard Arabic Mt Boolq evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + 
tags: + - multilingual + - lm_eval + - id: arabic_leaderboard_arabic_mt_boolq_light + name: Arabic Leaderboard Arabic Mt Boolq Light + description: Arabic Leaderboard Arabic Mt Boolq Light evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - multilingual + - lm_eval + - id: arabic_mt_boolq_light + name: Arabic Mt Boolq Light + description: Arabic Mt Boolq Light evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3270 + tags: + - multilingual + - lm_eval + - id: leaderboard_bbh_salient_translation_error_detection + name: Leaderboard Bbh Salient Translation Error Detection + description: Leaderboard Bbh Salient Translation Error Detection evaluation benchmark + category: multilingual + metrics: + - bleu + - chrf + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: aclue_ancient_chinese_culture + name: Aclue Ancient Chinese Culture + description: Aclue Ancient Chinese Culture evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: african_flores + name: African Flores + description: African Flores evaluation benchmark + category: multilingual + metrics: + - bleu + - chrf + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli-irokobench + name: Afrixnli-Irokobench + description: Afrixnli-Irokobench evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_amh_prompt_2 + name: Afrixnli Amh Prompt 2 + description: Afrixnli Amh Prompt 2 evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_amh_prompt_5 + name: Afrixnli Amh Prompt 5 + description: Afrixnli Amh Prompt 5 evaluation 
benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_ewe + name: Afrixnli En Direct Ewe + description: Afrixnli En Direct Ewe evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_ibo + name: Afrixnli En Direct Ibo + description: Afrixnli En Direct Ibo evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_lug + name: Afrixnli En Direct Lug + description: Afrixnli En Direct Lug evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_sot + name: Afrixnli En Direct Sot + description: Afrixnli En Direct Sot evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_wol + name: Afrixnli En Direct Wol + description: Afrixnli En Direct Wol evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: afrixnli_en_direct_zul + name: Afrixnli En Direct Zul + description: Afrixnli En Direct Zul evaluation benchmark + category: multilingual + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 2000 + tags: + - multilingual + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_stem_math_egy + name: Aradice Arabicmmlu Primary Stem Math Egy + description: Aradice Arabicmmlu Primary Stem Math Egy evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_college_mathematics_light + name: Arabic 
Leaderboard Arabic Mmlu College Mathematics Light + description: Arabic Leaderboard Arabic Mmlu College Mathematics Light evaluation + benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_high_school_mathematics + name: Arabic Leaderboard Arabic Mmlu High School Mathematics + description: Arabic Leaderboard Arabic Mmlu High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: cmmlu_college_mathematics + name: Cmmlu College Mathematics + description: Cmmlu College Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: cmmlu_high_school_mathematics + name: Cmmlu High School Mathematics + description: Cmmlu High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_am_high_school_mathematics + name: Global Mmlu Full Am High School Mathematics + description: Global Mmlu Full Am High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_ar_high_school_mathematics + name: Global Mmlu Full Ar High School Mathematics + description: Global Mmlu Full Ar High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_bn_high_school_mathematics + name: Global Mmlu Full Bn High School Mathematics + description: Global Mmlu Full Bn High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy 
+ num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_cs_high_school_mathematics + name: Global Mmlu Full Cs High School Mathematics + description: Global Mmlu Full Cs High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_de_high_school_mathematics + name: Global Mmlu Full De High School Mathematics + description: Global Mmlu Full De High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_el_high_school_mathematics + name: Global Mmlu Full El High School Mathematics + description: Global Mmlu Full El High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_en_high_school_mathematics + name: Global Mmlu Full En High School Mathematics + description: Global Mmlu Full En High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_es_high_school_mathematics + name: Global Mmlu Full Es High School Mathematics + description: Global Mmlu Full Es High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_fa_high_school_mathematics + name: Global Mmlu Full Fa High School Mathematics + description: Global Mmlu Full Fa High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: global_mmlu_full_fil_high_school_mathematics + name: Global 
Mmlu Full Fil High School Mathematics + description: Global Mmlu Full Fil High School Mathematics evaluation benchmark + category: math + metrics: + - exact_match + - accuracy + num_few_shot: 0 + dataset_size: 14042 + tags: + - math + - lm_eval + - id: AraDiCE_piqa_lev + name: Aradice Piqa Lev + description: Aradice Piqa Lev evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: AraDiCE_winogrande_eng + name: Aradice Winogrande Eng + description: Aradice Winogrande Eng evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1267 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_copa + name: Arabic Leaderboard Arabic Mt Copa + description: Arabic Leaderboard Arabic Mt Copa evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_copa_light + name: Arabic Leaderboard Arabic Mt Copa Light + description: Arabic Leaderboard Arabic Mt Copa Light evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_hellaswag + name: Arabic Leaderboard Arabic Mt Hellaswag + description: Arabic Leaderboard Arabic Mt Hellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_hellaswag_light + name: Arabic Leaderboard Arabic Mt Hellaswag Light + description: Arabic Leaderboard Arabic Mt Hellaswag Light evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_piqa + name: Arabic Leaderboard Arabic Mt Piqa + 
description: Arabic Leaderboard Arabic Mt Piqa evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_piqa_light + name: Arabic Leaderboard Arabic Mt Piqa Light + description: Arabic Leaderboard Arabic Mt Piqa Light evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: arabic_mt_hellaswag + name: Arabic Mt Hellaswag + description: Arabic Mt Hellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_mt_piqa + name: Arabic Mt Piqa + description: Arabic Mt Piqa evaluation benchmark + category: reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 1838 + tags: + - reasoning + - lm_eval + - id: copa_ar + name: Copa Ar + description: Copa Ar evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: copal_id_colloquial + name: Copal Id Colloquial + description: Copal Id Colloquial evaluation benchmark + category: reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 500 + tags: + - reasoning + - lm_eval + - id: darijahellaswag + name: Darijahellaswag + description: Darijahellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: egyhellaswag + name: Egyhellaswag + description: Egyhellaswag evaluation benchmark + category: reasoning + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: hellaswag_ar + name: Hellaswag Ar + description: Hellaswag Ar evaluation benchmark + category: reasoning 
+ metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 10042 + tags: + - reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mt_race + name: Arabic Leaderboard Arabic Mt Race + description: Arabic Leaderboard Arabic Mt Race evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: arabic_leaderboard_arabic_mt_race_light + name: Arabic Leaderboard Arabic Mt Race Light + description: Arabic Leaderboard Arabic Mt Race Light evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: arabic_mt_race_light + name: Arabic Mt Race Light + description: Arabic Mt Race Light evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 674 + tags: + - reading_comprehension + - lm_eval + - id: blimp_drop_argument + name: Blimp Drop Argument + description: Blimp Drop Argument evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 9536 + tags: + - reading_comprehension + - lm_eval + - id: bigbench_gre_reading_comprehension_multiple_choice + name: Bigbench Gre Reading Comprehension Multiple Choice + description: Bigbench Gre Reading Comprehension Multiple Choice evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: eus_reading + name: Eus Reading + description: Eus Reading evaluation benchmark + category: reading_comprehension + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: longbench_qasper + name: Longbench Qasper + description: Longbench Qasper evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + 
- rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: qasper_freeform + name: Qasper Freeform + description: Qasper Freeform evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: ruler_qa_squad + name: Ruler Qa Squad + description: Ruler Qa Squad evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: scrolls_qasper + name: Scrolls Qasper + description: Scrolls Qasper evaluation benchmark + category: reading_comprehension + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 3000 + tags: + - reading_comprehension + - lm_eval + - id: AraDiCE_ArabicMMLU_high_social-science_economics_egy + name: Aradice Arabicmmlu High Social-Science Economics Egy + description: Aradice Arabicmmlu High Social-Science Economics Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_social-science_geography_lev + name: Aradice Arabicmmlu High Social-Science Geography Lev + description: Aradice Arabicmmlu High Social-Science Geography Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_stem_computer-science_egy + name: Aradice Arabicmmlu High Stem Computer-Science Egy + description: Aradice Arabicmmlu High Stem Computer-Science Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_high_stem_physics_lev + name: Aradice Arabicmmlu High Stem Physics Lev + 
description: Aradice Arabicmmlu High Stem Physics Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_civics_egy + name: Aradice Arabicmmlu Middle Social-Science Civics Egy + description: Aradice Arabicmmlu Middle Social-Science Civics Egy evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_economics_lev + name: Aradice Arabicmmlu Middle Social-Science Economics Lev + description: Aradice Arabicmmlu Middle Social-Science Economics Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_social-science_social-science_egy + name: Aradice Arabicmmlu Middle Social-Science Social-Science Egy + description: Aradice Arabicmmlu Middle Social-Science Social-Science Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_middle_stem_computer-science_lev + name: Aradice Arabicmmlu Middle Stem Computer-Science Lev + description: Aradice Arabicmmlu Middle Stem Computer-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_social-science_geography_egy + name: Aradice Arabicmmlu Primary Social-Science Geography Egy + description: Aradice Arabicmmlu Primary Social-Science Geography Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: 
AraDiCE_ArabicMMLU_primary_social-science_social-science_lev + name: Aradice Arabicmmlu Primary Social-Science Social-Science Lev + description: Aradice Arabicmmlu Primary Social-Science Social-Science Lev evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_primary_stem_natural-science_lev + name: Aradice Arabicmmlu Primary Stem Natural-Science Lev + description: Aradice Arabicmmlu Primary Stem Natural-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_social-science_accounting_lev + name: Aradice Arabicmmlu Univ Social-Science Accounting Lev + description: Aradice Arabicmmlu Univ Social-Science Accounting Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_social-science_political-science_egy + name: Aradice Arabicmmlu Univ Social-Science Political-Science Egy + description: Aradice Arabicmmlu Univ Social-Science Political-Science Egy evaluation + benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: AraDiCE_ArabicMMLU_univ_stem_computer-science_lev + name: Aradice Arabicmmlu Univ Stem Computer-Science Lev + description: Aradice Arabicmmlu Univ Stem Computer-Science Lev evaluation benchmark + category: science + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_college_biology_light + name: Arabic Leaderboard Arabic Mmlu College Biology Light + description: Arabic Leaderboard Arabic Mmlu College Biology Light evaluation benchmark + category: science + metrics: + - accuracy + - 
acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - science + - lm_eval + - id: agieval_logiqa_zh + name: Agieval Logiqa Zh + description: Agieval Logiqa Zh evaluation benchmark + category: logic_reasoning + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 651 + tags: + - logic_reasoning + - lm_eval + - id: bbh + name: Bbh + description: Bbh evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot + name: Bbh Cot Fewshot + description: Bbh Cot Fewshot evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_causal_judgement + name: Bbh Cot Fewshot Causal Judgement + description: Bbh Cot Fewshot Causal Judgement evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_dyck_languages + name: Bbh Cot Fewshot Dyck Languages + description: Bbh Cot Fewshot Dyck Languages evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_hyperbaton + name: Bbh Cot Fewshot Hyperbaton + description: Bbh Cot Fewshot Hyperbaton evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_logical_deduction_three_objects + name: Bbh Cot Fewshot Logical Deduction Three Objects + description: Bbh Cot Fewshot Logical Deduction Three Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_navigate + name: Bbh Cot Fewshot Navigate + description: Bbh 
Cot Fewshot Navigate evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_reasoning_about_colored_objects + name: Bbh Cot Fewshot Reasoning About Colored Objects + description: Bbh Cot Fewshot Reasoning About Colored Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_snarks + name: Bbh Cot Fewshot Snarks + description: Bbh Cot Fewshot Snarks evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_tracking_shuffled_objects_five_objects + name: Bbh Cot Fewshot Tracking Shuffled Objects Five Objects + description: Bbh Cot Fewshot Tracking Shuffled Objects Five Objects evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_fewshot_web_of_lies + name: Bbh Cot Fewshot Web Of Lies + description: Bbh Cot Fewshot Web Of Lies evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 5 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot + name: Bbh Cot Zeroshot + description: Bbh Cot Zeroshot evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot_causal_judgement + name: Bbh Cot Zeroshot Causal Judgement + description: Bbh Cot Zeroshot Causal Judgement evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: bbh_cot_zeroshot_dyck_languages + name: Bbh Cot Zeroshot Dyck Languages + description: Bbh Cot Zeroshot 
Dyck Languages evaluation benchmark + category: logic_reasoning + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - logic_reasoning + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_anatomy + name: Arabic Leaderboard Arabic Mmlu Anatomy + description: Arabic Leaderboard Arabic Mmlu Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_clinical_knowledge + name: Arabic Leaderboard Arabic Mmlu Clinical Knowledge + description: Arabic Leaderboard Arabic Mmlu Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_medical_genetics + name: Arabic Leaderboard Arabic Mmlu Medical Genetics + description: Arabic Leaderboard Arabic Mmlu Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: arabic_leaderboard_arabic_mmlu_professional_medicine + name: Arabic Leaderboard Arabic Mmlu Professional Medicine + description: Arabic Leaderboard Arabic Mmlu Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: cmmlu_professional_medicine + name: Cmmlu Professional Medicine + description: Cmmlu Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: cmmlu_traditional_chinese_medicine + name: Cmmlu Traditional Chinese Medicine + description: Cmmlu Traditional Chinese Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical 
+ - lm_eval + - id: global_mmlu_full_am_anatomy + name: Global Mmlu Full Am Anatomy + description: Global Mmlu Full Am Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_clinical_knowledge + name: Global Mmlu Full Am Clinical Knowledge + description: Global Mmlu Full Am Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_medical_genetics + name: Global Mmlu Full Am Medical Genetics + description: Global Mmlu Full Am Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_am_professional_medicine + name: Global Mmlu Full Am Professional Medicine + description: Global Mmlu Full Am Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_anatomy + name: Global Mmlu Full Ar Anatomy + description: Global Mmlu Full Ar Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_clinical_knowledge + name: Global Mmlu Full Ar Clinical Knowledge + description: Global Mmlu Full Ar Clinical Knowledge evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_medical_genetics + name: Global Mmlu Full Ar Medical Genetics + description: Global Mmlu Full Ar Medical Genetics evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + 
tags: + - medical + - lm_eval + - id: global_mmlu_full_ar_professional_medicine + name: Global Mmlu Full Ar Professional Medicine + description: Global Mmlu Full Ar Professional Medicine evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: global_mmlu_full_bn_anatomy + name: Global Mmlu Full Bn Anatomy + description: Global Mmlu Full Bn Anatomy evaluation benchmark + category: medical + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 14042 + tags: + - medical + - lm_eval + - id: lambada_openai + name: Lambada Openai + description: Lambada Openai evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_en + name: Lambada Openai Mt En + description: Lambada Openai Mt En evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_it + name: Lambada Openai Mt It + description: Lambada Openai Mt It evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_stablelm_es + name: Lambada Openai Mt Stablelm Es + description: Lambada Openai Mt Stablelm Es evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_openai_mt_stablelm_nl + name: Lambada Openai Mt Stablelm Nl + description: Lambada Openai Mt Stablelm Nl evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: lambada_standard_cloze_yaml 
+ name: Lambada Standard Cloze Yaml + description: Lambada Standard Cloze Yaml evaluation benchmark + category: language_modeling + metrics: + - perplexity + - accuracy + num_few_shot: 0 + dataset_size: 5153 + tags: + - language_modeling + - lm_eval + - id: paloma_wikitext_103 + name: Paloma Wikitext 103 + description: Paloma Wikitext 103 evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 4358 + tags: + - language_modeling + - lm_eval + - id: pile_arxiv + name: Pile Arxiv + description: Pile Arxiv evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_freelaw + name: Pile Freelaw + description: Pile Freelaw evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_hackernews + name: Pile Hackernews + description: Pile Hackernews evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_openwebtext2 + name: Pile Openwebtext2 + description: Pile Openwebtext2 evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_ubuntu-irc + name: Pile Ubuntu-Irc + description: Pile Ubuntu-Irc evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: pile_youtubesubtitles + name: Pile Youtubesubtitles + description: Pile Youtubesubtitles evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 210000000 + tags: + - language_modeling + - lm_eval + - id: wikitext + name: Wikitext + description: Wikitext 
evaluation benchmark + category: language_modeling + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 4358 + tags: + - language_modeling + - lm_eval + - id: careqa_open_perplexity + name: Careqa Open Perplexity + description: Careqa Open Perplexity evaluation benchmark + category: language_modeling + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 10000 + tags: + - language_modeling + - lm_eval + - id: AraDiCE_truthfulqa_mc1_lev + name: Aradice Truthfulqa Mc1 Lev + description: Aradice Truthfulqa Mc1 Lev evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: metabench_truthfulqa_permute + name: Metabench Truthfulqa Permute + description: Metabench Truthfulqa Permute evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nno_p0 + name: Nortruthfulqa Gen Nno P0 + description: Nortruthfulqa Gen Nno P0 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nno_p3 + name: Nortruthfulqa Gen Nno P3 + description: Nortruthfulqa Gen Nno P3 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nob_p1 + name: Nortruthfulqa Gen Nob P1 + description: Nortruthfulqa Gen Nob P1 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_gen_nob_p4 + name: Nortruthfulqa Gen Nob P4 + description: Nortruthfulqa Gen Nob P4 evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + 
- lm_eval + - id: nortruthfulqa_mc_nno_p2 + name: Nortruthfulqa Mc Nno P2 + description: Nortruthfulqa Mc Nno P2 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_mc_nob_p0 + name: Nortruthfulqa Mc Nob P0 + description: Nortruthfulqa Mc Nob P0 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: nortruthfulqa_mc_nob_p3 + name: Nortruthfulqa Mc Nob P3 + description: Nortruthfulqa Mc Nob P3 evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: tinyTruthfulQA + name: Tinytruthfulqa + description: Tinytruthfulqa evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_gen_ca + name: Truthfulqa-Multi Gen Ca + description: Truthfulqa-Multi Gen Ca evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_gen_eu + name: Truthfulqa-Multi Gen Eu + description: Truthfulqa-Multi Gen Eu evaluation benchmark + category: safety + metrics: + - mc1 + - mc2 + - bleu + - rouge + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_mc1_en + name: Truthfulqa-Multi Mc1 En + description: Truthfulqa-Multi Mc1 En evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: truthfulqa-multi_mc1_gl + name: Truthfulqa-Multi Mc1 Gl + description: Truthfulqa-Multi Mc1 Gl evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - 
lm_eval + - id: truthfulqa-multi_mc2_es + name: Truthfulqa-Multi Mc2 Es + description: Truthfulqa-Multi Mc2 Es evaluation benchmark + category: safety + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 817 + tags: + - safety + - lm_eval + - id: bigbench_code_line_description_multiple_choice + name: Bigbench Code Line Description Multiple Choice + description: Bigbench Code Line Description Multiple Choice evaluation benchmark + category: code + metrics: + - accuracy + - acc_norm + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: ceval-valid_college_programming + name: Ceval-Valid College Programming + description: Ceval-Valid College Programming evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: code2text_javascript + name: Code2Text Javascript + description: Code2Text Javascript evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: code2text_ruby + name: Code2Text Ruby + description: Code2Text Ruby evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: humaneval + name: Humaneval + description: Humaneval evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: humaneval_instruct + name: Humaneval Instruct + description: Humaneval Instruct evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval + - id: mbpp + name: MBPP + description: MBPP (Most Basic Python Programming) evaluation benchmark + category: code + metrics: + - accuracy + num_few_shot: 0 + dataset_size: 1000 + tags: + - code + - lm_eval diff --git a/config/configmaps/kustomization.yaml b/config/configmaps/kustomization.yaml index 424a7733f..8beaf6c86 100644 
--- a/config/configmaps/kustomization.yaml +++ b/config/configmaps/kustomization.yaml @@ -1,4 +1,5 @@ resources: - lmeval.yaml + - evalhub/ namespace: system diff --git a/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml b/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml index 0bd2d4b4f..d3838cf43 100644 --- a/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml +++ b/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml @@ -180,6 +180,18 @@ spec: - name type: object type: array + providers: + default: + - garak + - guidellm + - lighteval + - lm-evaluation-harness + description: |- + Providers is the list of OOTB provider names to mount into the deployment. + Each name must match a provider-name label on a ConfigMap in the operator namespace. + items: + type: string + type: array replicas: default: 1 description: Number of replicas for the eval-hub deployment diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index a25946b6c..5787df1cc 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -18,3 +18,7 @@ guardrails-sidecar-gateway-image=quay.io/opendatahub/vllm-orchestrator-gateway:l ragas-provider-image=quay.io/opendatahub/llama-stack-provider-ragas:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/overlays/rhoai/params.env b/config/overlays/rhoai/params.env index dee252fe0..2f7360c95 100644 --- a/config/overlays/rhoai/params.env +++ b/config/overlays/rhoai/params.env @@ -18,3 +18,7 @@ 
guardrails-sidecar-gateway-image=quay.io/trustyai/guardrails-sidecar-gateway:lat ragas-provider-image=quay.io/trustyai/llama-stack-provider-ragas:latest garak-provider-image=quay.io/trustyai/llama-stack-provider-trustyai-garak:latest nemo-guardrails-image=quay.io/trustyai/nemo-guardrails-server:latest +evalhub-provider-garak-image=quay.io/evalhub/garak:latest +evalhub-provider-guidellm-image=quay.io/evalhub/community-guidellm:latest +evalhub-provider-lighteval-image=quay.io/evalhub/community-lighteval:latest +evalhub-provider-lm-evaluation-harness-image=quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2 \ No newline at end of file diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 10e4c9d34..3dc23ef20 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -9,14 +9,14 @@ resources: - auth_proxy_role_binding.yaml - auth_proxy_client_clusterrole.yaml - evalhub/evalhub_auth_reviewer_role.yaml - - evalhub/evalhub_jobs_writer_role.yaml - - evalhub/evalhub_jobs_writer_binding.yaml - - evalhub/evalhub_job_config_role.yaml - evalhub/evalhub_job_config_binding.yaml - - evalhub/evalhub_mlflow_access_role.yaml + - evalhub/evalhub_job_config_role.yaml + - evalhub/evalhub_jobs_writer_binding.yaml + - evalhub/evalhub_jobs_writer_role.yaml - evalhub/evalhub_mlflow_access_binding.yaml - - evalhub/evalhub_mlflow_jobs_role.yaml + - evalhub/evalhub_mlflow_access_role.yaml - evalhub/evalhub_mlflow_jobs_binding.yaml + - evalhub/evalhub_mlflow_jobs_role.yaml - nemoguardrail_editor_role.yaml - nemoguardrail_viewer_role.yaml - trustyaiservice_editor_role.yaml diff --git a/controllers/evalhub/build_test.go b/controllers/evalhub/build_test.go index 711523c77..26e740535 100644 --- a/controllers/evalhub/build_test.go +++ b/controllers/evalhub/build_test.go @@ -68,7 +68,7 @@ func TestBuildDeploymentSpec(t *testing.T) { } t.Run("should build correct deployment spec", func(t *testing.T) { - deploymentSpec, err := 
reconciler.buildDeploymentSpec(ctx, evalHub) + deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHub, nil) require.NoError(t, err) // Check replicas @@ -183,7 +183,7 @@ func TestBuildDeploymentSpec(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - deploymentSpec, err := reconcilerNoConfig.buildDeploymentSpec(ctx, evalHub) + deploymentSpec, err := reconcilerNoConfig.buildDeploymentSpec(ctx, evalHub, nil) require.Error(t, err) // Should return empty deployment spec (zero value) on error assert.Equal(t, appsv1.DeploymentSpec{}, deploymentSpec) @@ -201,7 +201,7 @@ func TestBuildDeploymentSpec(t *testing.T) { }, } - deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHubNoReplicas) + deploymentSpec, err := reconciler.buildDeploymentSpec(ctx, evalHubNoReplicas, nil) require.NoError(t, err) // Should use default replicas (1) diff --git a/controllers/evalhub/configmap.go b/controllers/evalhub/configmap.go index d121e582d..ce26c2e9f 100644 --- a/controllers/evalhub/configmap.go +++ b/controllers/evalhub/configmap.go @@ -16,34 +16,37 @@ import ( "sigs.k8s.io/yaml" ) -// ProviderConfig represents the provider configuration structure -type ProviderConfig struct { - Name string `yaml:"name"` - Type string `yaml:"type"` - Enabled bool `yaml:"enabled"` - Benchmarks []string `yaml:"benchmarks,omitempty"` - Config map[string]string `yaml:"config,omitempty"` +// ServiceConfig represents the service section in config.yaml +type ServiceConfig struct { + Port int `json:"port"` + ReadyFile string `json:"ready_file"` + TerminationFile string `json:"termination_file"` } // DatabaseConfig represents the database configuration in config.yaml type DatabaseConfig struct { - Driver string `yaml:"driver"` - MaxOpenConns int `yaml:"max_open_conns,omitempty"` - MaxIdleConns int `yaml:"max_idle_conns,omitempty"` + Driver string `json:"driver"` + URL string `json:"url,omitempty"` + MaxOpenConns int `json:"max_open_conns,omitempty"` + MaxIdleConns int 
`json:"max_idle_conns,omitempty"` } // SecretsMapping represents the secrets mapping configuration in config.yaml type SecretsMapping struct { - Dir string `yaml:"dir"` - Mappings map[string]string `yaml:"mappings"` + Dir string `json:"dir"` + Mappings map[string]string `json:"mappings"` } +// EnvMappings maps environment variable names to config field paths +type EnvMappings map[string]string + // EvalHubConfig represents the eval-hub configuration structure type EvalHubConfig struct { - Providers []ProviderConfig `yaml:"providers"` - Collections []string `yaml:"collections,omitempty"` - Database *DatabaseConfig `yaml:"database,omitempty"` - Secrets *SecretsMapping `yaml:"secrets,omitempty"` + Service ServiceConfig `json:"service"` + Secrets *SecretsMapping `json:"secrets,omitempty"` + EnvMappings EnvMappings `json:"env_mappings"` + Database *DatabaseConfig `json:"database"` + Prometheus map[string]any `json:"prometheus,omitempty"` } // reconcileConfigMap creates or updates the ConfigMap for EvalHub configuration @@ -91,69 +94,30 @@ func (r *EvalHubReconciler) reconcileConfigMap(ctx context.Context, instance *ev // generateConfigData generates the configuration data for the ConfigMap func (r *EvalHubReconciler) generateConfigData(instance *evalhubv1alpha1.EvalHub) (map[string]string, error) { config := EvalHubConfig{ - Providers: make([]ProviderConfig, 0), - Collections: []string{}, - } - - // Default providers configuration set by the controller - config.Providers = []ProviderConfig{ - { - Name: "lm-eval-harness", - Type: "lm_evaluation_harness", - Enabled: true, - Benchmarks: []string{ - "arc_challenge", "hellaswag", "mmlu", "truthfulqa", - }, - Config: map[string]string{ - "batch_size": "8", - "max_length": "2048", - }, + Service: ServiceConfig{ + Port: containerPort, + ReadyFile: "/tmp/repo-ready", + TerminationFile: "/tmp/termination-log", }, - { - Name: "ragas-provider", - Type: "ragas", - Enabled: true, - Benchmarks: []string{ - "faithfulness", 
"answer_relevancy", "context_precision", "context_recall", - }, - Config: map[string]string{ - "llm_model": "gpt-3.5-turbo", - "embeddings_model": "text-embedding-ada-002", - }, + EnvMappings: EnvMappings{ + "PORT": "service.port", + "DB_URL": "database.url", + "MLFLOW_TRACKING_URI": "mlflow.tracking_uri", + "MLFLOW_CA_CERT_PATH": "mlflow.ca_cert_path", + "MLFLOW_INSECURE_SKIP_VERIFY": "mlflow.insecure_skip_verify", + "MLFLOW_TOKEN_PATH": "mlflow.token_path", + "MLFLOW_WORKSPACE": "mlflow.workspace", }, - { - Name: "garak-security", - Type: "garak", - Enabled: false, - Benchmarks: []string{ - "encoding", "injection", "malware", "prompt_injection", - }, - Config: map[string]string{ - "probe_set": "basic", - }, + Database: &DatabaseConfig{ + Driver: "sqlite", + URL: "file::eval_hub:?mode=memory&cache=shared", }, - { - Name: "trustyai-custom", - Type: "trustyai_custom", - Enabled: true, - Benchmarks: []string{ - "bias_detection", "fairness_metrics", - }, - Config: map[string]string{ - "bias_threshold": "0.1", - }, + Prometheus: map[string]any{ + "enabled": true, }, } - // Default collections - config.Collections = []string{ - "healthcare_safety_v1", - "automotive_safety_v1", - "finance_compliance_v1", - "general_llm_eval_v1", - } - - // Conditionally add database configuration + // Override database configuration when explicitly configured if instance.Spec.IsDatabaseConfigured() { maxOpen, maxIdle := dbDefaultMaxOpen, dbDefaultMaxIdle if instance.Spec.Database.MaxOpenConns > 0 { @@ -179,31 +143,11 @@ func (r *EvalHubReconciler) generateConfigData(instance *evalhubv1alpha1.EvalHub return nil, err } - // Generate providers.yaml content - providersYAML, err := r.generateProvidersYAML(config.Providers) - if err != nil { - return nil, err - } - return map[string]string{ - "config.yaml": string(configYAML), - "providers.yaml": providersYAML, + "config.yaml": string(configYAML), }, nil } -// generateProvidersYAML generates the providers.yaml configuration -func (r 
*EvalHubReconciler) generateProvidersYAML(providers []ProviderConfig) (string, error) { - providersData := make(map[string]interface{}) - providersData["providers"] = providers - - yamlData, err := yaml.Marshal(providersData) - if err != nil { - return "", err - } - - return string(yamlData), nil -} - // getImageFromConfigMap gets a required image value from the operator's ConfigMap // Returns error if ConfigMap is not found, key is missing, or value is empty // This ensures explicit configuration and prevents deployment with unconfigured images @@ -373,6 +317,95 @@ func (r *EvalHubReconciler) generateProxyConfigData(instance *evalhubv1alpha1.Ev } } +// reconcileProviderConfigMaps copies provider ConfigMaps from the operator namespace to the +// EvalHub CR's namespace. Only providers listed in instance.Spec.Providers are copied. +// Each source ConfigMap is discovered by the labels: +// - trustyai.opendatahub.io/evalhub-provider-type=system +// - trustyai.opendatahub.io/evalhub-provider-name=<provider-name> +// +// Returns the list of created ConfigMap names (for building projected volumes).
+func (r *EvalHubReconciler) reconcileProviderConfigMaps(ctx context.Context, instance *evalhubv1alpha1.EvalHub) ([]string, error) { + if len(instance.Spec.Providers) == 0 { + return nil, nil + } + + log := log.FromContext(ctx) + log.Info("Reconciling Provider ConfigMaps", "instance", instance.Name, "providers", instance.Spec.Providers) + + var cmNames []string + for _, providerName := range instance.Spec.Providers { + // Look up the source ConfigMap by both labels + var sourceList corev1.ConfigMapList + if err := r.List(ctx, &sourceList, + client.InNamespace(r.Namespace), + client.MatchingLabels{ + providerLabel: "system", + providerNameLabel: providerName, + }); err != nil { + return nil, fmt.Errorf("failed to list provider ConfigMaps for %q in namespace %s: %w", providerName, r.Namespace, err) + } + if len(sourceList.Items) == 0 { + return nil, fmt.Errorf("provider %q not found: no ConfigMap with label %s=%s in namespace %s", + providerName, providerNameLabel, providerName, r.Namespace) + } + + src := &sourceList.Items[0] + targetName := instance.Name + "-provider-" + providerName + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: targetName, + Namespace: instance.Namespace, + }, + } + + // Check if ConfigMap already exists + getErr := r.Get(ctx, client.ObjectKeyFromObject(configMap), configMap) + if getErr != nil && !errors.IsNotFound(getErr) { + return nil, getErr + } + + if errors.IsNotFound(getErr) { + configMap.Data = src.Data + if instance.UID != "" { + if err := controllerutil.SetControllerReference(instance, configMap, r.Scheme); err != nil { + return nil, err + } + } + log.Info("Creating Provider ConfigMap", "name", targetName, "provider", providerName) + if err := r.Create(ctx, configMap); err != nil { + return nil, err + } + } else { + configMap.Data = src.Data + log.Info("Updating Provider ConfigMap", "name", targetName, "provider", providerName) + if err := r.Update(ctx, configMap); err != nil { + return nil, err + } + } + 
+ cmNames = append(cmNames, targetName) + } + + return cmNames, nil +} + +// providerVolumeProjections builds VolumeProjection entries for mounting provider ConfigMaps +// into a single projected volume. +func providerVolumeProjections(cmNames []string) []corev1.VolumeProjection { + var projections []corev1.VolumeProjection + for _, name := range cmNames { + projections = append(projections, corev1.VolumeProjection{ + ConfigMap: &corev1.ConfigMapProjection{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: name, + }, + }, + }) + } + return projections +} + // reconcileServiceCAConfigMap creates or updates the ConfigMap for service CA certificate injection // This ConfigMap is used by jobs to mount the service CA certificate for TLS verification func (r *EvalHubReconciler) reconcileServiceCAConfigMap(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { diff --git a/controllers/evalhub/configmap_test.go b/controllers/evalhub/configmap_test.go index 43d4faac5..fc8d146cc 100644 --- a/controllers/evalhub/configmap_test.go +++ b/controllers/evalhub/configmap_test.go @@ -88,7 +88,6 @@ var _ = Describe("EvalHub ConfigMap", func() { By("Checking required keys exist") Expect(configMap.Data).To(HaveKey("config.yaml")) - Expect(configMap.Data).To(HaveKey("providers.yaml")) }) It("should have valid YAML configuration", func() { @@ -103,173 +102,6 @@ var _ = Describe("EvalHub ConfigMap", func() { var config EvalHubConfig err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - - By("Checking default providers are present") - Expect(config.Providers).To(HaveLen(4)) - - var providerNames []string - for _, provider := range config.Providers { - providerNames = append(providerNames, provider.Name) - } - Expect(providerNames).To(ContainElements( - "lm-eval-harness", "ragas-provider", "garak-security", "trustyai-custom", - )) - - By("Checking default collections are present") - 
Expect(config.Collections).To(ContainElements( - "healthcare_safety_v1", "automotive_safety_v1", - "finance_compliance_v1", "general_llm_eval_v1", - )) - }) - - It("should have valid providers.yaml", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing providers.yaml") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configMap.Data["providers.yaml"]), &providersData) - Expect(err).NotTo(HaveOccurred()) - - By("Checking providers structure") - Expect(providersData).To(HaveKey("providers")) - providers, ok := providersData["providers"].([]interface{}) - Expect(ok).To(BeTrue()) - Expect(providers).To(HaveLen(4)) - }) - - It("should configure lm-eval-harness provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding lm-eval-harness provider") - var lmEvalProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "lm-eval-harness" { - lmEvalProvider = &provider - break - } - } - Expect(lmEvalProvider).NotTo(BeNil()) - - By("Checking lm-eval-harness configuration") - Expect(lmEvalProvider.Type).To(Equal("lm_evaluation_harness")) - Expect(lmEvalProvider.Enabled).To(BeTrue()) - Expect(lmEvalProvider.Benchmarks).To(ContainElements( - "arc_challenge", "hellaswag", "mmlu", "truthfulqa", - )) - Expect(lmEvalProvider.Config["batch_size"]).To(Equal("8")) - Expect(lmEvalProvider.Config["max_length"]).To(Equal("2048")) - }) - - It("should configure ragas provider correctly", 
func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding ragas provider") - var ragasProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "ragas-provider" { - ragasProvider = &provider - break - } - } - Expect(ragasProvider).NotTo(BeNil()) - - By("Checking ragas configuration") - Expect(ragasProvider.Type).To(Equal("ragas")) - Expect(ragasProvider.Enabled).To(BeTrue()) - Expect(ragasProvider.Benchmarks).To(ContainElements( - "faithfulness", "answer_relevancy", "context_precision", "context_recall", - )) - Expect(ragasProvider.Config["llm_model"]).To(Equal("gpt-3.5-turbo")) - Expect(ragasProvider.Config["embeddings_model"]).To(Equal("text-embedding-ada-002")) - }) - - It("should configure garak security provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding garak provider") - var garakProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "garak-security" { - garakProvider = &provider - break - } - } - Expect(garakProvider).NotTo(BeNil()) - - By("Checking garak configuration") - Expect(garakProvider.Type).To(Equal("garak")) - Expect(garakProvider.Enabled).To(BeFalse()) // Disabled by default - Expect(garakProvider.Benchmarks).To(ContainElements( - "encoding", "injection", "malware", 
"prompt_injection", - )) - Expect(garakProvider.Config["probe_set"]).To(Equal("basic")) - }) - - It("should configure trustyai custom provider correctly", func() { - By("Reconciling configmap") - err := reconciler.reconcileConfigMap(ctx, evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Getting configmap") - configMap := waitForConfigMap(evalHubName+"-config", testNamespace) - - By("Parsing config.yaml") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Finding trustyai provider") - var trustyaiProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "trustyai-custom" { - trustyaiProvider = &provider - break - } - } - Expect(trustyaiProvider).NotTo(BeNil()) - - By("Checking trustyai configuration") - Expect(trustyaiProvider.Type).To(Equal("trustyai_custom")) - Expect(trustyaiProvider.Enabled).To(BeTrue()) - Expect(trustyaiProvider.Benchmarks).To(ContainElements( - "bias_detection", "fairness_metrics", - )) - Expect(trustyaiProvider.Config["bias_threshold"]).To(Equal("0.1")) }) It("should update existing configmap", func() { @@ -381,7 +213,7 @@ var _ = Describe("EvalHub ConfigMap", func() { Expect(config.Secrets.Mappings).To(HaveKeyWithValue("db-url", "database.url")) }) - It("should omit database and secrets sections when database is not configured", func() { + It("should default to sqlite when database is not explicitly configured", func() { By("Reconciling configmap for standard EvalHub (no DB)") err := reconciler.reconcileConfigMap(ctx, evalHub) Expect(err).NotTo(HaveOccurred()) @@ -399,8 +231,9 @@ var _ = Describe("EvalHub ConfigMap", func() { err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - By("Checking database and secrets are absent") - Expect(config.Database).To(BeNil()) + By("Checking default sqlite database is set and secrets are absent") + 
Expect(config.Database).NotTo(BeNil()) + Expect(config.Database.Driver).To(Equal("sqlite")) Expect(config.Secrets).To(BeNil()) }) }) @@ -413,44 +246,11 @@ var _ = Describe("EvalHub ConfigMap", func() { By("Checking required keys are present") Expect(configData).To(HaveKey("config.yaml")) - Expect(configData).To(HaveKey("providers.yaml")) By("Validating config.yaml content") var config EvalHubConfig err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) Expect(err).NotTo(HaveOccurred()) - Expect(config.Providers).To(HaveLen(4)) - Expect(config.Collections).To(HaveLen(4)) - - By("Validating providers.yaml content") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configData["providers.yaml"]), &providersData) - Expect(err).NotTo(HaveOccurred()) - Expect(providersData).To(HaveKey("providers")) - }) - - It("should generate providers YAML correctly", func() { - By("Generating configuration data") - configData, err := reconciler.generateConfigData(evalHub) - Expect(err).NotTo(HaveOccurred()) - - By("Parsing config.yaml to get providers") - var config EvalHubConfig - err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) - Expect(err).NotTo(HaveOccurred()) - - By("Generating providers YAML") - providersYAML, err := reconciler.generateProvidersYAML(config.Providers) - Expect(err).NotTo(HaveOccurred()) - - By("Verifying providers YAML matches configmap data") - Expect(providersYAML).To(Equal(configData["providers.yaml"])) - - By("Verifying providers YAML is valid") - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(providersYAML), &providersData) - Expect(err).NotTo(HaveOccurred()) - Expect(providersData).To(HaveKey("providers")) }) }) }) diff --git a/controllers/evalhub/constants.go b/controllers/evalhub/constants.go index 42109092f..e61809760 100644 --- a/controllers/evalhub/constants.go +++ b/controllers/evalhub/constants.go @@ -49,6 +49,15 @@ const ( mlflowTokenMountPath = "/var/run/secrets/mlflow" 
mlflowTokenFile = "token" mlflowTokenExpiration = 3600 // seconds + + // EvalHub config directory (contains config.yaml and providers/ subdir) + configDirPath = "/etc/evalhub/config" + + // Provider ConfigMap configuration + providerLabel = "trustyai.opendatahub.io/evalhub-provider-type" + providerNameLabel = "trustyai.opendatahub.io/evalhub-provider-name" + providersVolumeName = "evalhub-providers" + providersMountPath = configDirPath + "/providers" ) var ( diff --git a/controllers/evalhub/deployment.go b/controllers/evalhub/deployment.go index 9d63877eb..517b7fb8a 100644 --- a/controllers/evalhub/deployment.go +++ b/controllers/evalhub/deployment.go @@ -18,7 +18,7 @@ import ( ) // reconcileDeployment creates or updates the Deployment for EvalHub -func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { +func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *evalhubv1alpha1.EvalHub, providerCMNames []string) error { log := log.FromContext(ctx) log.Info("Reconciling Deployment", "name", instance.Name) @@ -36,7 +36,7 @@ func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *e } // Define the desired deployment spec - desiredSpec, err := r.buildDeploymentSpec(ctx, instance) + desiredSpec, err := r.buildDeploymentSpec(ctx, instance, providerCMNames) if err != nil { return err } @@ -61,7 +61,7 @@ func (r *EvalHubReconciler) reconcileDeployment(ctx context.Context, instance *e } // buildDeploymentSpec builds the deployment specification for EvalHub -func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *evalhubv1alpha1.EvalHub) (appsv1.DeploymentSpec, error) { +func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *evalhubv1alpha1.EvalHub, providerCMNames []string) (appsv1.DeploymentSpec, error) { labels := map[string]string{ "app": "eval-hub", "instance": instance.Name, @@ -104,12 +104,8 @@ func (r *EvalHubReconciler) 
buildDeploymentSpec(ctx context.Context, instance *e Value: "3", }, { - Name: "CONFIG_PATH", - Value: "/etc/evalhub/config.yaml", - }, - { - Name: "PROVIDERS_CONFIG_PATH", - Value: "/etc/evalhub/providers.yaml", + Name: "EVAL_HUB_CONFIG_DIR", + Value: configDirPath, }, { Name: "SERVICE_URL", @@ -140,7 +136,7 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e volumeMounts := []corev1.VolumeMount{ { Name: "evalhub-config", - MountPath: "/etc/evalhub", + MountPath: configDirPath, ReadOnly: true, }, { @@ -154,6 +150,13 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e ReadOnly: true, }, } + if len(providerCMNames) > 0 { + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: providersVolumeName, + MountPath: providersMountPath, + ReadOnly: true, + }) + } if instance.Spec.IsDatabaseConfigured() { volumeMounts = append(volumeMounts, corev1.VolumeMount{ Name: dbSecretVolumeName, @@ -343,6 +346,16 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e }, }, } + if len(providerCMNames) > 0 { + volumes = append(volumes, corev1.Volume{ + Name: providersVolumeName, + VolumeSource: corev1.VolumeSource{ + Projected: &corev1.ProjectedVolumeSource{ + Sources: providerVolumeProjections(providerCMNames), + }, + }, + }) + } if instance.Spec.IsDatabaseConfigured() { volumes = append(volumes, corev1.Volume{ Name: dbSecretVolumeName, diff --git a/controllers/evalhub/deployment_test.go b/controllers/evalhub/deployment_test.go index 2c90e67c9..e905cfa7b 100644 --- a/controllers/evalhub/deployment_test.go +++ b/controllers/evalhub/deployment_test.go @@ -75,7 +75,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should create deployment with correct specifications", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Verifying deployment 
exists") @@ -105,7 +105,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure evalhub container correctly", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -135,7 +135,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should set default environment variables", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -171,7 +171,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should include custom environment variables", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -200,7 +200,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure resource requirements", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -234,7 +234,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure health probes", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -273,7 +273,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure security contexts", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -302,7 +302,7 @@ var _ = Describe("EvalHub 
Deployment", func() { It("should update existing deployment", func() { By("Creating initial deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Modifying EvalHub replicas") @@ -312,7 +312,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(err).NotTo(HaveOccurred()) By("Reconciling deployment again") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Verifying deployment is updated") @@ -331,14 +331,14 @@ var _ = Describe("EvalHub Deployment", func() { Expect(err).NotTo(HaveOccurred()) By("Reconciling deployment without config map") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("kube-rbac-proxy configuration error")) }) It("should configure rolling update strategy", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -360,7 +360,7 @@ var _ = Describe("EvalHub Deployment", func() { defer k8sClient.Delete(ctx, dbEvalHub) By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, dbEvalHub) + err := reconciler.reconcileDeployment(ctx, dbEvalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -420,7 +420,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -456,7 +456,7 @@ var _ = Describe("EvalHub Deployment", func() { } By("Attempting to reconcile deployment") - err := 
reconciler.reconcileDeployment(ctx, nonExistentEvalHub) + err := reconciler.reconcileDeployment(ctx, nonExistentEvalHub, nil) Expect(err).To(HaveOccurred()) }) @@ -482,7 +482,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) By("Attempting to reconcile deployment") - err = reconciler.reconcileDeployment(ctx, evalHub) + err = reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("kube-rbac-proxy configuration error")) @@ -500,7 +500,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should include kube-rbac-proxy sidecar container", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -540,7 +540,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy resource requirements", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -573,7 +573,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy security context", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -599,7 +599,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure kube-rbac-proxy volume mounts", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -638,7 +638,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure deployment volumes for 
proxy", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") @@ -676,7 +676,7 @@ var _ = Describe("EvalHub Deployment", func() { It("should configure service account for API", func() { By("Reconciling deployment") - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) Expect(err).NotTo(HaveOccurred()) By("Getting deployment") diff --git a/controllers/evalhub/evalhub_controller.go b/controllers/evalhub/evalhub_controller.go index 3f098d900..9643b9b2b 100644 --- a/controllers/evalhub/evalhub_controller.go +++ b/controllers/evalhub/evalhub_controller.go @@ -145,8 +145,18 @@ func (r *EvalHubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return RequeueWithError(err) } + // Reconcile Provider ConfigMaps (copy from operator namespace to instance namespace) + providerCMNames, err := r.reconcileProviderConfigMaps(ctx, instance) + if err != nil { + log.Error(err, "Failed to reconcile Provider ConfigMaps") + instance.SetStatus("Ready", "Error", fmt.Sprintf("Failed to reconcile Provider ConfigMaps: %v", err), corev1.ConditionFalse) + r.Status().Update(ctx, instance) + return RequeueWithError(err) + } + instance.Status.ActiveProviders = instance.Spec.Providers + // Reconcile Deployment - if err := r.reconcileDeployment(ctx, instance); err != nil { + if err := r.reconcileDeployment(ctx, instance, providerCMNames); err != nil { log.Error(err, "Failed to reconcile Deployment") instance.SetStatus("Ready", "Error", fmt.Sprintf("Failed to reconcile Deployment: %v", err), corev1.ConditionFalse) r.Status().Update(ctx, instance) diff --git a/controllers/evalhub/evalhub_controller_test.go b/controllers/evalhub/evalhub_controller_test.go index a27b0f74c..97a9d46b6 100644 --- a/controllers/evalhub/evalhub_controller_test.go +++ 
b/controllers/evalhub/evalhub_controller_test.go @@ -42,6 +42,11 @@ var _ = Describe("EvalHub Controller", func() { configMap = createConfigMap(configMapName, testNamespace) Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) + // Create source provider ConfigMaps (needed because the CRD default populates providers) + for _, cm := range createDefaultProviderConfigMaps(testNamespace) { + Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) + } + // Create EvalHub instance evalHub = createEvalHubInstance(evalHubName, testNamespace) Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) @@ -180,6 +185,11 @@ var _ = Describe("EvalHub Lifecycle Integration", func() { configMap = createConfigMap(configMapName, testNamespace) Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) + // Create source provider ConfigMaps (needed because the CRD default populates providers) + for _, cm := range createDefaultProviderConfigMaps(testNamespace) { + Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) + } + // Create EvalHub instance evalHub = createEvalHubInstance(evalHubName, testNamespace) Expect(k8sClient.Create(ctx, evalHub)).Should(Succeed()) @@ -219,7 +229,6 @@ var _ = Describe("EvalHub Lifecycle Integration", func() { By("Checking that ConfigMap is created") configMapCreated := waitForConfigMap(evalHubName+"-config", testNamespace) Expect(configMapCreated.Data).To(HaveKey("config.yaml")) - Expect(configMapCreated.Data).To(HaveKey("providers.yaml")) By("Checking that Deployment is created") deployment := waitForDeployment(evalHubName, testNamespace) diff --git a/controllers/evalhub/suite_test.go b/controllers/evalhub/suite_test.go index 95de06484..459c9e860 100644 --- a/controllers/evalhub/suite_test.go +++ b/controllers/evalhub/suite_test.go @@ -94,7 +94,8 @@ func createEvalHubInstance(name, namespace string) *evalhubv1alpha1.EvalHub { Namespace: namespace, }, Spec: evalhubv1alpha1.EvalHubSpec{ - Replicas: &replicas, + Replicas: &replicas, + Providers: 
[]string{}, Env: []corev1.EnvVar{ { Name: "TEST_ENV", @@ -128,6 +129,30 @@ func createConfigMap(name, namespace string) *corev1.ConfigMap { } } +// createDefaultProviderConfigMaps creates source provider ConfigMaps in the given namespace +// to satisfy the CRD default providers list during integration tests. +func createDefaultProviderConfigMaps(namespace string) []*corev1.ConfigMap { + defaultProviders := []string{"garak", "guidellm", "lighteval", "lm-evaluation-harness"} + var cms []*corev1.ConfigMap + for _, id := range defaultProviders { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "evalhub-provider-" + id, + Namespace: namespace, + Labels: map[string]string{ + providerLabel: "system", + providerNameLabel: id, + }, + }, + Data: map[string]string{ + id + ".yaml": "id: " + id + "\nname: " + id + "\n", + }, + } + cms = append(cms, cm) + } + return cms +} + // setupReconciler creates and returns a EvalHubReconciler for testing func setupReconciler(namespace string) (*EvalHubReconciler, context.Context) { eventRecorder := record.NewFakeRecorder(100) diff --git a/controllers/evalhub/unit_test.go b/controllers/evalhub/unit_test.go index 6d1fbeb3b..c8ed7832c 100644 --- a/controllers/evalhub/unit_test.go +++ b/controllers/evalhub/unit_test.go @@ -73,7 +73,7 @@ func TestEvalHubReconciler_reconcileDeployment(t *testing.T) { } t.Run("should create deployment with correct spec", func(t *testing.T) { - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) require.NoError(t, err) // Verify deployment was created @@ -154,7 +154,7 @@ func TestEvalHubReconciler_reconcileDeployment(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - err := reconcilerNoConfig.reconcileDeployment(ctx, evalHub) + err := reconcilerNoConfig.reconcileDeployment(ctx, evalHub, nil) require.Error(t, err) assert.Contains(t, err.Error(), "kube-rbac-proxy configuration error") }) @@ -266,33 +266,11 @@ func 
TestEvalHubReconciler_reconcileConfigMap(t *testing.T) { // Check data keys exist assert.Contains(t, configMap.Data, "config.yaml") - assert.Contains(t, configMap.Data, "providers.yaml") // Parse and validate config.yaml var config EvalHubConfig err = yaml.Unmarshal([]byte(configMap.Data["config.yaml"]), &config) require.NoError(t, err) - - // Check providers - assert.Len(t, config.Providers, 4) - providerNames := make([]string, len(config.Providers)) - for i, provider := range config.Providers { - providerNames[i] = provider.Name - } - assert.Contains(t, providerNames, "lm-eval-harness") - assert.Contains(t, providerNames, "ragas-provider") - assert.Contains(t, providerNames, "garak-security") - assert.Contains(t, providerNames, "trustyai-custom") - - // Check collections - assert.Contains(t, config.Collections, "healthcare_safety_v1") - assert.Contains(t, config.Collections, "automotive_safety_v1") - - // Parse and validate providers.yaml - var providersData map[string]interface{} - err = yaml.Unmarshal([]byte(configMap.Data["providers.yaml"]), &providersData) - require.NoError(t, err) - assert.Contains(t, providersData, "providers") }) } @@ -428,35 +406,11 @@ func TestGenerateConfigData(t *testing.T) { // Check keys exist assert.Contains(t, configData, "config.yaml") - assert.Contains(t, configData, "providers.yaml") // Parse config.yaml var config EvalHubConfig err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) require.NoError(t, err) - - // Verify default providers - assert.Len(t, config.Providers, 4) - - // Find lm-eval-harness provider - var lmEvalProvider *ProviderConfig - for _, provider := range config.Providers { - if provider.Name == "lm-eval-harness" { - lmEvalProvider = &provider - break - } - } - require.NotNil(t, lmEvalProvider) - assert.Equal(t, "lm_evaluation_harness", lmEvalProvider.Type) - assert.True(t, lmEvalProvider.Enabled) - assert.Contains(t, lmEvalProvider.Benchmarks, "arc_challenge") - assert.Equal(t, "8", 
lmEvalProvider.Config["batch_size"]) - - // Verify collections - assert.Contains(t, config.Collections, "healthcare_safety_v1") - assert.Contains(t, config.Collections, "automotive_safety_v1") - assert.Contains(t, config.Collections, "finance_compliance_v1") - assert.Contains(t, config.Collections, "general_llm_eval_v1") }) } @@ -1188,7 +1142,7 @@ func TestGenerateConfigData_WithDatabase(t *testing.T) { assert.Equal(t, 10, config.Database.MaxIdleConns) }) - t.Run("should omit database and secrets sections when DB not configured", func(t *testing.T) { + t.Run("should default to sqlite when DB not explicitly configured", func(t *testing.T) { evalHub := &evalhubv1alpha1.EvalHub{ ObjectMeta: metav1.ObjectMeta{ Name: "test-evalhub", @@ -1204,7 +1158,8 @@ func TestGenerateConfigData_WithDatabase(t *testing.T) { err = yaml.Unmarshal([]byte(configData["config.yaml"]), &config) require.NoError(t, err) - assert.Nil(t, config.Database) + assert.NotNil(t, config.Database) + assert.Equal(t, "sqlite", config.Database.Driver) assert.Nil(t, config.Secrets) }) } @@ -1256,7 +1211,7 @@ func TestEvalHubReconciler_reconcileDeployment_WithDB(t *testing.T) { } t.Run("should add DB secret volume and mount when database configured", func(t *testing.T) { - err := reconciler.reconcileDeployment(ctx, evalHub) + err := reconciler.reconcileDeployment(ctx, evalHub, nil) require.NoError(t, err) deployment := &appsv1.Deployment{} @@ -1330,3 +1285,212 @@ func TestEvalHubHelperMethods_IsDatabaseConfigured(t *testing.T) { assert.True(t, spec.IsDatabaseConfigured()) }) } + +func TestEvalHubReconciler_reconcileProviderConfigMaps(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, appsv1.AddToScheme(scheme)) + require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) + + ctx := context.Background() + operatorNamespace := "operator-ns" + instanceNamespace := "instance-ns" + evalHubName := "test-evalhub" + + // Source provider ConfigMap in 
the operator namespace (simulates what kustomize deploys) + sourceProvider := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "trustyai-service-operator-evalhub-provider-testprovider", + Namespace: operatorNamespace, + Labels: map[string]string{ + providerLabel: "system", + providerNameLabel: "testprovider", + }, + }, + Data: map[string]string{ + "testprovider.yaml": "id: testprovider\nname: Test Provider\nruntime:\n k8s:\n image: quay.io/test/provider:latest\n", + }, + } + + t.Run("should copy provider ConfigMap to instance namespace", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"testprovider"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub, sourceProvider). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + + // Should return the target ConfigMap name + require.Len(t, cmNames, 1) + assert.Equal(t, evalHubName+"-provider-testprovider", cmNames[0]) + + // Verify the ConfigMap was created in the instance namespace with correct data + copiedCM := &corev1.ConfigMap{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-provider-testprovider", + Namespace: instanceNamespace, + }, copiedCM) + require.NoError(t, err) + assert.Equal(t, sourceProvider.Data["testprovider.yaml"], copiedCM.Data["testprovider.yaml"]) + }) + + t.Run("should return nil when no providers specified", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{}, + } + + fakeClient := fake.NewClientBuilder(). 
+ WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + assert.Nil(t, cmNames) + }) + + t.Run("should error when provider not found", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"nonexistent"}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.Error(t, err) + assert.Nil(t, cmNames) + assert.Contains(t, err.Error(), "not found") + }) + + t.Run("should mount providers as projected volume in deployment", func(t *testing.T) { + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: instanceNamespace, + }, + Spec: evalhubv1alpha1.EvalHubSpec{ + Providers: []string{"testprovider"}, + }, + } + + operatorConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: operatorNamespace, + }, + Data: map[string]string{ + configMapEvalHubImageKey: "quay.io/test/eval-hub:latest", + configMapKubeRBACProxyImageKey: "gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub, sourceProvider, operatorConfigMap). 
+ Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: operatorNamespace, + EventRecorder: record.NewFakeRecorder(10), + } + + // First reconcile provider ConfigMaps + cmNames, err := reconciler.reconcileProviderConfigMaps(ctx, evalHub) + require.NoError(t, err) + require.Len(t, cmNames, 1) + + // Then reconcile deployment with the provider ConfigMap names + err = reconciler.reconcileDeployment(ctx, evalHub, cmNames) + require.NoError(t, err) + + // Verify the deployment has the projected volume + deployment := &appsv1.Deployment{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: evalHubName, + Namespace: instanceNamespace, + }, deployment) + require.NoError(t, err) + + // Find the evalhub-providers projected volume + var providersVolume *corev1.Volume + for i, v := range deployment.Spec.Template.Spec.Volumes { + if v.Name == providersVolumeName { + providersVolume = &deployment.Spec.Template.Spec.Volumes[i] + break + } + } + require.NotNil(t, providersVolume, "evalhub-providers volume should be present") + require.NotNil(t, providersVolume.VolumeSource.Projected) + require.Len(t, providersVolume.VolumeSource.Projected.Sources, 1) + assert.Equal(t, evalHubName+"-provider-testprovider", + providersVolume.VolumeSource.Projected.Sources[0].ConfigMap.Name) + + // Find the providers volume mount on the evalhub container + var evalHubContainer *corev1.Container + for i, c := range deployment.Spec.Template.Spec.Containers { + if c.Name == containerName { + evalHubContainer = &deployment.Spec.Template.Spec.Containers[i] + break + } + } + require.NotNil(t, evalHubContainer) + + var providersMount *corev1.VolumeMount + for i, m := range evalHubContainer.VolumeMounts { + if m.Name == providersVolumeName { + providersMount = &evalHubContainer.VolumeMounts[i] + break + } + } + require.NotNil(t, providersMount, "providers volume mount should be present") + assert.Equal(t, providersMountPath, providersMount.MountPath) + 
assert.True(t, providersMount.ReadOnly) + }) +} diff --git a/hack/sync-evalhub-providers.py b/hack/sync-evalhub-providers.py new file mode 100755 index 000000000..b45cc7cde --- /dev/null +++ b/hack/sync-evalhub-providers.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Fetches provider YAML files from the eval-hub upstream repository and generates +Kubernetes ConfigMap manifests for the operator to deploy. + +Usage: + hack/sync-evalhub-providers.py [branch] + +Arguments: + branch Git branch to fetch from (default: main) +""" + +import json +import sys +import textwrap +import urllib.request +from pathlib import Path + +import yaml + +REPO = "eval-hub/eval-hub" +UPSTREAM_DIR = "config/providers" +OUTPUT_DIR = Path("config/configmaps/evalhub") + +PROVIDER_TYPE_LABEL = "trustyai.opendatahub.io/evalhub-provider-type" +PROVIDER_NAME_LABEL = "trustyai.opendatahub.io/evalhub-provider-name" + +# Files to exclude from the upstream repository (by filename) +EXCLUDE_FILES = { + "ragas.yaml", +} + + +def fetch_json(url: str): + with urllib.request.urlopen(url) as resp: + return json.load(resp) + + +def fetch_text(url: str) -> str: + with urllib.request.urlopen(url) as resp: + return resp.read().decode() + + +def list_yaml_files(branch: str) -> list[str]: + api_url = f"https://api.github.com/repos/{REPO}/contents/{UPSTREAM_DIR}?ref={branch}" + print(f"Fetching provider list from {api_url}") + entries = fetch_json(api_url) + return [e["name"] for e in entries if e["name"].endswith((".yaml", ".yml"))] + + +def process_provider(filename: str, branch: str) -> tuple[str, str] | None: + """Download a provider YAML, replace the image with a kustomize placeholder, + and generate a ConfigMap manifest. 
Returns (cm_filename, provider_id) or None.""" + raw_url = f"https://raw.githubusercontent.com/{REPO}/{branch}/{UPSTREAM_DIR}/{filename}" + content = fetch_text(raw_url) + data = yaml.safe_load(content) + + provider_id = data.get("id") + if not provider_id: + print(f" SKIP: no 'id' field found in {filename}", file=sys.stderr) + return None + + # Sanitize for K8s resource names (RFC 1123: only lowercase alphanumeric and hyphens) + safe_id = provider_id.replace("_", "-") + + cm_file = f"provider-{safe_id}.yaml" + cm_name = f"evalhub-provider-{safe_id}" + var_name = f"evalhub-provider-{safe_id}-image" + + print(f" id={provider_id} -> {cm_file}") + + # Replace runtime.k8s.image with kustomize placeholder + if "runtime" in data and "k8s" in data["runtime"]: + data["runtime"]["k8s"]["image"] = f"$({var_name})" + + provider_yaml = yaml.dump(data, default_flow_style=False, sort_keys=False) + + cm = textwrap.dedent(f"""\ + apiVersion: v1 + kind: ConfigMap + metadata: + name: {cm_name} + labels: + {PROVIDER_TYPE_LABEL}: system + {PROVIDER_NAME_LABEL}: {safe_id} + data: + {filename}: | + """) + indented = textwrap.indent(provider_yaml, " ") + + (OUTPUT_DIR / cm_file).write_text(cm + indented) + return cm_file, safe_id + + +def write_kustomization(cm_files: list[str]): + lines = ["resources:"] + for f in cm_files: + lines.append(f" - {f}") + lines.append("") + lines.append("namespace: system") + lines.append("") + (OUTPUT_DIR / "kustomization.yaml").write_text("\n".join(lines)) + + +def main(): + branch = sys.argv[1] if len(sys.argv) > 1 else "main" + + filenames = list_yaml_files(branch) + if not filenames: + print(f"ERROR: No YAML files found in {UPSTREAM_DIR}", file=sys.stderr) + sys.exit(1) + + # Clean existing provider ConfigMap files + for old in OUTPUT_DIR.glob("provider-*.yaml"): + old.unlink() + + cm_files = [] + provider_ids = [] + + for filename in filenames: + if filename in EXCLUDE_FILES: + print(f"Skipping {filename} (excluded)") + continue + print(f"Processing 
{filename}...") + result = process_provider(filename, branch) + if result: + cm_file, provider_id = result + cm_files.append(cm_file) + provider_ids.append(provider_id) + + write_kustomization(cm_files) + + print(f"\nGenerated {len(cm_files)} provider ConfigMaps in {OUTPUT_DIR}/") + print(f"Provider IDs: {', '.join(provider_ids)}") + + +if __name__ == "__main__": + main()