 
 from __future__ import annotations
 
-from typing import Optional
-from typing_extensions import Required, TypedDict
+from typing import Dict, Optional
+from typing_extensions import Literal, Required, TypedDict
 
-__all__ = ["ProjectCreateParams", "Config"]
+__all__ = [
+    "ProjectCreateParams",
+    "Config",
+    "ConfigEvalConfig",
+    "ConfigEvalConfigCustomEvals",
+    "ConfigEvalConfigCustomEvalsEvals",
+    "ConfigEvalConfigDefaultEvals",
+    "ConfigEvalConfigDefaultEvalsContextSufficiency",
+    "ConfigEvalConfigDefaultEvalsQueryEase",
+    "ConfigEvalConfigDefaultEvalsResponseGroundedness",
+    "ConfigEvalConfigDefaultEvalsResponseHelpfulness",
+    "ConfigEvalConfigDefaultEvalsTrustworthiness",
+]
 
 
 class ProjectCreateParams(TypedDict, total=False):
@@ -18,9 +30,276 @@ class ProjectCreateParams(TypedDict, total=False):
     description: Optional[str]
 
 
+class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
+    criteria: Required[str]
+    """
+    The evaluation criteria text that describes what aspect is being evaluated and
+    how
+    """
+
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    context_identifier: Optional[str]
+    """
+    The exact string used in your evaluation criteria to reference the retrieved
+    context.
+    """
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    query_identifier: Optional[str]
+    """
+    The exact string used in your evaluation criteria to reference the user's query.
+    """
+
+    response_identifier: Optional[str]
+    """
+    The exact string used in your evaluation criteria to reference the RAG/LLM
+    response.
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigCustomEvals(TypedDict, total=False):
+    evals: Dict[str, ConfigEvalConfigCustomEvalsEvals]
+
+
+class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False):
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False):
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False):
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False):
+    eval_key: Required[str]
+    """
+    Unique key for eval metric - currently maps to the TrustworthyRAG name property
+    and eval_scores dictionary key to check against threshold
+    """
+
+    name: Required[str]
+    """Display name/label for the evaluation metric"""
+
+    enabled: bool
+    """Allows the evaluation to be disabled without removing it"""
+
+    priority: Optional[int]
+    """
+    Priority order for evals (lower number = higher priority) to determine primary
+    eval issue to surface
+    """
+
+    should_escalate: bool
+    """
+    If true, failing this eval means the response is considered bad and can trigger
+    escalation to Codex/SME
+    """
+
+    threshold: float
+    """Threshold value that determines if the evaluation fails"""
+
+    threshold_direction: Literal["above", "below"]
+    """Whether the evaluation fails when score is above or below the threshold"""
+
+
+class ConfigEvalConfigDefaultEvals(TypedDict, total=False):
+    context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed, while
+    other properties like thresholds and priorities can be configured.
+    """
+
+    query_ease: ConfigEvalConfigDefaultEvalsQueryEase
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed, while
+    other properties like thresholds and priorities can be configured.
+    """
+
+    response_groundedness: ConfigEvalConfigDefaultEvalsResponseGroundedness
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed, while
+    other properties like thresholds and priorities can be configured.
+    """
+
+    response_helpfulness: ConfigEvalConfigDefaultEvalsResponseHelpfulness
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed, while
+    other properties like thresholds and priorities can be configured.
+    """
+
+    trustworthiness: ConfigEvalConfigDefaultEvalsTrustworthiness
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed, while
+    other properties like thresholds and priorities can be configured.
+    """
+
+
+class ConfigEvalConfig(TypedDict, total=False):
+    custom_evals: ConfigEvalConfigCustomEvals
+    """Configuration for custom evaluation metrics."""
+
+    default_evals: ConfigEvalConfigDefaultEvals
+    """Configuration for default evaluation metrics."""
+
+
 class Config(TypedDict, total=False):
     clustering_use_llm_matching: bool
 
+    eval_config: ConfigEvalConfig
+    """Configuration for project-specific evaluation metrics"""
+
     llm_matching_model: str
 
     llm_matching_quality_preset: str
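
For context on how these pieces fit together, here is a minimal sketch of a `Config` payload using the new `eval_config` field. The key names and `Required[...]` fields come straight from the TypedDicts in this diff; every concrete value (names, criteria text, thresholds) is an illustrative assumption, not an SDK default.

```python
# Sketch only: key names and required fields follow the TypedDicts above;
# all concrete values are made up for illustration.
config = {
    "eval_config": {
        "default_evals": {
            # Built-in metric: criteria/identifiers are system-managed, but
            # threshold, priority, and escalation behavior are configurable.
            "trustworthiness": {
                "eval_key": "trustworthiness",   # Required
                "name": "Trustworthiness",       # Required
                "enabled": True,
                "priority": 0,                   # lower number = higher priority
                "should_escalate": True,         # failing can trigger Codex/SME escalation
                "threshold": 0.7,
                "threshold_direction": "below",  # fail when score falls below 0.7
            },
        },
        "custom_evals": {
            "evals": {
                "politeness": {
                    "criteria": "Is the {response} to the {query} polite?",  # Required
                    "eval_key": "politeness",                                # Required
                    "name": "Politeness",                                    # Required
                    "query_identifier": "{query}",        # exact string used in criteria
                    "response_identifier": "{response}",  # exact string used in criteria
                    "threshold": 0.5,
                    "threshold_direction": "below",
                },
            },
        },
    },
}
```

Because every class here is declared with `total=False`, only the `Required[...]` keys must be present; omitted fields presumably fall back to server-side defaults.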
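The `threshold`/`threshold_direction` pair is worth spelling out. A hypothetical helper matching the documented semantics ("whether the evaluation fails when score is above or below the threshold") might look like this; whether the comparison is strict or inclusive is not visible in this diff:

```python
from typing import Literal

def eval_fails(score: float, threshold: float,
               direction: Literal["above", "below"]) -> bool:
    """Hypothetical check mirroring the docstrings above: the eval fails
    when the score lands on the named side of the threshold (strict
    comparison assumed)."""
    return score > threshold if direction == "above" else score < threshold
```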