Skip to content

Commit 96fa4cc

Browse files
jwmueller and huiwengoh authored
Validate when explanations logging is supported or not (#124)
Co-authored-by: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
1 parent 487cd36 commit 96fa4cc

File tree

9 files changed

+224
-10
lines changed

9 files changed

+224
-10
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.1.34] - 2025-09-24
11+
12+
### Added
13+
14+
- Validate when explanations logging is supported or not
15+
1016
## [1.1.33] - 2025-09-23
1117

1218
### Fixed
@@ -355,7 +361,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
355361

356362
- Release of the Cleanlab TLM Python client.
357363

358-
[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.33...HEAD
364+
[Unreleased]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.34...HEAD
365+
[1.1.34]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.33...v1.1.34
359366
[1.1.33]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.32...v1.1.33
360367
[1.1.32]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.31...v1.1.32
361368
[1.1.31]: https://github.com/cleanlab/cleanlab-tlm/compare/v1.1.30...v1.1.31

src/cleanlab_tlm/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# SPDX-License-Identifier: MIT
2-
__version__ = "1.1.33"
2+
__version__ = "1.1.34"

src/cleanlab_tlm/internal/constants.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
_VALID_TLM_QUALITY_PRESETS: list[str] = ["best", "high", "medium", "low", "base"]
66
_VALID_TLM_QUALITY_PRESETS_CHAT_COMPLETIONS: list[str] = ["medium", "low", "base"]
77
_DEFAULT_TLM_QUALITY_PRESET: TLMQualityPreset = "medium"
8+
_QUALITY_PRESETS_W_CONSISTENCY_SAMPLES: set[str] = {"best", "high"} # Must also apply to TrustworthyRAG
89
_DEFAULT_TLM_MAX_TOKENS: int = 512
910
_VALID_TLM_MODELS: list[str] = [
1011
"gpt-3.5-turbo-16k",
@@ -38,6 +39,17 @@
3839
"nova-pro",
3940
]
4041
_TLM_DEFAULT_MODEL: str = "gpt-4.1-mini"
42+
_HIDDEN_REASONING_MODELS: set[str] = {
43+
"o1-preview",
44+
"o1",
45+
"o1-mini",
46+
"o3",
47+
"o3-mini",
48+
"o4-mini",
49+
"gpt-5",
50+
"gpt-5-mini",
51+
"gpt-5-nano",
52+
}
4153
_TLM_DEFAULT_CONTEXT_LIMIT: int = 70000
4254
_VALID_TLM_TASKS: set[str] = {task.value for task in Task}
4355
TLM_TASK_SUPPORTING_CONSTRAIN_OUTPUTS: set[Task] = {
@@ -95,3 +107,7 @@
95107
_TLM_EVAL_QUERY_IDENTIFIER_KEY: str = "query_identifier"
96108
_TLM_EVAL_CONTEXT_IDENTIFIER_KEY: str = "context_identifier"
97109
_TLM_EVAL_RESPONSE_IDENTIFIER_KEY: str = "response_identifier"
110+
111+
# Values that won't support logging explanations by default
112+
_REASONING_EFFORT_UNSUPPORTED_EXPLANATION_LOGGING: set[str] = {"none", "minimal"}
113+
_QUALITY_PRESETS_UNSUPPORTED_EXPLANATION_LOGGING: set[str] = {"low", "base"} # For regular TLM not TrustworthyRAG

src/cleanlab_tlm/internal/validation.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77

88
from cleanlab_tlm.errors import ValidationError
99
from cleanlab_tlm.internal.constants import (
10+
_HIDDEN_REASONING_MODELS,
11+
_QUALITY_PRESETS_UNSUPPORTED_EXPLANATION_LOGGING,
12+
_QUALITY_PRESETS_W_CONSISTENCY_SAMPLES,
13+
_REASONING_EFFORT_UNSUPPORTED_EXPLANATION_LOGGING,
1014
_TLM_CONSTRAIN_OUTPUTS_KEY,
1115
_TLM_DEFAULT_MODEL,
1216
_TLM_MAX_TOKEN_RANGE,
@@ -143,6 +147,12 @@ def validate_tlm_options(
143147
)
144148

145149
elif option == "use_self_reflection":
150+
if "num_self_reflections" in options:
151+
raise ValidationError(
152+
"`use_self_reflection` and `num_self_reflections` cannot be specified together. "
153+
"`use_self_reflection` is deprecated. Use `num_self_reflections` instead."
154+
)
155+
146156
if not isinstance(val, bool):
147157
raise ValidationError(f"Invalid type {type(val)}, use_self_reflection must be a boolean")
148158

@@ -169,9 +179,6 @@ def validate_tlm_options(
169179
raise ValidationError(f"Invalid type {type(val)}, log must be a list of strings.")
170180

171181
invalid_log_options = set(val) - TLM_VALID_LOG_OPTIONS
172-
173-
model = options.get("model", _TLM_DEFAULT_MODEL)
174-
175182
if invalid_log_options:
176183
raise ValidationError(
177184
f"Invalid options for log: {invalid_log_options}. Valid options include: {TLM_VALID_LOG_OPTIONS}"
@@ -248,6 +255,75 @@ def _validate_trustworthy_rag_options(options: Optional[TLMOptions], initialized
248255
)
249256

250257

258+
def validate_logging(options: Optional[TLMOptions], quality_preset: str, subclass: str) -> None:
    """Check that a configuration asking to log explanations is able to do so.

    Nothing is checked unless "explanation" appears in ``options["log"]``. When it
    does, the function either returns silently (configuration is supported) or
    raises an informative error.

    Args:
        options: User-specified TLM options, or None.
        quality_preset: Quality preset of the object being validated. Only consulted
            once the explicitly specified options have not settled the question.
        subclass: Either "TLM" or "TrustworthyRAG". Names the kind of object being
            validated, because each kind maps quality presets to different base options.

    Raises:
        ValueError: If explanation logging was requested but the configuration
            does not support it.
    """
    # Nothing to validate unless the user explicitly asked to log explanations.
    if not options or "log" not in options or "explanation" not in options["log"]:
        return

    error = ValueError(
        "Your TLM configuration does not support logged explanations. "
        "Please remove 'explanation' from your specified `log`, and instead use the `get_explanation()` method after computing trust scores."
    )

    if options.get("disable_trustworthiness", False):
        raise error

    model = options.get("model")
    n_samples = options.get("num_consistency_samples")
    effort = options.get("reasoning_effort")

    # `use_self_reflection` is deprecated; an explicit False is treated as zero self-reflections.
    n_reflections = 0 if options.get("use_self_reflection") is False else options.get("num_self_reflections")

    # Both consistency sampling and self-reflection explicitly switched off.
    if n_samples == 0 and n_reflections == 0:
        raise error

    # Explicitly requested consistency samples are sufficient on their own.
    if n_samples is not None and n_samples > 0:
        return

    # An explicit, supported reasoning effort is sufficient unless self-reflection was switched off.
    if effort is not None and effort not in _REASONING_EFFORT_UNSUPPORTED_EXPLANATION_LOGGING:
        if n_reflections is None or n_reflections > 0:
            return

    if n_samples == 0 and effort in _REASONING_EFFORT_UNSUPPORTED_EXPLANATION_LOGGING:
        raise error

    if model in _HIDDEN_REASONING_MODELS:
        raise error

    # From here on, the relevant options are assumed to be unspecified by the user,
    # so the decision falls back to what the quality preset implies.
    if subclass == "TLM":
        if quality_preset in _QUALITY_PRESETS_UNSUPPORTED_EXPLANATION_LOGGING:
            raise error
        if quality_preset in _QUALITY_PRESETS_W_CONSISTENCY_SAMPLES:
            return
        if effort in _REASONING_EFFORT_UNSUPPORTED_EXPLANATION_LOGGING:
            raise error
        if n_reflections == 0 and n_samples is None:
            raise error
    elif subclass == "TrustworthyRAG":
        if quality_preset not in _QUALITY_PRESETS_W_CONSISTENCY_SAMPLES:
            raise error
        if n_samples == 0:
            raise error
325+
326+
251327
def process_and_validate_kwargs_constrain_outputs(
252328
prompt: Union[str, Sequence[str]],
253329
task: Optional[Task],

src/cleanlab_tlm/tlm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
tlm_explanation_format_tlm_result,
4242
tlm_prompt_process_and_validate_kwargs,
4343
tlm_score_process_response_and_kwargs,
44+
validate_logging,
4445
validate_tlm_prompt,
4546
validate_tlm_prompt_response,
4647
)
@@ -117,6 +118,7 @@ def __init__(
117118
)
118119

119120
# TLM-specific initialization
121+
validate_logging(options=options, quality_preset=quality_preset, subclass="TLM")
120122
if task not in _VALID_TLM_TASKS:
121123
raise ValidationError(f"Invalid task {task} -- must be one of {_VALID_TLM_TASKS}")
122124

src/cleanlab_tlm/utils/rag.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
_validate_trustworthy_rag_options,
4545
tlm_explanation_format_trustworthy_rag_result,
4646
tlm_score_process_response_and_kwargs,
47+
validate_logging,
4748
validate_rag_inputs,
4849
)
4950

@@ -134,6 +135,7 @@ def __init__(
134135
self._evals = evals
135136

136137
_validate_trustworthy_rag_options(options=options, initialized_evals=self._evals)
138+
validate_logging(options=options, quality_preset=quality_preset, subclass="TrustworthyRAG")
137139

138140
# Optional per-eval tool call overrides
139141
# These are name-based include/exclude sets used only in the _handle_tool_call_filtering decorator

tests/conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
TLM_SIMILARITY_MEASURES,
2020
)
2121
from cleanlab_tlm.internal.types import TLMQualityPreset
22+
from cleanlab_tlm.internal.validation import validate_logging
2223
from cleanlab_tlm.tlm import TLM, TLMOptions
2324
from cleanlab_tlm.utils.chat_completions import TLMChatCompletion
2425
from cleanlab_tlm.utils.rag import TrustworthyRAG
@@ -83,6 +84,16 @@ def tlm_dict(tlm_api_key: str) -> dict[str, Any]:
8384
tlm_dict[quality_preset][model] = {}
8485
task = random.choice(list(_VALID_TLM_TASKS))
8586
options = _get_options_dictionary(model)
87+
try: # ensure valid options/preset/model configuration for logging
88+
validate_logging(options=options, quality_preset=quality_preset, subclass="TLM")
89+
except ValueError as e:
90+
if "does not support logged explanations" in str(e):
91+
options["log"].remove("explanation")
92+
if len(options["log"]) == 0:
93+
del options["log"] # log cannot be empty list
94+
else:
95+
raise ValueError(e)
96+
8697
tlm_dict[quality_preset][model]["tlm"] = TLM(
8798
quality_preset=quality_preset,
8899
task=task,

tests/test_chat_completions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def test_tlm_chat_completion_structured_output_per_field_scoring() -> None:
283283
# test per_field_score
284284
assert len(score["log"]["per_field_score"]) == 2 # noqa: PLR2004
285285
assert {"steps", "final_answer"} == set(score["log"]["per_field_score"].keys())
286-
assert tlm_chat.get_untrustworthy_fields(response=response, tlm_result=score) == ["final_answer"]
286+
assert "final_answer" in tlm_chat.get_untrustworthy_fields(response=response, tlm_result=score)
287287

288288

289289
def test_tlm_chat_completion_score_invalid_response() -> None:

tests/test_validation.py

Lines changed: 104 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -875,14 +875,114 @@ def test_disable_trustworthiness_with_custom_criteria_works(tlm_api_key: str) ->
875875

876876
def test_disable_trustworthiness_without_custom_criteria_raises_error_rag(tlm_api_key: str) -> None:
877877
"""Test that disable_trustworthiness=True without custom_eval_criteria raises ValueError for TrustworthyRAG."""
878-
from cleanlab_tlm.utils.rag import TrustworthyRAG
879-
880878
with pytest.raises(ValidationError, match="^When disable_trustworthiness=True in TrustworthyRAG"):
881879
TrustworthyRAG(evals=[], api_key=tlm_api_key, options={"disable_trustworthiness": True})
882880

883881

884882
def test_disable_trustworthiness_with_custom_criteria_works_rag(tlm_api_key: str) -> None:
885883
"""Test that disable_trustworthiness=True with custom_eval_criteria works normally for TrustworthyRAG."""
886-
from cleanlab_tlm.utils.rag import TrustworthyRAG
887-
888884
TrustworthyRAG(api_key=tlm_api_key, options={"disable_trustworthiness": True})
885+
886+
887+
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_validate_logging(tlm_api_key: str) -> None:
    """Check that explanation-logging validation accepts and rejects the right configurations.

    Each case is a dict of constructor keyword arguments (besides ``api_key``).
    Cases run in the listed order: supported TLM, unsupported TLM,
    supported TrustworthyRAG, unsupported TrustworthyRAG.
    """
    unsupported_message = "does not support logged explanations"

    # TLM configurations that must construct without error.
    tlm_supported = [
        {},
        {"options": {"log": ["explanation"]}},
        {"quality_preset": "best", "options": {"log": ["explanation"], "reasoning_effort": "none"}},
        {"quality_preset": "high", "options": {"log": ["explanation"], "reasoning_effort": "none"}},
        {"quality_preset": "base", "options": {"log": ["explanation"], "num_consistency_samples": 8}},
        {"quality_preset": "best", "options": {"log": ["explanation"], "num_self_reflections": 0}},
        {
            "quality_preset": "low",
            "options": {"log": ["explanation"], "num_self_reflections": 0, "num_consistency_samples": 4},
        },
        # Hidden-reasoning model is fine as long as explanations are not logged.
        {"options": {"model": "gpt-5-mini"}},
    ]
    for kwargs in tlm_supported:
        TLM(api_key=tlm_api_key, **kwargs)

    # TLM configurations that must be rejected.
    tlm_unsupported = [
        {"quality_preset": "low", "options": {"log": ["explanation"]}},
        {"quality_preset": "base", "options": {"log": ["explanation"]}},
        {
            "quality_preset": "best",
            "options": {"log": ["explanation"], "reasoning_effort": "none", "num_consistency_samples": 0},
        },
        {"options": {"log": ["explanation"], "num_self_reflections": 0}},
        {"options": {"log": ["explanation"], "use_self_reflection": False}},
        {
            "quality_preset": "best",
            "options": {"log": ["explanation"], "num_self_reflections": 0, "num_consistency_samples": 0},
        },
        {"options": {"log": ["explanation"], "reasoning_effort": "high", "num_self_reflections": 0}},
        {"options": {"log": ["explanation"], "model": "gpt-5-mini"}},
    ]
    for kwargs in tlm_unsupported:
        with pytest.raises(ValueError, match=unsupported_message):
            TLM(api_key=tlm_api_key, **kwargs)

    # TrustworthyRAG configurations that must construct without error.
    rag_supported = [
        {},
        {"options": {"log": ["explanation"], "num_consistency_samples": 5}},
        {"options": {"log": ["explanation"], "reasoning_effort": "high"}},
        {"quality_preset": "best", "options": {"log": ["explanation"]}},
    ]
    for kwargs in rag_supported:
        TrustworthyRAG(api_key=tlm_api_key, **kwargs)

    # TrustworthyRAG configurations that must be rejected.
    rag_unsupported = [
        {"options": {"log": ["explanation"]}},
        {"quality_preset": "best", "options": {"log": ["explanation"], "num_consistency_samples": 0}},
        {"options": {"log": ["explanation"], "reasoning_effort": "high", "num_self_reflections": 0}},
        {
            "quality_preset": "best",
            "options": {
                "log": ["explanation"],
                "reasoning_effort": "high",
                "num_self_reflections": 0,
                "num_consistency_samples": 0,
            },
        },
    ]
    for kwargs in rag_unsupported:
        with pytest.raises(ValueError, match=unsupported_message):
            TrustworthyRAG(api_key=tlm_api_key, **kwargs)

0 commit comments

Comments
 (0)