cleanlab · axl1313 · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,57 @@
+# Runs type checking, format/lint checks, and the test suite through Hatch.
+name: CI
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  # Lets other workflows invoke this one as a reusable workflow.
+  workflow_call:
+
+# No job here writes to the repository, so restrict the default token to read-only access.
+permissions:
+  contents: read
+
+jobs:
+  typecheck:
+    name: Type check
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10", "3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+      # The interpreter installed here is fixed; the version being type-checked is chosen by
+      # the Hatch environment name in the run step (e.g. `types.py3.10`).
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - uses: pypa/hatch@install
+      - run: hatch run types.py${{ matrix.python }}:check
+  fmt:
+    name: Format and lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - uses: pypa/hatch@install
+      # `--check` reports problems without modifying any files.
+      - run: hatch fmt --check
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10", "3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+      - uses: pypa/hatch@install
+      # Limit the Hatch test matrix to the interpreter installed for this job and measure coverage.
+      - run: hatch test -v --cover --include python=${{ matrix.python }}
diff --git a/.github/workflows/static-analysis.yml b/.github/workflows/static-analysis.yml
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,27 +1,21 @@
 repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.6.2
+  # Local hooks that delegate formatting/linting and type checking to Hatch.
+  - repo: local
     hooks:
-      # Run the linter.
-      - id: ruff
-        name: ruff
-        files: ^.*.py$
-        args: [--fix]
-      # Run the formatter.
-      - id: ruff-format
-        name: ruff-format
-        files: ^.*.py$
-
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    # Mypy version.
-    rev: v1.10.1 # Can likely upgrade this now
-    hooks:
-      - id: mypy
-        name: mypy
-        language: system
-        files: ^.*.py$
-        entry: /bin/bash -c "uv sync --frozen --extra dev && uv run mypy tlm/ tests/"
+      # `language: system` runs the `hatch` executable already on PATH. With no
+      # `additional_dependencies`, `language: python` would only add an unused virtualenv.
+      - id: format
+        name: format
+        entry: hatch fmt
+        language: system
+        types: [python]
+        pass_filenames: false  # run the command as-is instead of appending the staged file names
+      - id: type-check
+        name: type-check
+        entry: hatch run types:check
+        language: system
+        types: [python]
+        pass_filenames: false
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.6.0

diff --git a/.python-version b/.python-version
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -0,0 +1,85 @@
+# Development
+
+This project uses the [Hatch] project manager ([installation instructions][hatch-install]).
+
+Hatch automatically manages dependencies and runs testing, type checking, and other operations in isolated [environments][hatch-environments].
+
+[Hatch]: https://hatch.pypa.io/
+[hatch-install]: https://hatch.pypa.io/latest/install/
+[hatch-environments]: https://hatch.pypa.io/latest/environment/
+
+## Testing
+
+You can run the tests on your local machine with:
+
+```bash
+hatch test
+```
+
+The [`test` command][hatch-test] supports options such as `-c` for measuring test coverage, `-a` for testing with a matrix of Python versions, and appending an argument like `tests/test_validator.py::test_validate_expert_answer` for running a single test.
+
+[hatch-test]: https://hatch.pypa.io/latest/tutorials/testing/overview/
+
+## Type checking
+
+You can run the [mypy static type checker][mypy] with:
+
+```bash
+hatch run types:check
+```
+
+[mypy]: https://mypy-lang.org/
+
+## Formatting and linting
+
+You can run the [Ruff][ruff] formatter and linter with:
+
+```bash
+hatch fmt
+```
+
+This will automatically make [safe fixes][fix-safety] to your code. If you want to only check your files without making modifications, run `hatch fmt --check`.
+
+[ruff]: https://github.com/astral-sh/ruff
+[fix-safety]: https://docs.astral.sh/ruff/linter/#fix-safety
+
+## Pre-commit
+
+You can install the pre-commit hooks to automatically run type checking, formatting, and linting on every commit.
+
+First, install [pre-commit][pre-commit], for example, with [pipx]:
+
+```bash
+pipx install pre-commit
+```
+
+Then, install the hooks:
+
+```bash
+pre-commit install
+```
+
+[pre-commit]: https://pre-commit.com/
+[pipx]: https://pipx.pypa.io/
+
+## Packaging
+
+You can use [`hatch build`][hatch-build] to create the build artifacts: a [source distribution ("sdist")][sdist] and a [built distribution ("wheel")][bdist].
+
+[hatch-build]: https://hatch.pypa.io/latest/build/
+[sdist]: https://packaging.python.org/en/latest/glossary/#term-Source-Distribution-or-sdist
+[bdist]: https://packaging.python.org/en/latest/glossary/#term-Built-Distribution
+
+### How to build and install the package locally
+
+You may want to build and install the package locally - for example, to see how your changes affect other Python code in a script or Jupyter notebook.
+
+To do this, you can build the package with `hatch build` and then install it in your local environment with `pip install dist/tlm-<version>-py3-none-any.whl`.
+
+Alternatively, you can use `pip install -e /path/to/tlm` to install the package in editable mode from your local code. Note that if you make further local changes after that, you may need to reload the module (e.g. `importlib.reload(tlm)`) or restart the kernel.
+
+## Continuous integration
+
+Tests, type checking, and formatting/linting are [checked in the CI workflow][ci-workflow].
+
+[ci-workflow]: .github/workflows/ci.yml
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ The [Trustworthy Language Model](https://cleanlab.ai/blog/trustworthy-language-m
 
 Automatically detect hallucinated/incorrect responses in: Q&A (RAG), Chatbots, Agents, Structured Outputs, Data Extraction, Tool Calling, Classification/Tagging, Data Labeling, and other LLM applications.
 
-Use TLM to: 
+Use TLM to:
 - Guardrail AI mistakes before they are served to user
 - Escalate cases where AI is untrustworthy to humans
 - Discover incorrect LLM (or human) generated outputs in datasets/logs

diff --git a/notebooks/score_prompt_examples.ipynb b/notebooks/score_prompt_examples.ipynb
@@ -28,7 +28,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "from dotenv import load_dotenv\n",
     "\n",
     "ENV_FILE_PATH = \"/Users/Kelsey/code/tlm/.env\"\n",

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,10 +4,12 @@ build-backend = "hatchling.build"
 
 [project]
 name = "tlm"
-version = "0.0.0"
+dynamic = ["version"]  # version is resolved by Hatch (see [tool.hatch.version]) rather than hard-coded here
 description = "Core functionality for TLM"
 readme = "README.md"
 requires-python = ">=3.10"
+license = "Apache-2.0"
+license-files = ["LICENSE"]
 dependencies = [
     "jsonpath-ng>=1.7.0",
     "litellm==1.71.3",
@@ -19,6 +21,9 @@ dependencies = [
     "pydantic-settings>=2.0.0",
 ]
 
+[tool.hatch.version]
+path = "tlm/__about__.py"  # file that defines `__version__`
+
 [project.optional-dependencies]
 dev = [
     "coverage>=7.6.4",
@@ -31,6 +36,22 @@ dev = [
     "pytest-recording>=0.13.2",
 ]
 
+[tool.hatch.envs.default]
+installer = "uv"  # install environment dependencies with uv
+
+[tool.hatch.envs.types]  # environment that runs the mypy `check` script below
+extra-dependencies = [
+    "mypy>=1.0.0",
+    "pytest>=8.3.3",  # presumably needed so mypy can resolve imports used in tests/ — confirm
+    "pytest-recording>=0.13.2",
+]
+
+[[tool.hatch.envs.types.matrix]]  # one environment per version; CI addresses them as `types.py<version>`
+python = ["3.10", "3.11", "3.12", "3.13"]
+
+[tool.hatch.envs.types.scripts]
+check = "mypy --install-types --non-interactive {args:tlm tests}"  # checks `tlm tests` unless other args are given. TODO: add --strict flag
+
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
@@ -40,6 +61,24 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(name)s: %(message)s"
 log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 log_level = "INFO"
 
+[tool.hatch.envs.hatch-test]  # environment used by `hatch test`
+extra-dependencies = [
+    "vcrpy>=8.0.0",
+    "pytest-asyncio>=1.3.0",
+]
+
+[tool.hatch.envs.hatch-test.env-vars]
+COVERAGE_FAIL_UNDER = ""  # empty by default, so `cov-report` passes no extra flag
+
+[tool.hatch.envs.hatch-test.overrides]  # only the Python 3.13 matrix entry gets `--fail-under=90`
+matrix.python.env-vars = [ { key = "COVERAGE_FAIL_UNDER", value = "--fail-under=90", if = [ "3.13" ] } ]
+
+[tool.hatch.envs.hatch-test.scripts]
+run = "pytest{env:HATCH_TEST_ARGS:} {args}"
+run-cov = "coverage run -m pytest{env:HATCH_TEST_ARGS:} {args}"
+cov-combine = "coverage combine"
+cov-report = "coverage report {env:COVERAGE_FAIL_UNDER}"  # NOTE(review): `fail_under = 90` is also set in the coverage config later in this file, so the threshold may apply on every version anyway — confirm
+
 [tool.coverage.run]
 omit = [
     "tests/*",
@@ -49,11 +88,18 @@ omit = [
 show_missing = true
 fail_under = 90
 
+[tool.hatch.envs.hatch-static-analysis]  # environment used by `hatch fmt`
+config-path = "none"  # presumably disables Hatch's bundled Ruff defaults so only the [tool.ruff] settings below apply — confirm
+
 [tool.ruff]
 # The maximum line length that the formatter will allow.
 line-length = 120
+extend-exclude = ["notebooks"]  # skip the notebooks directory in addition to Ruff's default excludes
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9", "F", "INP"]  # INP is Ruff's implicit-namespace-package (flake8-no-pep420) group
 # Ignore F401 (unused imports) for conftest.py files
-per-file-ignores = {"**/conftest.py" = ["F401"]}
+per-file-ignores = {"**/conftest.py" = ["F401"], "**/tests/**/*.py" = ["INP"]}  # files under tests/ are also exempt from INP
 
 [tool.isort]
 line_length = 120

diff --git a/tlm/__about__.py b/tlm/__about__.py
@@ -0,0 +1 @@
+__version__ = "0.0.0"  # package version; pyproject.toml's [tool.hatch.version] points at this file
diff --git a/tlm/components/completions/__init__.py b/tlm/components/completions/__init__.py
diff --git a/tlm/components/scores/__init__.py b/tlm/components/scores/__init__.py
diff --git a/tlm/config/__init__.py b/tlm/config/__init__.py
diff --git a/tlm/utils/__init__.py b/tlm/utils/__init__.py
diff --git a/tlm/utils/completion_utils.py b/tlm/utils/completion_utils.py
@@ -127,7 +127,7 @@ async def _generate_completion(
             failure_type = CompletionFailureType.RUNTIME_ERROR
 
         logger.error(
-            f"[{template.__class__.__name__}] error generating completion with LiteLLM: {e}\nusing litellm params: \n{litellm_params}\n{'='*100}"
+            f"[{template.__class__.__name__}] error generating completion with LiteLLM: {e}\nusing litellm params: \n{litellm_params}\n{'=' * 100}"
         )
         return CompletionFailure(type=failure_type, error=str(e))
 

diff --git a/tlm/utils/scoring/__init__.py b/tlm/utils/scoring/__init__.py