diff --git a/.github/workflows/constraints-update.yml b/.github/workflows/constraints-update.yml new file mode 100644 index 00000000..ab909bb7 --- /dev/null +++ b/.github/workflows/constraints-update.yml @@ -0,0 +1,33 @@ +name: Update constraints-dev.txt + +on: + schedule: + - cron: '0 3 * * 1' # Every Monday at 03:00 UTC + workflow_dispatch: + +jobs: + update-constraints: + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Checkout "update-constraints" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + path: ci-actions + # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet + ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main + sparse-checkout: | + actions/update-constraints + + - name: Update constraints + id: update-constraints + uses: ./ci-actions/actions/update-constraints + with: + gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index e37e869c..6b8f2b85 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -18,6 +18,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow concurrency: @@ -111,45 +112,29 @@ jobs: - name: Install ilab working-directory: ./instructlab run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64" - export PATH="$PATH:$CUDA_HOME/bin" - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - nvidia-smi - python3.11 -m pip cache remove llama_cpp_python - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - # pre-install some build dependencies - $pip_install packaging wheel setuptools-scm - - # flash-attn has a bug in the setup.py that causes pip to attempt installing it - # before torch is installed. This is a bug because their setup.py depends on - # importing the module, so it should have been listed in build_requires. Alas! - # - # See: https://github.com/Dao-AILab/flash-attention/pull/958 - # Also: https://github.com/instructlab/instructlab/issues/1821 - # - # first, pre-install flash-attn build dependencies - $pip_install torch packaging setuptools wheel psutil ninja - - # now build flash-attn using the pre-installed build dependencies; this will - # guarantee that the build version of torch will match the runtime version of - # torch; otherwise, all kinds of problems may occur, like missing symbols when - # accessing C extensions and such - $pip_install flash-attn --no-build-isolation - - CMAKE_ARGS="-DGGML_CUDA=on" $pip_install . - $pip_install .[cuda] -r requirements-vllm-cuda.txt + PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh - name: Update instructlab-eval library working-directory: ./eval run: | . ../instructlab/venv/bin/activate - pip install -v . + # Patch out our own pin from the ilab repo constraints file + ilab_constraints=../instructlab/constraints-dev.txt + sed -i '/instructlab-eval==/d' $ilab_constraints + + # Since we reuse the virtual environment prepared using ilab + # constraints, we should stick to the same constraints when + # installing latest eval. 
+ # + # FIX: this is not ideal; a proper fix would require decoupling the + # two repos in CI: either by removing the job completely and relying + # on "sdk" (no ilab) test runs; or by preparing a separate + # constraints file that would consider both the requirements files + # for the eval library AND for the ilab - so that they are + # consistent. + pip_install="pip install -c $ilab_constraints" + $pip_install . + $pip_install .[cuda] - name: Run e2e test working-directory: ./instructlab diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index d970a3d2..bc146985 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -142,44 +142,29 @@ jobs: - name: Install ilab working-directory: ./instructlab run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64" - export PATH="$PATH:$CUDA_HOME/bin" - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - nvidia-smi - python3.11 -m pip cache remove llama_cpp_python - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - # pre-install some build dependencies - $pip_install packaging wheel setuptools-scm - - # flash-attn has a bug in the setup.py that causes pip to attempt installing it - # before torch is installed. This is a bug because their setup.py depends on - # importing the module, so it should have been listed in build_requires. Alas! - # - # See: https://github.com/Dao-AILab/flash-attention/pull/958 - # Also: https://github.com/instructlab/instructlab/issues/1821 - # - # first, pre-install flash-attn build dependencies - $pip_install torch packaging setuptools wheel psutil ninja - - # now build flash-attn using the pre-installed build dependencies; this will - # guarantee that the build version of torch will match the runtime version of - # torch; otherwise, all kinds of problems may occur, like missing symbols when - # accessing C extensions and such - $pip_install flash-attn --no-build-isolation - - CMAKE_ARGS="-DGGML_CUDA=on" $pip_install . - $pip_install .[cuda] -r requirements-vllm-cuda.txt + PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh - name: Update instructlab-eval library working-directory: ./eval run: | . ../instructlab/venv/bin/activate - pip install . - pip install .[cuda] + # Patch out our own pin from the ilab repo constraints file + ilab_constraints=../instructlab/constraints-dev.txt + sed -i '/instructlab-eval==/d' $ilab_constraints + + # Since we reuse the virtual environment prepared using ilab + # constraints, we should stick to the same constraints when + # installing latest eval. + # + # FIX: this is not ideal; a proper fix would require decoupling the + # two repos in CI: either by removing the job completely and relying + # on "sdk" (no ilab) test runs; or by preparing a separate + # constraints file that would consider both the requirements files + # for the eval library AND for the ilab - so that they are + # consistent. + pip_install="pip install -c $ilab_constraints" + $pip_install . 
+ $pip_install .[cuda] - name: Check disk before tests run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 45efcbf8..f768d6ee 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,6 +11,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -23,6 +24,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -82,8 +84,9 @@ jobs: - name: Install tox run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh + pip_install="python -m pip install -c constraints-dev.txt" + $pip_install --upgrade pip + $pip_install tox tox-gh - name: "${{ matrix.lint.name }}" run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bc765089..5e82f982 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,6 +12,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - 'tox.ini' - 'scripts/*.sh' # Used by this workflow - '.github/workflows/test.yml' # This workflow @@ -23,6 +24,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - 'tox.ini' - 'scripts/*.sh' # Used by this workflow - '.github/workflows/test.yml' # This workflow @@ -99,8 +101,9 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh>=1.2 + pip_install="python -m pip install -c constraints-dev.txt" + $pip_install --upgrade pip + $pip_install tox tox-gh>=1.2 - name: Run unit and functional tests with tox run: | diff --git a/constraints-dev.txt b/constraints-dev.txt new file mode 100644 index 00000000..66a888aa --- /dev/null +++ b/constraints-dev.txt @@ -0,0 +1,237 @@ +absl-py==2.3.0 # via rouge-score +accelerate==1.7.0 # via lm-eval, peft, -r requirements.txt +aiohappyeyeballs==2.6.1 # via aiohttp +aiohttp==3.12.4 # via fsspec, langchain-community, vllm +aiosignal==1.3.2 # via aiohttp, ray +airportsdata==20250523 # via outlines +annotated-types==0.7.0 # via pydantic +antlr4-python3-runtime==4.11.0 # via latex2sympy2-extended, lm-eval +anyio==4.9.0 # via httpx, openai, starlette, watchfiles +appdirs==1.4.4 # via ragas +astor==0.8.1 # via depyf +astroid==3.3.10 # via pylint +attrs==25.3.0 # via aiohttp, jsonlines, jsonschema, referencing +beautifulsoup4==4.13.4 # via pyspelling +blake3==1.0.5 # via vllm +bracex==2.5.post1 # via wcmatch +cachetools==6.0.0 # via tox +certifi==2025.4.26 # via httpcore, httpx, requests +cfgv==3.4.0 # via pre-commit +chardet==5.2.0 # via mbstrdecoder, tox +charset-normalizer==3.4.2 # via requests +click==8.2.1 # via nltk, ray, rich-toolkit, typer, uvicorn +cloudpickle==3.1.1 # via outlines, vllm +colorama==0.4.6 # via sacrebleu, tox, tqdm-multiprocess +compressed-tensors==0.9.1 # via vllm +coverage==7.8.2 # via pytest-cov +cupy-cuda12x==13.4.1 # via ray +dataclasses-json==0.6.7 # via langchain-community +dataproperty==1.1.0 # via pytablewriter, tabledata +datasets==3.6.0 # via evaluate, lm-eval, ragas +depyf==0.18.0 # via vllm +dill==0.3.8 # via datasets, depyf, evaluate, lm-eval, multiprocess, pylint +diskcache==5.6.3 # via outlines, ragas +distlib==0.3.9 # via virtualenv +distro==1.9.0 # via openai +dnspython==2.7.0 # via email-validator +einops==0.8.1 # via vllm +email-validator==2.2.0 # via fastapi +evaluate==0.4.3 # via lm-eval +fastapi==0.115.12 # via vllm 
+fastapi-cli==0.0.7 # via fastapi +fastrlock==0.8.3 # via cupy-cuda12x +filelock==3.18.0 # via datasets, huggingface-hub, ray, torch, tox, transformers, triton, virtualenv, vllm +frozenlist==1.6.0 # via aiohttp, aiosignal, ray +fsspec==2025.3.0 # via datasets, evaluate, huggingface-hub, torch +gguf==0.10.0 # via vllm +gitdb==4.0.12 # via gitpython +gitpython==3.1.44 # via -r requirements.txt +greenlet==3.2.2 # via sqlalchemy +h11==0.16.0 # via httpcore, uvicorn +hf-xet==1.1.2 # via huggingface-hub +html5lib==1.1 # via pyspelling +httpcore==1.0.9 # via httpx +httptools==0.6.4 # via uvicorn +httpx==0.28.1 # via fastapi, langsmith, openai, -r requirements.txt +httpx-sse==0.4.0 # via langchain-community +huggingface-hub==0.32.3 # via accelerate, datasets, evaluate, peft, tokenizers, transformers +identify==2.6.12 # via pre-commit +idna==3.10 # via anyio, email-validator, httpx, requests, yarl +immutabledict==4.2.1 # via lm-eval +importlib-metadata==8.7.0 # via vllm +iniconfig==2.1.0 # via pytest +interegular==0.3.3 # via lm-format-enforcer, outlines, outlines-core +isort==6.0.1 # via pylint, -r requirements-dev.txt +jinja2==3.1.6 # via fastapi, outlines, pytest-html, torch +jiter==0.10.0 # via openai +joblib==1.5.1 # via nltk, scikit-learn +jsonlines==4.0.0 # via lm-eval +jsonpatch==1.33 # via langchain-core +jsonpointer==3.0.0 # via jsonpatch +jsonschema==4.24.0 # via mistral-common, outlines, outlines-core, ray +jsonschema-specifications==2025.4.1 # via jsonschema +langchain==0.3.25 # via langchain-community, ragas +langchain-community==0.3.24 # via ragas +langchain-core==0.3.63 # via langchain, langchain-community, langchain-openai, langchain-text-splitters, ragas +langchain-openai==0.3.18 # via ragas +langchain-text-splitters==0.3.8 # via langchain +langdetect==1.0.9 # via lm-eval +langsmith==0.3.43 # via langchain, langchain-community, langchain-core +lark==1.2.2 # via outlines, vllm +latex2sympy2-extended==1.10.1 # via math-verify +llvmlite==0.43.0 # via numba +lm-eval==0.4.8 # via -r requirements-leaderboard.txt, -r requirements.txt +lm-format-enforcer==0.10.11 # via vllm +lxml==5.4.0 # via pyspelling, sacrebleu +markdown==3.8 # via pyspelling +markdown-it-py==3.0.0 # via rich +markupsafe==3.0.2 # via jinja2 +marshmallow==3.26.1 # via dataclasses-json +math-verify==0.7.0 # via lm-eval +mbstrdecoder==1.1.4 # via dataproperty, pytablewriter, typepy +mccabe==0.7.0 # via pylint +mdurl==0.1.2 # via markdown-it-py +mistral-common==1.5.6 # via vllm +more-itertools==10.7.0 # via lm-eval +mpmath==1.3.0 # via sympy +msgpack==1.1.0 # via ray +msgspec==0.19.0 # via vllm +multidict==6.4.4 # via aiohttp, yarl +multiprocess==0.70.16 # via datasets, evaluate +mypy==1.16.0 # via -r requirements-dev.txt +mypy-extensions==1.1.0 # via mypy, typing-inspect +nest-asyncio==1.6.0 # via outlines, ragas +networkx==3.5 # via torch +nltk==3.9.1 # via lm-eval, rouge-score +nodeenv==1.9.1 # via pre-commit +numba==0.60.0 # via vllm +numexpr==2.10.2 # via lm-eval +numpy==1.26.4 # via accelerate, cupy-cuda12x, datasets, evaluate, gguf, langchain-community, mistral-common, numba, numexpr, opencv-python-headless, outlines, pandas, pandas-stubs, peft, ragas, rouge-score, sacrebleu, scikit-learn, scipy, torchvision, transformers, vllm, xformers +nvidia-cublas-cu12==12.4.5.8 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch +nvidia-cuda-cupti-cu12==12.4.127 # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 # via torch +nvidia-cuda-runtime-cu12==12.4.127 # via torch +nvidia-cudnn-cu12==9.1.0.70 # via torch 
+nvidia-cufft-cu12==11.2.1.3 # via torch +nvidia-curand-cu12==10.3.5.147 # via torch +nvidia-cusolver-cu12==11.6.1.9 # via torch +nvidia-cusparse-cu12==12.3.1.170 # via nvidia-cusolver-cu12, torch +nvidia-nccl-cu12==2.21.5 # via torch +nvidia-nvjitlink-cu12==12.4.127 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12, torch +nvidia-nvtx-cu12==12.4.127 # via torch +openai==1.82.1 # via langchain-openai, ragas, vllm, -r requirements.txt +opencv-python-headless==4.11.0.86 # via mistral-common +orjson==3.10.18 # via langsmith +outlines==0.1.11 # via vllm +outlines-core==0.1.26 # via outlines +packaging==24.2 # via accelerate, datasets, evaluate, huggingface-hub, langchain-core, langsmith, lm-format-enforcer, marshmallow, peft, pyproject-api, pytest, ray, tox, transformers, typepy +pandas==2.2.3 # via datasets, evaluate, -r requirements.txt +pandas-stubs==2.2.3.250527 # via -r requirements.txt +partial-json-parser==0.2.1.1.post5 # via vllm +pathspec==0.12.1 # via mypy +pathvalidate==3.2.3 # via pytablewriter +peft==0.15.2 # via lm-eval +pillow==11.2.1 # via mistral-common, torchvision, vllm +platformdirs==4.3.8 # via pylint, tox, virtualenv +pluggy==1.6.0 # via pytest, tox +portalocker==3.1.1 # via sacrebleu +pre-commit==4.2.0 # via -r requirements-dev.txt +prometheus-client==0.22.0 # via prometheus-fastapi-instrumentator, vllm +prometheus-fastapi-instrumentator==7.1.0 # via vllm +propcache==0.3.1 # via aiohttp, yarl +protobuf==6.31.1 # via ray, vllm +psutil==7.0.0 # via accelerate, peft, vllm, -r requirements.txt +py-cpuinfo==9.0.0 # via vllm +pyarrow==20.0.0 # via datasets +pybind11==2.13.6 # via lm-eval, xgrammar +pycountry==24.6.1 # via outlines +pydantic==2.11.5 # via compressed-tensors, fastapi, langchain, langchain-core, langsmith, lm-format-enforcer, mistral-common, openai, outlines, pydantic-settings, pylint-pydantic, ragas, vllm, xgrammar +pydantic-core==2.33.2 # via pydantic +pydantic-settings==2.9.1 # via langchain-community +pygments==2.19.1 # via rich +pylint==3.3.7 # via pylint-plugin-utils, pylint-pydantic, -r requirements-dev.txt +pylint-plugin-utils==0.8.2 # via pylint-pydantic +pylint-pydantic==0.3.5 # via -r requirements-dev.txt +pyproject-api==1.9.0 # via tox +pyspelling==2.10 # via -r requirements-dev.txt +pytablewriter==1.2.1 # via lm-eval +pytest==8.3.5 # via pytest-asyncio, pytest-cov, pytest-html, pytest-metadata, xgrammar, -r requirements-dev.txt +pytest-asyncio==1.0.0 # via -r requirements-dev.txt +pytest-cov==6.1.1 # via -r requirements-dev.txt +pytest-html==4.1.1 # via -r requirements-dev.txt +pytest-metadata==3.1.1 # via pytest-html +python-dateutil==2.9.0.post0 # via pandas, typepy +python-dotenv==1.1.0 # via pydantic-settings, uvicorn +python-multipart==0.0.20 # via fastapi +pytz==2025.2 # via pandas, typepy +pyyaml==6.0.2 # via accelerate, datasets, gguf, huggingface-hub, langchain, langchain-community, langchain-core, lm-format-enforcer, peft, pre-commit, pyspelling, ray, transformers, uvicorn, vllm +pyzmq==26.4.0 # via vllm +ragas==0.2.15 # via -r requirements.txt +ray==2.40.0 # via vllm +referencing==0.36.2 # via jsonschema, jsonschema-specifications, outlines +regex==2024.11.6 # via nltk, sacrebleu, tiktoken, transformers +requests==2.32.3 # via datasets, evaluate, huggingface-hub, langchain, langchain-community, langsmith, mistral-common, outlines, ray, requests-toolbelt, tiktoken, transformers, vllm +requests-toolbelt==1.0.0 # via langsmith +rich==14.0.0 # via rich-toolkit, typer +rich-toolkit==0.14.7 # via fastapi-cli +rouge-score==0.1.2 # via lm-eval 
+rpds-py==0.25.1 # via jsonschema, referencing +ruff==0.11.12 # via -r requirements-dev.txt +sacrebleu==2.5.1 # via lm-eval +safetensors==0.5.3 # via accelerate, peft, transformers +scikit-learn==1.6.1 # via lm-eval +scipy==1.15.3 # via scikit-learn +sentencepiece==0.2.0 # via lm-eval, mistral-common, vllm, xgrammar +setuptools==80.9.0 # via pytablewriter +shellingham==1.5.4 # via typer +shortuuid==1.0.13 # via -r requirements.txt +six==1.17.0 # via html5lib, langdetect, python-dateutil, rouge-score +smmap==5.0.2 # via gitdb +sniffio==1.3.1 # via anyio, openai +soupsieve==2.7 # via beautifulsoup4, pyspelling +sqlalchemy==2.0.41 # via langchain, langchain-community +sqlitedict==2.1.0 # via lm-eval +starlette==0.46.2 # via fastapi, prometheus-fastapi-instrumentator +sympy==1.13.1 # via latex2sympy2-extended, lm-eval, torch +tabledata==1.3.4 # via pytablewriter +tabulate==0.9.0 # via sacrebleu +tcolorpy==0.1.7 # via pytablewriter +tenacity==9.1.2 # via langchain-community, langchain-core +threadpoolctl==3.6.0 # via scikit-learn +tiktoken==0.9.0 # via langchain-openai, mistral-common, ragas, vllm, xgrammar +tokenizers==0.21.1 # via transformers, vllm +tomlkit==0.13.2 # via pylint +torch==2.5.1 # via accelerate, compressed-tensors, lm-eval, outlines, peft, torchaudio, torchvision, vllm, xformers, xgrammar, -r requirements-leaderboard.txt, -r requirements.txt +torchaudio==2.5.1 # via vllm +torchvision==0.20.1 # via vllm +tox==4.26.0 # via -r requirements-dev.txt +tqdm==4.67.1 # via datasets, evaluate, gguf, huggingface-hub, nltk, openai, outlines, peft, tqdm-multiprocess, transformers, vllm +tqdm-multiprocess==0.0.11 # via lm-eval +transformers==4.52.4 # via compressed-tensors, lm-eval, peft, vllm, xgrammar, -r requirements.txt +triton==3.1.0 # via torch +typepy==1.3.4 # via dataproperty, pytablewriter, tabledata +typer==0.16.0 # via fastapi-cli +types-pytz==2025.2.0.20250516 # via pandas-stubs +types-pyyaml==6.0.12.20250516 # via -r requirements-dev.txt +types-requests==2.32.0.20250515 # via types-tqdm +types-tqdm==4.67.0.20250516 # via -r requirements-dev.txt +typing-extensions==4.13.2 # via anyio, beautifulsoup4, fastapi, huggingface-hub, langchain-core, mistral-common, mypy, openai, outlines, pydantic, pydantic-core, referencing, rich-toolkit, sqlalchemy, torch, typer, typing-inspect, typing-inspection, vllm +typing-inspect==0.9.0 # via dataclasses-json +typing-inspection==0.4.1 # via pydantic, pydantic-settings +tzdata==2025.2 # via pandas +urllib3==2.4.0 # via requests, types-requests +uvicorn==0.34.2 # via fastapi, fastapi-cli +uvloop==0.21.0 # via uvicorn +virtualenv==20.31.2 # via pre-commit, tox +vllm==0.7.3 # via lm-eval, -r requirements-leaderboard.txt +watchfiles==1.0.5 # via uvicorn +wcmatch==10.0 # via pyspelling +webencodings==0.5.1 # via html5lib +websockets==15.0.1 # via uvicorn +word2number==1.1 # via lm-eval +xformers==0.0.28.post3 # via vllm +xgrammar==0.1.11 # via vllm +xxhash==3.5.0 # via datasets, evaluate +yarl==1.20.0 # via aiohttp +zipp==3.22.0 # via importlib-metadata +zstandard==0.23.0 # via langsmith, lm-eval diff --git a/constraints-dev.txt.in b/constraints-dev.txt.in new file mode 100644 index 00000000..e69de29b diff --git a/requirements-dev.txt b/requirements-dev.txt index 2c99749f..e1acce29 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,20 @@ -r requirements.txt -pre-commit>=3.0.4,<5.0 -pylint>=2.16.2,<4.0 +pre-commit>=3.0.4 +pylint>=2.16.2 pylint-pydantic -tox>=4.4.2,<5 +tox>=4.4.2 + +pytest +pytest-asyncio +pytest-cov +pytest-html + 
+ruff +isort +pyspelling + +mypy>=1.10.0 +types-tqdm +types-PyYAML diff --git a/src/instructlab/eval/leaderboard.py b/src/instructlab/eval/leaderboard.py index ff2145ae..9913157b 100644 --- a/src/instructlab/eval/leaderboard.py +++ b/src/instructlab/eval/leaderboard.py @@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]: p.join() # extract the result which is not None - assert ( - len([res for res in results.values() if res is not None]) == 1 - ), "we expect exactly 1 process to return a results dict properly" + assert len([res for res in results.values() if res is not None]) == 1, ( + "we expect exactly 1 process to return a results dict properly" + ) results_dict = [res for res in results.values() if res is not None][0] return results_dict @@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm" ) - assert ( - len(parsed_scores["subtasks"]) == 24 - ), "there should be 24 subtasks of bbh run" + assert len(parsed_scores["subtasks"]) == 24, ( + "there should be 24 subtasks of bbh run" + ) return parsed_scores @@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores: scores.append(value) target_metrics.remove(metric) - assert ( - len(scores) == 2 - ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}" + assert len(scores) == 2, ( + f"there should only be 2 values extracted in ifeval, got: {len(scores)}" + ) return { "score": sum(scores) / 2, } @@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm" ) - assert ( - len(parsed_scores["subtasks"]) == 3 - ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}" + assert len(parsed_scores["subtasks"]) == 3, ( + f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}" + ) return parsed_scores @@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match" ) - assert ( - len(parsed_scores["subtasks"]) == 7 - ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}" + assert len(parsed_scores["subtasks"]) == 7, ( + f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}" + ) return parsed_scores @@ -463,9 +463,9 @@ def get_scores_from_result_dicts( # this is just a sanity check step benchmarks_already_covered = set(parsed_scores.keys()) overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse - assert ( - len(benchmarks_already_covered & benchmarks_to_parse) == 0 - ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}" + assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, ( + f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}" + ) # now actually add them for benchmark in benchmarks_to_parse: diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py index f3f80683..560bb8da 100644 --- a/src/instructlab/eval/mt_bench_common.py +++ b/src/instructlab/eval/mt_bench_common.py @@ -346,9 +346,9 @@ def check_data(questions, model_answers, ref_answers, models, judges): assert m in model_answers, f"Missing model answer for {m}" m_answer = model_answers[m] 
for q in questions: - assert ( - q["question_id"] in m_answer - ), f"Missing model {m}'s answer to Question {q['question_id']}" + assert q["question_id"] in m_answer, ( + f"Missing model {m}'s answer to Question {q['question_id']}" + ) # check ref answers for jg in judges.values(): if not jg.ref_based: @@ -356,9 +356,9 @@ def check_data(questions, model_answers, ref_answers, models, judges): for q in questions: if q["category"] not in NEED_REF_CATS: continue - assert ( - q["question_id"] in ref_answers[jg.model_name] - ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + assert q["question_id"] in ref_answers[jg.model_name], ( + f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + ) def get_model_list(answer_file): diff --git a/src/instructlab/eval/mt_bench_conversation.py b/src/instructlab/eval/mt_bench_conversation.py index 05585b90..caaadfa5 100644 --- a/src/instructlab/eval/mt_bench_conversation.py +++ b/src/instructlab/eval/mt_bench_conversation.py @@ -116,9 +116,9 @@ def dict(self): def register_conv_template(template: Conversation, override: bool = False): """Register a new conversation template.""" if not override: - assert ( - template.name not in conv_templates - ), f"{template.name} has been registered." + assert template.name not in conv_templates, ( + f"{template.name} has been registered." + ) conv_templates[template.name] = template diff --git a/tests/test_mt_bench.py b/tests/test_mt_bench.py index ec767c94..57a9d861 100644 --- a/tests/test_mt_bench.py +++ b/tests/test_mt_bench.py @@ -43,7 +43,7 @@ def gen_qa_pairs(odd): { "question_id": i + 1, "score": 0.6, - "qna_file": f"category{i+1}/qna.yaml", + "qna_file": f"category{i + 1}/qna.yaml", } ) return qa_pairs diff --git a/tox.ini b/tox.ini index c1337d4b..a50c2ade 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,10 @@ setenv = ILAB_MAX_STABLE_VRAM_WAIT=0 package = wheel wheel_build_env = pkg +install_command = pip install \ + --use-feature fast-deps \ + -c constraints-dev.txt \ + {opts} {packages} # equivalent to `pip install instructlab[cpu]` extras = cpu @@ -60,8 +64,8 @@ skip_install = True skipsdist = true # keep in sync with .pre-commit-config.yaml deps = - ruff==0.3.4 - isort==5.11.5 + ruff + isort # supports 'fix', 'check', or abitrary args to 'ruff' command commands = ./scripts/ruff.sh {posargs:fix} @@ -81,7 +85,7 @@ allowlist_externals = sh [testenv:mypy] description = Python type checking with mypy deps = - mypy>=1.10.0,<2.0 + mypy types-tqdm types-PyYAML pytest @@ -103,3 +107,13 @@ passenv = [gh] python = 3.11 = py311-{unitcov, functional} + +[testenv:constraints] +description = Generate new constraints file(s) +basepython = {[testenv:py3]basepython} +skip_install = True +skipsdist = true +deps = + uv==0.7.8 +commands = {posargs} +allowlist_externals = *
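
The new `[testenv:constraints]` environment installs only `uv` and runs whatever command is passed through `{posargs}`; the scheduled `constraints-update.yml` workflow delegates the actual pin refresh to the in-house `update-constraints` action. A minimal local sketch of how the file could be regenerated follows — the exact compile arguments are an assumption inferred from the `-r requirements*.txt` annotations inside `constraints-dev.txt`, not something this diff pins down:

```sh
# Hypothetical local invocation; the authoritative arguments live in the
# instructlab/ci-actions "update-constraints" action, so the input file list
# below is an assumption based on the annotations in constraints-dev.txt.
tox -e constraints -- uv pip compile \
    --output-file constraints-dev.txt \
    constraints-dev.txt.in \
    requirements.txt \
    requirements-dev.txt \
    requirements-leaderboard.txt
```

The otherwise empty `constraints-dev.txt.in` seed file presumably gives maintainers a place to add hand-written pins or exclusions that the compile step picks up, without editing the generated `constraints-dev.txt` directly.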
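
Both e2e workflows now replace their inline CUDA setup with `PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh`, a script that lives in the checked-out `instructlab` repository and is not part of this diff. A rough sketch of what that script is expected to do, reconstructed from the inline steps removed above; the real script may differ:

```sh
#!/usr/bin/env bash
# Sketch only: pieced together from the workflow steps this diff removes;
# the actual scripts/install-ilab-with-cuda.sh ships in the instructlab repo.
set -euo pipefail

PYTHON="${PYTHON:-python3.11}"

export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"

"$PYTHON" -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
"$PYTHON" -m pip cache remove llama_cpp_python

pip_install="$PYTHON -m pip install -v -c constraints-dev.txt"

# Pre-install build dependencies, including the ones flash-attn needs at
# build time (its setup.py imports torch before declaring the dependency).
$pip_install packaging wheel setuptools-scm
$pip_install torch packaging setuptools wheel psutil ninja

# Build flash-attn against the torch version installed above so the build-time
# and runtime torch match.
$pip_install flash-attn --no-build-isolation

# Finally install ilab itself with CUDA-enabled llama.cpp and vLLM extras.
CMAKE_ARGS="-DGGML_CUDA=on" $pip_install .
$pip_install .[cuda] -r requirements-vllm-cuda.txt
```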