diff --git a/.github/workflows/constraints-update.yml b/.github/workflows/constraints-update.yml new file mode 100644 index 00000000..ab909bb7 --- /dev/null +++ b/.github/workflows/constraints-update.yml @@ -0,0 +1,33 @@ +name: Update constraints-dev.txt + +on: + schedule: + - cron: '0 3 * * 1' # Every Monday at 03:00 UTC + workflow_dispatch: + +jobs: + update-constraints: + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Checkout "update-constraints" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + path: ci-actions + # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet + ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main + sparse-checkout: | + actions/update-constraints + + - name: Update constraints + id: update-constraints + uses: ./ci-actions/actions/update-constraints + with: + gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index e37e869c..6b8f2b85 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -18,6 +18,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow concurrency: @@ -111,45 +112,29 @@ jobs: - name: Install ilab working-directory: ./instructlab run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64" - export PATH="$PATH:$CUDA_HOME/bin" - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - nvidia-smi - python3.11 -m pip cache remove llama_cpp_python - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - # pre-install some build dependencies - $pip_install packaging wheel setuptools-scm - - # flash-attn has a bug in the setup.py that causes pip to attempt installing it - # before torch is installed. This is a bug because their setup.py depends on - # importing the module, so it should have been listed in build_requires. Alas! - # - # See: https://github.com/Dao-AILab/flash-attention/pull/958 - # Also: https://github.com/instructlab/instructlab/issues/1821 - # - # first, pre-install flash-attn build dependencies - $pip_install torch packaging setuptools wheel psutil ninja - - # now build flash-attn using the pre-installed build dependencies; this will - # guarantee that the build version of torch will match the runtime version of - # torch; otherwise, all kinds of problems may occur, like missing symbols when - # accessing C extensions and such - $pip_install flash-attn --no-build-isolation - - CMAKE_ARGS="-DGGML_CUDA=on" $pip_install . - $pip_install .[cuda] -r requirements-vllm-cuda.txt + PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh - name: Update instructlab-eval library working-directory: ./eval run: | . ../instructlab/venv/bin/activate - pip install -v . + # Patch out our own pin from the ilab repo constraints file + ilab_constraints=../instructlab/constraints-dev.txt + sed -i '/instructlab-eval==/d' $ilab_constraints + + # Since we reuse the virtual environment prepared using ilab + # constraints, we should stick to the same constraints when + # installing latest eval. 
+ # + # FIX: this is not ideal; a proper fix would require decoupling the + # two repos in CI: either by removing the job completely and relying + # on "sdk" (no ilab) test runs; or by preparing a separate + # constraints file that would consider both the requirements files + # for the eval library AND for the ilab - so that they are + # consistent. + pip_install="pip install -c $ilab_constraints" + $pip_install . + $pip_install .[cuda] - name: Run e2e test working-directory: ./instructlab diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index d970a3d2..bc146985 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -142,44 +142,29 @@ jobs: - name: Install ilab working-directory: ./instructlab run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64" - export PATH="$PATH:$CUDA_HOME/bin" - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - nvidia-smi - python3.11 -m pip cache remove llama_cpp_python - - pip_install="python3.11 -m pip install -v -c constraints-dev.txt" - - # pre-install some build dependencies - $pip_install packaging wheel setuptools-scm - - # flash-attn has a bug in the setup.py that causes pip to attempt installing it - # before torch is installed. This is a bug because their setup.py depends on - # importing the module, so it should have been listed in build_requires. Alas! - # - # See: https://github.com/Dao-AILab/flash-attention/pull/958 - # Also: https://github.com/instructlab/instructlab/issues/1821 - # - # first, pre-install flash-attn build dependencies - $pip_install torch packaging setuptools wheel psutil ninja - - # now build flash-attn using the pre-installed build dependencies; this will - # guarantee that the build version of torch will match the runtime version of - # torch; otherwise, all kinds of problems may occur, like missing symbols when - # accessing C extensions and such - $pip_install flash-attn --no-build-isolation - - CMAKE_ARGS="-DGGML_CUDA=on" $pip_install . - $pip_install .[cuda] -r requirements-vllm-cuda.txt + PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh - name: Update instructlab-eval library working-directory: ./eval run: | . ../instructlab/venv/bin/activate - pip install . - pip install .[cuda] + # Patch out our own pin from the ilab repo constraints file + ilab_constraints=../instructlab/constraints-dev.txt + sed -i '/instructlab-eval==/d' $ilab_constraints + + # Since we reuse the virtual environment prepared using ilab + # constraints, we should stick to the same constraints when + # installing latest eval. + # + # FIX: this is not ideal; a proper fix would require decoupling the + # two repos in CI: either by removing the job completely and relying + # on "sdk" (no ilab) test runs; or by preparing a separate + # constraints file that would consider both the requirements files + # for the eval library AND for the ilab - so that they are + # consistent. + pip_install="pip install -c $ilab_constraints" + $pip_install . 
+ $pip_install .[cuda] - name: Check disk before tests run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 45efcbf8..f768d6ee 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,6 +11,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -23,6 +24,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -82,8 +84,9 @@ jobs: - name: Install tox run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh + pip_install="python -m pip install -c constraints-dev.txt" + $pip_install --upgrade pip + $pip_install tox tox-gh - name: "${{ matrix.lint.name }}" run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bc765089..5e82f982 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,6 +12,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - 'tox.ini' - 'scripts/*.sh' # Used by this workflow - '.github/workflows/test.yml' # This workflow @@ -23,6 +24,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' + - 'constraints-dev.txt' - 'tox.ini' - 'scripts/*.sh' # Used by this workflow - '.github/workflows/test.yml' # This workflow @@ -99,8 +101,9 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh>=1.2 + pip_install="python -m pip install -c constraints-dev.txt" + $pip_install --upgrade pip + $pip_install tox tox-gh>=1.2 - name: Run unit and functional tests with tox run: | diff --git a/constraints-dev.txt b/constraints-dev.txt new file mode 100644 index 00000000..66a888aa --- /dev/null +++ b/constraints-dev.txt @@ -0,0 +1,237 @@ +absl-py==2.3.0 # via rouge-score +accelerate==1.7.0 # via lm-eval, peft, -r requirements.txt +aiohappyeyeballs==2.6.1 # via aiohttp +aiohttp==3.12.4 # via fsspec, langchain-community, vllm +aiosignal==1.3.2 # via aiohttp, ray +airportsdata==20250523 # via outlines +annotated-types==0.7.0 # via pydantic +antlr4-python3-runtime==4.11.0 # via latex2sympy2-extended, lm-eval +anyio==4.9.0 # via httpx, openai, starlette, watchfiles +appdirs==1.4.4 # via ragas +astor==0.8.1 # via depyf +astroid==3.3.10 # via pylint +attrs==25.3.0 # via aiohttp, jsonlines, jsonschema, referencing +beautifulsoup4==4.13.4 # via pyspelling +blake3==1.0.5 # via vllm +bracex==2.5.post1 # via wcmatch +cachetools==6.0.0 # via tox +certifi==2025.4.26 # via httpcore, httpx, requests +cfgv==3.4.0 # via pre-commit +chardet==5.2.0 # via mbstrdecoder, tox +charset-normalizer==3.4.2 # via requests +click==8.2.1 # via nltk, ray, rich-toolkit, typer, uvicorn +cloudpickle==3.1.1 # via outlines, vllm +colorama==0.4.6 # via sacrebleu, tox, tqdm-multiprocess +compressed-tensors==0.9.1 # via vllm +coverage==7.8.2 # via pytest-cov +cupy-cuda12x==13.4.1 # via ray +dataclasses-json==0.6.7 # via langchain-community +dataproperty==1.1.0 # via pytablewriter, tabledata +datasets==3.6.0 # via evaluate, lm-eval, ragas +depyf==0.18.0 # via vllm +dill==0.3.8 # via datasets, depyf, evaluate, lm-eval, multiprocess, pylint +diskcache==5.6.3 # via outlines, ragas +distlib==0.3.9 # via virtualenv +distro==1.9.0 # via openai +dnspython==2.7.0 # via email-validator +einops==0.8.1 # via vllm +email-validator==2.2.0 # via fastapi +evaluate==0.4.3 # via lm-eval +fastapi==0.115.12 # via vllm 
+fastapi-cli==0.0.7 # via fastapi +fastrlock==0.8.3 # via cupy-cuda12x +filelock==3.18.0 # via datasets, huggingface-hub, ray, torch, tox, transformers, triton, virtualenv, vllm +frozenlist==1.6.0 # via aiohttp, aiosignal, ray +fsspec==2025.3.0 # via datasets, evaluate, huggingface-hub, torch +gguf==0.10.0 # via vllm +gitdb==4.0.12 # via gitpython +gitpython==3.1.44 # via -r requirements.txt +greenlet==3.2.2 # via sqlalchemy +h11==0.16.0 # via httpcore, uvicorn +hf-xet==1.1.2 # via huggingface-hub +html5lib==1.1 # via pyspelling +httpcore==1.0.9 # via httpx +httptools==0.6.4 # via uvicorn +httpx==0.28.1 # via fastapi, langsmith, openai, -r requirements.txt +httpx-sse==0.4.0 # via langchain-community +huggingface-hub==0.32.3 # via accelerate, datasets, evaluate, peft, tokenizers, transformers +identify==2.6.12 # via pre-commit +idna==3.10 # via anyio, email-validator, httpx, requests, yarl +immutabledict==4.2.1 # via lm-eval +importlib-metadata==8.7.0 # via vllm +iniconfig==2.1.0 # via pytest +interegular==0.3.3 # via lm-format-enforcer, outlines, outlines-core +isort==6.0.1 # via pylint, -r requirements-dev.txt +jinja2==3.1.6 # via fastapi, outlines, pytest-html, torch +jiter==0.10.0 # via openai +joblib==1.5.1 # via nltk, scikit-learn +jsonlines==4.0.0 # via lm-eval +jsonpatch==1.33 # via langchain-core +jsonpointer==3.0.0 # via jsonpatch +jsonschema==4.24.0 # via mistral-common, outlines, outlines-core, ray +jsonschema-specifications==2025.4.1 # via jsonschema +langchain==0.3.25 # via langchain-community, ragas +langchain-community==0.3.24 # via ragas +langchain-core==0.3.63 # via langchain, langchain-community, langchain-openai, langchain-text-splitters, ragas +langchain-openai==0.3.18 # via ragas +langchain-text-splitters==0.3.8 # via langchain +langdetect==1.0.9 # via lm-eval +langsmith==0.3.43 # via langchain, langchain-community, langchain-core +lark==1.2.2 # via outlines, vllm +latex2sympy2-extended==1.10.1 # via math-verify +llvmlite==0.43.0 # via numba +lm-eval==0.4.8 # via -r requirements-leaderboard.txt, -r requirements.txt +lm-format-enforcer==0.10.11 # via vllm +lxml==5.4.0 # via pyspelling, sacrebleu +markdown==3.8 # via pyspelling +markdown-it-py==3.0.0 # via rich +markupsafe==3.0.2 # via jinja2 +marshmallow==3.26.1 # via dataclasses-json +math-verify==0.7.0 # via lm-eval +mbstrdecoder==1.1.4 # via dataproperty, pytablewriter, typepy +mccabe==0.7.0 # via pylint +mdurl==0.1.2 # via markdown-it-py +mistral-common==1.5.6 # via vllm +more-itertools==10.7.0 # via lm-eval +mpmath==1.3.0 # via sympy +msgpack==1.1.0 # via ray +msgspec==0.19.0 # via vllm +multidict==6.4.4 # via aiohttp, yarl +multiprocess==0.70.16 # via datasets, evaluate +mypy==1.16.0 # via -r requirements-dev.txt +mypy-extensions==1.1.0 # via mypy, typing-inspect +nest-asyncio==1.6.0 # via outlines, ragas +networkx==3.5 # via torch +nltk==3.9.1 # via lm-eval, rouge-score +nodeenv==1.9.1 # via pre-commit +numba==0.60.0 # via vllm +numexpr==2.10.2 # via lm-eval +numpy==1.26.4 # via accelerate, cupy-cuda12x, datasets, evaluate, gguf, langchain-community, mistral-common, numba, numexpr, opencv-python-headless, outlines, pandas, pandas-stubs, peft, ragas, rouge-score, sacrebleu, scikit-learn, scipy, torchvision, transformers, vllm, xformers +nvidia-cublas-cu12==12.4.5.8 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch +nvidia-cuda-cupti-cu12==12.4.127 # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 # via torch +nvidia-cuda-runtime-cu12==12.4.127 # via torch +nvidia-cudnn-cu12==9.1.0.70 # via torch 
+nvidia-cufft-cu12==11.2.1.3 # via torch +nvidia-curand-cu12==10.3.5.147 # via torch +nvidia-cusolver-cu12==11.6.1.9 # via torch +nvidia-cusparse-cu12==12.3.1.170 # via nvidia-cusolver-cu12, torch +nvidia-nccl-cu12==2.21.5 # via torch +nvidia-nvjitlink-cu12==12.4.127 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12, torch +nvidia-nvtx-cu12==12.4.127 # via torch +openai==1.82.1 # via langchain-openai, ragas, vllm, -r requirements.txt +opencv-python-headless==4.11.0.86 # via mistral-common +orjson==3.10.18 # via langsmith +outlines==0.1.11 # via vllm +outlines-core==0.1.26 # via outlines +packaging==24.2 # via accelerate, datasets, evaluate, huggingface-hub, langchain-core, langsmith, lm-format-enforcer, marshmallow, peft, pyproject-api, pytest, ray, tox, transformers, typepy +pandas==2.2.3 # via datasets, evaluate, -r requirements.txt +pandas-stubs==2.2.3.250527 # via -r requirements.txt +partial-json-parser==0.2.1.1.post5 # via vllm +pathspec==0.12.1 # via mypy +pathvalidate==3.2.3 # via pytablewriter +peft==0.15.2 # via lm-eval +pillow==11.2.1 # via mistral-common, torchvision, vllm +platformdirs==4.3.8 # via pylint, tox, virtualenv +pluggy==1.6.0 # via pytest, tox +portalocker==3.1.1 # via sacrebleu +pre-commit==4.2.0 # via -r requirements-dev.txt +prometheus-client==0.22.0 # via prometheus-fastapi-instrumentator, vllm +prometheus-fastapi-instrumentator==7.1.0 # via vllm +propcache==0.3.1 # via aiohttp, yarl +protobuf==6.31.1 # via ray, vllm +psutil==7.0.0 # via accelerate, peft, vllm, -r requirements.txt +py-cpuinfo==9.0.0 # via vllm +pyarrow==20.0.0 # via datasets +pybind11==2.13.6 # via lm-eval, xgrammar +pycountry==24.6.1 # via outlines +pydantic==2.11.5 # via compressed-tensors, fastapi, langchain, langchain-core, langsmith, lm-format-enforcer, mistral-common, openai, outlines, pydantic-settings, pylint-pydantic, ragas, vllm, xgrammar +pydantic-core==2.33.2 # via pydantic +pydantic-settings==2.9.1 # via langchain-community +pygments==2.19.1 # via rich +pylint==3.3.7 # via pylint-plugin-utils, pylint-pydantic, -r requirements-dev.txt +pylint-plugin-utils==0.8.2 # via pylint-pydantic +pylint-pydantic==0.3.5 # via -r requirements-dev.txt +pyproject-api==1.9.0 # via tox +pyspelling==2.10 # via -r requirements-dev.txt +pytablewriter==1.2.1 # via lm-eval +pytest==8.3.5 # via pytest-asyncio, pytest-cov, pytest-html, pytest-metadata, xgrammar, -r requirements-dev.txt +pytest-asyncio==1.0.0 # via -r requirements-dev.txt +pytest-cov==6.1.1 # via -r requirements-dev.txt +pytest-html==4.1.1 # via -r requirements-dev.txt +pytest-metadata==3.1.1 # via pytest-html +python-dateutil==2.9.0.post0 # via pandas, typepy +python-dotenv==1.1.0 # via pydantic-settings, uvicorn +python-multipart==0.0.20 # via fastapi +pytz==2025.2 # via pandas, typepy +pyyaml==6.0.2 # via accelerate, datasets, gguf, huggingface-hub, langchain, langchain-community, langchain-core, lm-format-enforcer, peft, pre-commit, pyspelling, ray, transformers, uvicorn, vllm +pyzmq==26.4.0 # via vllm +ragas==0.2.15 # via -r requirements.txt +ray==2.40.0 # via vllm +referencing==0.36.2 # via jsonschema, jsonschema-specifications, outlines +regex==2024.11.6 # via nltk, sacrebleu, tiktoken, transformers +requests==2.32.3 # via datasets, evaluate, huggingface-hub, langchain, langchain-community, langsmith, mistral-common, outlines, ray, requests-toolbelt, tiktoken, transformers, vllm +requests-toolbelt==1.0.0 # via langsmith +rich==14.0.0 # via rich-toolkit, typer +rich-toolkit==0.14.7 # via fastapi-cli +rouge-score==0.1.2 # via lm-eval 
+rpds-py==0.25.1 # via jsonschema, referencing +ruff==0.11.12 # via -r requirements-dev.txt +sacrebleu==2.5.1 # via lm-eval +safetensors==0.5.3 # via accelerate, peft, transformers +scikit-learn==1.6.1 # via lm-eval +scipy==1.15.3 # via scikit-learn +sentencepiece==0.2.0 # via lm-eval, mistral-common, vllm, xgrammar +setuptools==80.9.0 # via pytablewriter +shellingham==1.5.4 # via typer +shortuuid==1.0.13 # via -r requirements.txt +six==1.17.0 # via html5lib, langdetect, python-dateutil, rouge-score +smmap==5.0.2 # via gitdb +sniffio==1.3.1 # via anyio, openai +soupsieve==2.7 # via beautifulsoup4, pyspelling +sqlalchemy==2.0.41 # via langchain, langchain-community +sqlitedict==2.1.0 # via lm-eval +starlette==0.46.2 # via fastapi, prometheus-fastapi-instrumentator +sympy==1.13.1 # via latex2sympy2-extended, lm-eval, torch +tabledata==1.3.4 # via pytablewriter +tabulate==0.9.0 # via sacrebleu +tcolorpy==0.1.7 # via pytablewriter +tenacity==9.1.2 # via langchain-community, langchain-core +threadpoolctl==3.6.0 # via scikit-learn +tiktoken==0.9.0 # via langchain-openai, mistral-common, ragas, vllm, xgrammar +tokenizers==0.21.1 # via transformers, vllm +tomlkit==0.13.2 # via pylint +torch==2.5.1 # via accelerate, compressed-tensors, lm-eval, outlines, peft, torchaudio, torchvision, vllm, xformers, xgrammar, -r requirements-leaderboard.txt, -r requirements.txt +torchaudio==2.5.1 # via vllm +torchvision==0.20.1 # via vllm +tox==4.26.0 # via -r requirements-dev.txt +tqdm==4.67.1 # via datasets, evaluate, gguf, huggingface-hub, nltk, openai, outlines, peft, tqdm-multiprocess, transformers, vllm +tqdm-multiprocess==0.0.11 # via lm-eval +transformers==4.52.4 # via compressed-tensors, lm-eval, peft, vllm, xgrammar, -r requirements.txt +triton==3.1.0 # via torch +typepy==1.3.4 # via dataproperty, pytablewriter, tabledata +typer==0.16.0 # via fastapi-cli +types-pytz==2025.2.0.20250516 # via pandas-stubs +types-pyyaml==6.0.12.20250516 # via -r requirements-dev.txt +types-requests==2.32.0.20250515 # via types-tqdm +types-tqdm==4.67.0.20250516 # via -r requirements-dev.txt +typing-extensions==4.13.2 # via anyio, beautifulsoup4, fastapi, huggingface-hub, langchain-core, mistral-common, mypy, openai, outlines, pydantic, pydantic-core, referencing, rich-toolkit, sqlalchemy, torch, typer, typing-inspect, typing-inspection, vllm +typing-inspect==0.9.0 # via dataclasses-json +typing-inspection==0.4.1 # via pydantic, pydantic-settings +tzdata==2025.2 # via pandas +urllib3==2.4.0 # via requests, types-requests +uvicorn==0.34.2 # via fastapi, fastapi-cli +uvloop==0.21.0 # via uvicorn +virtualenv==20.31.2 # via pre-commit, tox +vllm==0.7.3 # via lm-eval, -r requirements-leaderboard.txt +watchfiles==1.0.5 # via uvicorn +wcmatch==10.0 # via pyspelling +webencodings==0.5.1 # via html5lib +websockets==15.0.1 # via uvicorn +word2number==1.1 # via lm-eval +xformers==0.0.28.post3 # via vllm +xgrammar==0.1.11 # via vllm +xxhash==3.5.0 # via datasets, evaluate +yarl==1.20.0 # via aiohttp +zipp==3.22.0 # via importlib-metadata +zstandard==0.23.0 # via langsmith, lm-eval diff --git a/constraints-dev.txt.in b/constraints-dev.txt.in new file mode 100644 index 00000000..e69de29b diff --git a/requirements-dev.txt b/requirements-dev.txt index 2c99749f..e1acce29 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,20 @@ -r requirements.txt -pre-commit>=3.0.4,<5.0 -pylint>=2.16.2,<4.0 +pre-commit>=3.0.4 +pylint>=2.16.2 pylint-pydantic -tox>=4.4.2,<5 +tox>=4.4.2 + +pytest +pytest-asyncio +pytest-cov +pytest-html + 
+ruff +isort +pyspelling + +mypy>=1.10.0 +types-tqdm +types-PyYAML diff --git a/src/instructlab/eval/leaderboard.py b/src/instructlab/eval/leaderboard.py index ff2145ae..9913157b 100644 --- a/src/instructlab/eval/leaderboard.py +++ b/src/instructlab/eval/leaderboard.py @@ -234,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]: p.join() # extract the result which is not None - assert ( - len([res for res in results.values() if res is not None]) == 1 - ), "we expect exactly 1 process to return a results dict properly" + assert len([res for res in results.values() if res is not None]) == 1, ( + "we expect exactly 1 process to return a results dict properly" + ) results_dict = [res for res in results.values() if res is not None][0] return results_dict @@ -302,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm" ) - assert ( - len(parsed_scores["subtasks"]) == 24 - ), "there should be 24 subtasks of bbh run" + assert len(parsed_scores["subtasks"]) == 24, ( + "there should be 24 subtasks of bbh run" + ) return parsed_scores @@ -355,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores: scores.append(value) target_metrics.remove(metric) - assert ( - len(scores) == 2 - ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}" + assert len(scores) == 2, ( + f"there should only be 2 values extracted in ifeval, got: {len(scores)}" + ) return { "score": sum(scores) / 2, } @@ -381,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm" ) - assert ( - len(parsed_scores["subtasks"]) == 3 - ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}" + assert len(parsed_scores["subtasks"]) == 3, ( + f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}" + ) return parsed_scores @@ -394,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores: parsed_scores = parse_multitask_results( result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match" ) - assert ( - len(parsed_scores["subtasks"]) == 7 - ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}" + assert len(parsed_scores["subtasks"]) == 7, ( + f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}" + ) return parsed_scores @@ -463,9 +463,9 @@ def get_scores_from_result_dicts( # this is just a sanity check step benchmarks_already_covered = set(parsed_scores.keys()) overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse - assert ( - len(benchmarks_already_covered & benchmarks_to_parse) == 0 - ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}" + assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, ( + f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}" + ) # now actually add them for benchmark in benchmarks_to_parse: diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py index f3f80683..560bb8da 100644 --- a/src/instructlab/eval/mt_bench_common.py +++ b/src/instructlab/eval/mt_bench_common.py @@ -346,9 +346,9 @@ def check_data(questions, model_answers, ref_answers, models, judges): assert m in model_answers, f"Missing model answer for {m}" m_answer = model_answers[m] 
for q in questions: - assert ( - q["question_id"] in m_answer - ), f"Missing model {m}'s answer to Question {q['question_id']}" + assert q["question_id"] in m_answer, ( + f"Missing model {m}'s answer to Question {q['question_id']}" + ) # check ref answers for jg in judges.values(): if not jg.ref_based: @@ -356,9 +356,9 @@ def check_data(questions, model_answers, ref_answers, models, judges): for q in questions: if q["category"] not in NEED_REF_CATS: continue - assert ( - q["question_id"] in ref_answers[jg.model_name] - ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + assert q["question_id"] in ref_answers[jg.model_name], ( + f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + ) def get_model_list(answer_file): diff --git a/src/instructlab/eval/mt_bench_conversation.py b/src/instructlab/eval/mt_bench_conversation.py index 05585b90..caaadfa5 100644 --- a/src/instructlab/eval/mt_bench_conversation.py +++ b/src/instructlab/eval/mt_bench_conversation.py @@ -116,9 +116,9 @@ def dict(self): def register_conv_template(template: Conversation, override: bool = False): """Register a new conversation template.""" if not override: - assert ( - template.name not in conv_templates - ), f"{template.name} has been registered." + assert template.name not in conv_templates, ( + f"{template.name} has been registered." + ) conv_templates[template.name] = template diff --git a/tests/test_mt_bench.py b/tests/test_mt_bench.py index ec767c94..57a9d861 100644 --- a/tests/test_mt_bench.py +++ b/tests/test_mt_bench.py @@ -43,7 +43,7 @@ def gen_qa_pairs(odd): { "question_id": i + 1, "score": 0.6, - "qna_file": f"category{i+1}/qna.yaml", + "qna_file": f"category{i + 1}/qna.yaml", } ) return qa_pairs diff --git a/tox.ini b/tox.ini index c1337d4b..a50c2ade 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,10 @@ setenv = ILAB_MAX_STABLE_VRAM_WAIT=0 package = wheel wheel_build_env = pkg +install_command = pip install \ + --use-feature fast-deps \ + -c constraints-dev.txt \ + {opts} {packages} # equivalent to `pip install instructlab[cpu]` extras = cpu @@ -60,8 +64,8 @@ skip_install = True skipsdist = true # keep in sync with .pre-commit-config.yaml deps = - ruff==0.3.4 - isort==5.11.5 + ruff + isort # supports 'fix', 'check', or abitrary args to 'ruff' command commands = ./scripts/ruff.sh {posargs:fix} @@ -81,7 +85,7 @@ allowlist_externals = sh [testenv:mypy] description = Python type checking with mypy deps = - mypy>=1.10.0,<2.0 + mypy types-tqdm types-PyYAML pytest @@ -103,3 +107,13 @@ passenv = [gh] python = 3.11 = py311-{unitcov, functional} + +[testenv:constraints] +description = Generate new constraints file(s) +basepython = {[testenv:py3]basepython} +skip_install = True +skipsdist = true +deps = + uv==0.7.8 +commands = {posargs} +allowlist_externals = *
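
The new `[testenv:constraints]` environment installs only `uv` and runs whatever command is passed through `{posargs}`; the scheduled `constraints-update.yml` workflow delegates the actual pin refresh to the in-house `update-constraints` action. A minimal local sketch of how the file could be regenerated follows — the exact compile arguments are an assumption inferred from the `-r requirements*.txt` annotations inside `constraints-dev.txt`, not something this diff pins down:

```sh
# Hypothetical local invocation; the authoritative arguments live in the
# instructlab/ci-actions "update-constraints" action, so the input file list
# below is an assumption based on the annotations in constraints-dev.txt.
tox -e constraints -- uv pip compile \
    --output-file constraints-dev.txt \
    constraints-dev.txt.in \
    requirements.txt \
    requirements-dev.txt \
    requirements-leaderboard.txt
```

The otherwise empty `constraints-dev.txt.in` seed file presumably gives maintainers a place to add hand-written pins or exclusions that the compile step picks up, without editing the generated `constraints-dev.txt` directly.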
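
Both e2e workflows now replace their inline CUDA setup with `PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh`, a script that lives in the checked-out `instructlab` repository and is not part of this diff. A rough sketch of what that script is expected to do, reconstructed from the inline steps removed above; the real script may differ:

```sh
#!/usr/bin/env bash
# Sketch only: pieced together from the workflow steps this diff removes;
# the actual scripts/install-ilab-with-cuda.sh ships in the instructlab repo.
set -euo pipefail

PYTHON="${PYTHON:-python3.11}"

export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"

"$PYTHON" -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
"$PYTHON" -m pip cache remove llama_cpp_python

pip_install="$PYTHON -m pip install -v -c constraints-dev.txt"

# Pre-install build dependencies, including the ones flash-attn needs at
# build time (its setup.py imports torch before declaring the dependency).
$pip_install packaging wheel setuptools-scm
$pip_install torch packaging setuptools wheel psutil ninja

# Build flash-attn against the torch version installed above so the build-time
# and runtime torch match.
$pip_install flash-attn --no-build-isolation

# Finally install ilab itself with CUDA-enabled llama.cpp and vLLM extras.
CMAKE_ARGS="-DGGML_CUDA=on" $pip_install .
$pip_install .[cuda] -r requirements-vllm-cuda.txt
```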