
Commit ddc8ec5

Merge pull request #466 from instructlab/port_old_tests
Add simple and full knowledge pipeline functional tests
2 parents (e04c039 + 2a62495), commit ddc8ec5

File tree: 10 files changed (+346, -62 lines)


.github/mergify.yml

Lines changed: 16 additions & 0 deletions

```diff
@@ -59,6 +59,22 @@ pull_request_rules:
           - -files~=^requirements.*\.txt$
           - -files=.github/workflows/e2e-nvidia-t4-x1.yml

+      # functional gpu small workflow
+      - or:
+          - and:
+              # note this should match the triggering criteria in 'functional-gpu-nvidia-t4-x1.yml'
+              - check-success~=functional-gpu-small-workflow-complete
+              - or:
+                  - files~=\.py$
+                  - files=pyproject.toml
+                  - files~=^requirements.*\.txt$
+                  - files=.github/workflows/functional-gpu-nvidia-t4-x1.yml
+          - and:
+              - -files~=\.py$
+              - -files=pyproject.toml
+              - -files~=^requirements.*\.txt$
+              - -files=.github/workflows/functional-gpu-nvidia-t4-x1.yml
+
       # lint must pass if files change that would trigger this job
       - or:
           - and:
```
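
In effect, the new rule requires a passing functional-gpu-small-workflow-complete check only when a PR touches Python sources, pyproject.toml, requirements files, or the workflow file itself; the second `and` branch satisfies the rule for every other PR, so unrelated changes merge without waiting on GPU CI.
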
.github/workflows/functional-gpu-nvidia-t4-x1.yml

Lines changed: 153 additions & 0 deletions

```yaml
# SPDX-License-Identifier: Apache-2.0

name: Functional GPU (NVIDIA Tesla T4 x1)

on:
  # run against every merge commit to 'main' and release branches
  push:
    branches:
      - main
      - release-*
  # only run on PRs that touch certain regex paths
  pull_request_target:
    branches:
      - main
      - release-*
    paths:
      # note this should match the merging criteria in 'mergify.yml'
      - "**.py"
      - "pyproject.toml"
      - "requirements**.txt"
      - 'tox.ini'
      - ".github/workflows/functional-gpu-nvidia-t4-x1.yml" # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  start-small-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: g4dn.2xlarge
          subnet-id: subnet-02d230cffd9385bd4
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  functional-gpu-small-test:
    needs:
      - start-small-ec2-runner
    runs-on: ${{ needs.start-small-ec2-runner.outputs.label }}

    # It is important that this job has no write permissions and has
    # no access to any secrets. This part is where we are running
    # untrusted code from PRs.
    permissions: {}

    steps:
      - name: Install Packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/sdg
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Fetch and checkout PR
        if: github.event_name == 'pull_request_target'
        run: |
          git fetch origin pull/${{ github.event.pull_request.number }}/merge:pr-merge-${{ github.event.pull_request.number }}
          git checkout pr-merge-${{ github.event.pull_request.number }}
          git log -1 --format="%H %s"

      - name: Install instructlab/sdg
        run: |
          export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          nvidia-smi
          python3.11 -m pip install tox "tox-gh>=1.2"
          python3.11 -m pip cache remove llama_cpp_python

          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -r requirements-dev.txt

      - name: Check disk before tests
        run: |
          df -h

      - name: Run functional gpu tests with tox
        run: |
          . venv/bin/activate
          tox -e py3-functional-gpu

      - name: Check disk after tests
        run: |
          df -h

  stop-small-ec2-runner:
    needs:
      - start-small-ec2-runner
      - functional-gpu-small-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-small-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-small-ec2-runner.outputs.ec2-instance-id }}

  functional-gpu-small-workflow-complete:
    # we don't want to block PRs on failed EC2 cleanup
    # so not requiring "stop-small-ec2-runner" as well
    needs: ["start-small-ec2-runner", "functional-gpu-small-test"]
    runs-on: ubuntu-latest
    steps:
      - name: Functional GPU Workflow Complete
        run: echo "Functional GPU Workflow Complete"
```
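
The workflow follows the usual start-runner / test / stop-runner split for ephemeral EC2 GPU runners. Only functional-gpu-small-test executes PR code, and it does so with `permissions: {}` and no access to secrets (important because `pull_request_target` runs with elevated trigger context); functional-gpu-small-workflow-complete exists so the mergify rule above has one stable check name to require, without blocking merges on EC2 cleanup.
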

pyproject.toml

Lines changed: 5 additions & 0 deletions

```diff
@@ -102,3 +102,8 @@ exclude = [
 ]
 # honor excludes by not following there through imports
 follow_imports = "silent"
+
+[tool.pytest.ini_options]
+markers = [
+    "gpu: marks tests that should run with gpus (deselect with '-m \"not gpu\"')",
+]
```
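
Registering the marker keeps pytest from warning about an unknown mark and makes the GPU suite selectable. A hypothetical test (not part of this commit) showing how the marker behaves:

```python
import pytest


# Hypothetical example mirroring the new functional tests further down:
# selected with `pytest -m gpu`, excluded with `pytest -m "not gpu"`.
@pytest.mark.gpu
def test_generation_on_gpu():
    assert True
```
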

requirements-dev.txt

Lines changed: 3 additions & 1 deletion

```diff
@@ -2,12 +2,14 @@

 -r requirements.txt

+jsonschema
+llama-cpp-python[server]>=0.3.0,<1.0.0
 pre-commit>=3.0.4,<5.0
 pylint>=2.16.2,<4.0
 pylint-pydantic
 pytest
 pytest-asyncio
 pytest-cov
 pytest-html
+starlette>=0.30.0
 tox>=4.4.2,<5
-jsonschema
```
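
llama-cpp-python[server] and starlette back the in-process OpenAI-compatible test server added below; jsonschema is only moved so the list stays alphabetized.
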

scripts/test_knowledge.py

Lines changed: 0 additions & 52 deletions
This file was deleted.

tests/functional/conftest.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@
 import typing

 # Third Party
+from datasets import Dataset
 import pytest

 TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
@@ -19,3 +20,24 @@ def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
 def examples_path() -> typing.Generator[pathlib.Path, None, None]:
     """Path to examples directory"""
     yield EXAMPLES_PATH
+
+
+@pytest.fixture
+def tonsils_knowledge_dataset():
+    return Dataset.from_list(
+        [
+            {
+                "icl_query_1": "what is the location of the tubal tonsils?",
+                "icl_response_1": "The location of the tubal tonsils is the roof of the pharynx.",
+                "icl_query_2": "How long does the adenoid grow?",
+                "task_description": "Teaching about human anatomy, specifically tonsils",
+                "icl_response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.",
+                "icl_query_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?",
+                "icl_response_3": "The tonsils are the immune systems first line of defense.",
+                "document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat",
+                "icl_document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils.",
+                "domain": "textbook",
+                "document_outline": "Medical description of tonsils",
+            }
+        ]
+    )
```
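
Because the fixture lives in conftest.py, pytest injects it by name into any test under tests/functional. A minimal, hypothetical consumer (not part of this commit):

```python
# Hypothetical test for illustration only: the fixture resolves to a
# single-row datasets.Dataset holding the seed document plus ICL examples.
def test_dataset_shape(tonsils_knowledge_dataset):
    assert tonsils_knowledge_dataset.num_rows == 1
    assert "document" in tonsils_knowledge_dataset.column_names
```
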
tests/functional/llama_cpp_helpers.py

Lines changed: 36 additions & 0 deletions

```python
# Standard
from importlib import resources
import pathlib
import typing

# Third Party
from llama_cpp.server.app import create_app
from llama_cpp.server.settings import ModelSettings, ServerSettings
from openai import OpenAI
from starlette.testclient import TestClient


def llama_cpp_openai_client(model, model_repo_id):
    server_settings = ServerSettings()
    model_settings = [
        ModelSettings(
            model=model,
            hf_model_repo_id=model_repo_id,
            verbose=True,
        )
    ]
    app = create_app(
        server_settings=server_settings,
        model_settings=model_settings,
    )

    @app.get("/")
    def read_root():
        return {"message": "Hello from InstructLab! Visit us at https://instructlab.ai"}

    test_client = TestClient(app)
    return OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
        http_client=test_client,
    )
```
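
Note the helper never opens a network socket: starlette's `TestClient` is an httpx-compatible client that drives the llama-cpp server's ASGI app in-process, and passing it as `http_client` means the `base_url` is only nominal. A usage sketch (model names taken from the test below; assumes the GGUF weights can be fetched from Hugging Face on first use):

```python
# Sketch, not part of the commit: build an OpenAI client served by an
# in-process llama.cpp model and print the id of the one served model.
from llama_cpp_helpers import llama_cpp_openai_client

client = llama_cpp_openai_client(
    model="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    model_repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
)
print(client.models.list().data[0].id)
```
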
tests/functional/test_full_pipeline.py

Lines changed: 51 additions & 0 deletions

```python
# Standard
from importlib import resources
import unittest

# Third Party
import pytest

# First Party
from src.instructlab.sdg.datamixing import _get_question_hack, _get_response_hack
from src.instructlab.sdg.pipeline import (
    FULL_PIPELINES_PACKAGE,
    Pipeline,
    PipelineContext,
)

# Local
from .llama_cpp_helpers import llama_cpp_openai_client


@pytest.mark.gpu
class TestFullPipeline(unittest.TestCase):
    @pytest.fixture(autouse=True)
    def _setup_fixtures(self, tonsils_knowledge_dataset):
        model = "mistral-7b-instruct-v0.2.Q5_K_M.gguf"
        model_repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        model_family = "mixtral"
        client = llama_cpp_openai_client(model, model_repo_id)
        teacher_model = client.models.list().data[0].id
        num_instructions_to_generate = 2
        max_num_tokens = 1024
        context = PipelineContext(
            client=client,
            model_family=model_family,
            model_id=teacher_model,
            num_instructions_to_generate=num_instructions_to_generate,
            max_num_tokens=max_num_tokens,
        )
        yaml_path = resources.files(FULL_PIPELINES_PACKAGE).joinpath("knowledge.yaml")
        self.knowledge_dataset = tonsils_knowledge_dataset
        self.knowledge_pipeline = Pipeline.from_file(context, yaml_path)

    def test_knowledge(self):
        samples = self.knowledge_pipeline.generate(self.knowledge_dataset)
        assert len(samples) > 0
        assert "question" in samples.column_names
        assert "response" in samples.column_names
        for sample in samples:
            question = _get_question_hack(sample)
            response = _get_response_hack(sample)
            assert len(question) > 0
            assert len(response) > 0
```
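
With num_instructions_to_generate=2 and max_num_tokens=1024, the run is kept deliberately small, presumably so the full knowledge pipeline can finish on the single g4dn.2xlarge (one Tesla T4) runner the workflow above provisions.
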
