Skip to content

Commit 456712b

Browse files
authored
Modelplane upgrades (#93)
* Add text files to gitignore. * Add tests to confirm mlflow versions match for all dependencies. * Update where needed. * Upgrade mlflow and modelbench * Align with latest modelbench and fix tests. * Fix flightpaths. * Try not locking poetry version. * Doc fixes and cleanup no longer relevant flightpath.
1 parent 6b3ba95 commit 456712b

18 files changed

+3547
-3177
lines changed

.github/workflows/tests.yml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ on:
88
workflow_dispatch:
99
inputs:
1010
branch:
11-
description: 'Branch'
11+
description: "Branch"
1212
required: true
1313
default: main
1414

@@ -32,7 +32,13 @@ jobs:
3232
./start_services.sh --no-jupyter -d
3333
3434
- name: Install poetry
35-
run: pipx install "poetry == 1.8.5"
35+
run: pipx install poetry
36+
37+
- name: Verify MLflow versions match
38+
run: ./scripts/check_mlflow_versions.sh
39+
40+
- name: Check poetry lock file
41+
run: poetry check --lock
3642

3743
- name: Remove existing virtual environment
3844
run: |
@@ -75,6 +81,5 @@ jobs:
7581
run: |
7682
docker exec modelplane-jupyter-1 poetry run python /app/test_notebooks.py
7783
78-
7984
- name: Stop MLflow server
8085
run: docker compose down

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,7 @@ secrets.toml
77
.vscode/
88
.coverage*
99
.cache
10+
*.csv
11+
*.txt
12+
*.json
13+
*.jsonl

Dockerfile.mlflow

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
FROM ghcr.io/mlflow/mlflow:v3.1.1
1+
FROM ghcr.io/mlflow/mlflow:v3.7.0
22

33
# The base image does not include various dependencies that are needed for
44
# the MLflow server. We assume a postgres backend, so we need psycopg2.
55
# We also need boto3 for S3 support, and google-cloud-storage for GCS support.
6-
# TODO: better way to install these (maybe using poetry.lock to grab consistent versions?)
7-
RUN pip install mlflow[auth]==3.1.1 psycopg2-binary==2.9.10 boto3==1.38.31 \
8-
google-cloud-storage==3.1.0
6+
RUN pip install mlflow[auth]==3.7.0 psycopg2-binary==2.9.11 boto3==1.42.5 \
7+
google-cloud-storage==3.4.1

README.md

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@ for access.
4343
MLFlow server (`MLFLOW_TRACKING_USERNAME` /
4444
`MLFLOW_TRACKING_PASSWORD`).
4545
* Alternatively, put the credentials in `~/.mlflow/credentials` as described [here](https://mlflow.org/docs/latest/ml/auth/#credentials-file).
46-
1. To access `modelbench-private` code (assuming you have
47-
access), you must also set `USE_MODELBENCH_PRIVATE=true` in `.env.jupyteronly`. This will forward your ssh agent to the container
48-
allowing it to load the private repository to build the image.
46+
1. To access the private annotators, you need to set up credentials to access cheval (see `modelgauge.annotators.cheval.registration`)
47+
and reach out to [[email protected]](mailto:[email protected]) for the credentials.
4948
1. Start jupyter with `./start_jupyter.sh`. (You can add the
5049
`-d` flag to start in the background.)
5150

@@ -97,17 +96,14 @@ or you can get the `run_id` via the MLFlow UI.
9796
MLFLOW_TRACKING_URI=http://localhost:8080 poetry run modelplane annotate --annotator_id {annotator_id} --experiment expname --response_run_id {run_id}
9897
```
9998

100-
### Custom Ensembles
99+
#### Private Ensemble
100+
If you have access to the private annotator, you can run directly with:
101101
```
102-
MLFLOW_TRACKING_URI=http://localhost:8080 poetry run modelplane annotate --annotator_id {annotator_id1} --annotator_id {annotator_id2} --ensemble_strategy {ensemble_strategy} --experiment expname --response_file path/to/response.csv
102+
MLFLOW_TRACKING_URI=http://localhost:8080 poetry run modelplane annotate --annotator_id safety-v1.1 --experiment expname --response_run_id {run_id}
103103
```
104104

105-
### Private Ensemble
106-
If you have access to the private ensemble, you can install with the needed extras
107-
```
108-
poetry install --extras modelbench-private
109-
```
110-
And then run annotations with:
105+
106+
### Custom Ensembles
111107
```
112-
MLFLOW_TRACKING_URI=http://localhost:8080 poetry run modelplane annotate --ensemble_id official --experiment expname --response_run_id {run_id}
108+
MLFLOW_TRACKING_URI=http://localhost:8080 poetry run modelplane annotate --annotator_id {annotator_id1} --annotator_id {annotator_id2} --ensemble_strategy {ensemble_strategy} --experiment expname --response_file path/to/response.csv
113109
```

docker-compose.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ services:
2727
GOOGLE_APPLICATION_CREDENTIALS: /creds/gcp-key.json
2828
# if not provided via volume below, AWS S3 will not work as artifact store
2929
AWS_SHARED_CREDENTIALS_FILE: /creds/aws-credentials
30+
# https://mlflow.org/docs/latest/community/usage-tracking/
31+
MLFLOW_DISABLE_TELEMETRY: "true"
32+
DO_NOT_TRACK: "true"
3033
depends_on:
3134
- postgres
3235
# grab backend from .env, pass artifact root, if provided, otherwise local storage of artifacts
@@ -37,6 +40,7 @@ services:
3740
--serve-artifacts
3841
--host 0.0.0.0
3942
--port 8080
43+
--allowed-hosts 'mlflow:*,localhost:*,127.0.0.1:*'
4044
ports:
4145
- "8080:8080"
4246
healthcheck:
@@ -51,7 +55,7 @@ services:
5155
- ${GOOGLE_CREDENTIALS_PATH:-/dev/null}:/creds/gcp-key.json:ro
5256
# Volume only needed for AWS S3 storage artifacts
5357
- ${AWS_CREDENTIALS_PATH:-/dev/null}:/creds/aws-credentials:ro
54-
58+
5559
jupyter:
5660
build:
5761
context: .

flightpaths/Annotator Development Template.ipynb

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@
7676
"\n",
7777
"An implementation of the annotator should define:\n",
7878
"\n",
79-
"* `translate_request`: takes a `TestItem` and `SUTResponse` (which have the needed information on the prompt responses for the given LLM) and translates the request into an arbitrary form useful for the `annotate` method.\n",
79+
"* `translate_prompt`: takes a `TextPrompt` and `SUTResponse` (which have the needed information on the prompt responses for the given LLM) and translates the request into an arbitrary form useful for the `annotate` method.\n",
8080
"* `annotate`: take the translated request and produce an annotation in any form.\n",
81-
"* `translate_response`: take the annotation and translate it to the needed from (`SafetyModelResponse`).\n",
81+
"* `translate_response`: take the annotation and translate it to the needed form (`SafetyAnnotation`).\n",
8282
"\n",
8383
"Below, we give a sample `RandomAnnotator`, which randomly decides whether a given response is safe by hazard category."
8484
]
@@ -94,10 +94,10 @@
9494
"\n",
9595
"from pydantic import BaseModel\n",
9696
"\n",
97-
"from modelgauge.annotator import CompletionAnnotator\n",
97+
"from modelgauge.annotation import SafetyAnnotation\n",
98+
"from modelgauge.annotator import Annotator\n",
9899
"from modelgauge.annotator_registry import ANNOTATORS\n",
99100
"from modelgauge.prompt import ChatPrompt, TextPrompt\n",
100-
"from modelgauge.safety_model_response import SafetyModelResponse\n",
101101
"from modelgauge.single_turn_prompt_response import TestItem\n",
102102
"from modelgauge.sut import SUTResponse\n",
103103
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
@@ -107,9 +107,8 @@
107107
" response: SUTResponse\n",
108108
"\n",
109109
"\n",
110-
"class RandomAnnotator(CompletionAnnotator[SafetyModelResponse]):\n",
111-
" \"\"\"Simple random implementation for demonstration. Uses SafetyModelResponse\n",
112-
" for consistency with AILuminate benchmarks.\n",
110+
"class RandomAnnotator(Annotator):\n",
111+
" \"\"\"Simple random implementation for demonstration.\n",
113112
"\n",
114113
" The methods to implement are based on the idea that annotate will \n",
115114
" potentially make a call to another system and the translators are helpful \n",
@@ -126,18 +125,16 @@
126125
" response=response,\n",
127126
" )\n",
128127
"\n",
129-
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyModelResponse:\n",
128+
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyAnnotation:\n",
130129
" unsafe_categories = []\n",
131130
" for category in BaseSafeTestVersion1.hazards:\n",
132131
" if random.random() < self.prob_unsafe:\n",
133132
" unsafe_categories.append(category)\n",
134-
" return SafetyModelResponse(\n",
133+
" return SafetyAnnotation(\n",
135134
" is_safe=not unsafe_categories,\n",
136-
" logprobs=random.random(),\n",
137-
" safety_categories=unsafe_categories,\n",
138135
" )\n",
139136
"\n",
140-
" def translate_response(self, request: AnnotationRequest, response: SafetyModelResponse) -> SafetyModelResponse:\n",
137+
" def translate_response(self, request: AnnotationRequest, response: SafetyAnnotation) -> SafetyAnnotation:\n",
141138
" return response\n",
142139
"\n",
143140
"\n",
@@ -222,14 +219,6 @@
222219
" ground_truth=ground_truth,\n",
223220
")"
224221
]
225-
},
226-
{
227-
"cell_type": "code",
228-
"execution_count": null,
229-
"id": "af9debec-28be-4a50-82da-5d7025de7d76",
230-
"metadata": {},
231-
"outputs": [],
232-
"source": []
233222
}
234223
],
235224
"metadata": {
@@ -248,7 +237,7 @@
248237
"name": "python",
249238
"nbconvert_exporter": "python",
250239
"pygments_lexer": "ipython3",
251-
"version": "3.12.11"
240+
"version": "3.12.12"
252241
}
253242
},
254243
"nbformat": 4,

flightpaths/Ensemble Development Template.ipynb

Lines changed: 14 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,11 @@
7070
"\n",
7171
"* An implementation of an annotator should define:\n",
7272
"\n",
73-
" * `translate_request`: takes a `TestItem` and `SUTResponse` (which have the needed information on the prompt responses for the given LLM) and translates the request into an arbitrary form useful for the `annotate` method.\n",
73+
" * `translate_prompt`: takes a `TextPrompt` and `SUTResponse` (which have the needed information on the prompt responses for the given LLM) and translates the request into an arbitrary form useful for the `annotate` method.\n",
7474
" * `annotate`: take the translated request and produce an annotation in any form.\n",
7575
" * `translate_response`: take the annotation and translate it to the needed form (`SafetyModelResponse`).\n",
7676
"\n",
77-
"* An implementation of the combining logic should define `compute_response` which takes a map of `SafetyModelResponse`s from the underlying annotators and combines them to produce a final annotation (`EnsembleSafetyModelResponse`, which is simply a `SafetyModelResponse` along with the map of underlying responses).\n",
77+
"* An implementation of the combining logic should define `compute_response` which takes a map of `SafetyAnnotation`s from the underlying annotators and combines them to produce a final annotation (`EnsembleSafetyAnnotation`, which is simply a `SafetyAnnotation` along with the map of underlying responses).\n",
7878
"\n",
7979
"Below, we give a sample ensemble, which takes two independent `RandomAnnotator` responses and reports unsafe if any underlying annotation was unsafe. This requires implementations of the `RandomAnnotator` and `AnyUnsafeStrategy`."
8080
]
@@ -100,21 +100,21 @@
100100
"source": [
101101
"import random\n",
102102
"\n",
103-
"from pydantic import BaseModel\n",
104-
"\n",
105-
"from modelgauge.annotator import CompletionAnnotator\n",
103+
"from modelgauge.annotation import SafetyAnnotation\n",
104+
"from modelgauge.annotator import Annotator\n",
106105
"from modelgauge.annotator_registry import ANNOTATORS\n",
107106
"from modelgauge.prompt import ChatPrompt, TextPrompt\n",
108-
"from modelgauge.safety_model_response import SafetyModelResponse\n",
109107
"from modelgauge.sut import SUTResponse\n",
110108
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
109+
"from pydantic import BaseModel\n",
110+
"\n",
111111
"\n",
112112
"class AnnotationRequest(BaseModel):\n",
113113
" prompt: ChatPrompt | TextPrompt\n",
114114
" response: SUTResponse\n",
115115
"\n",
116116
"\n",
117-
"class RandomAnnotator(CompletionAnnotator[SafetyModelResponse]):\n",
117+
"class RandomAnnotator(Annotator):\n",
118118
" \"\"\"Simple random implementation for demonstration. Uses SafetyModelResponse\n",
119119
" for consistency with AILuminate benchmarks.\n",
120120
"\n",
@@ -132,19 +132,17 @@
132132
" response=response,\n",
133133
" )\n",
134134
"\n",
135-
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyModelResponse:\n",
135+
" def annotate(self, annotation_request: AnnotationRequest) -> SafetyAnnotation:\n",
136136
" unsafe_categories = []\n",
137137
" for category in BaseSafeTestVersion1.hazards:\n",
138138
" if random.random() < self.prob_unsafe:\n",
139139
" unsafe_categories.append(category)\n",
140-
" return SafetyModelResponse(\n",
140+
" return SafetyAnnotation(\n",
141141
" is_safe=not unsafe_categories,\n",
142142
" is_valid=True,\n",
143-
" logprobs=random.random(),\n",
144-
" safety_categories=unsafe_categories,\n",
145143
" )\n",
146144
"\n",
147-
" def translate_response(self, request: AnnotationRequest, response: SafetyModelResponse) -> SafetyModelResponse:\n",
145+
" def translate_response(self, request: AnnotationRequest, response: SafetyAnnotation) -> SafetyAnnotation:\n",
148146
" return response\n",
149147
"\n",
150148
"\n",
@@ -176,26 +174,22 @@
176174
"source": [
177175
"from typing import Dict\n",
178176
"\n",
179-
"from modelgauge.safety_model_response import EnsembleSafetyModelResponse\n",
180-
"from modelgauge.ensemble_annotator_set import EnsembleStrategy, ENSEMBLE_STRATEGIES\n",
177+
"from modelgauge.annotation import EnsembleSafetyAnnotation\n",
178+
"from modelgauge.ensemble_strategies import ENSEMBLE_STRATEGIES, EnsembleStrategy\n",
181179
"\n",
182180
"\n",
183181
"class AnyUnsafeEnsembleStrategy(EnsembleStrategy):\n",
184-
" def compute_response(self, annotations: Dict[str, SafetyModelResponse]) -> EnsembleSafetyModelResponse:\n",
185-
" response = EnsembleSafetyModelResponse(\n",
182+
" def compute_response(self, annotations: Dict[str, SafetyAnnotation]) -> EnsembleSafetyAnnotation:\n",
183+
" response = EnsembleSafetyAnnotation(\n",
186184
" joined_responses=annotations, \n",
187185
" is_safe=True,\n",
188-
" safety_categories=[],\n",
189186
" is_valid=False,\n",
190187
" )\n",
191188
" for annotation in annotations.values():\n",
192189
" if annotation.is_valid:\n",
193190
" response.is_valid = True\n",
194191
" if not annotation.is_safe:\n",
195192
" response.is_safe = False\n",
196-
" current_categories = set(response.safety_categories)\n",
197-
" current_categories.update(annotation.safety_categories)\n",
198-
" response.safety_categories = sorted(current_categories)\n",
199193
" return response\n",
200194
"\n",
201195
"ENSEMBLE_STRATEGIES[\"anyunsafe\"] = AnyUnsafeEnsembleStrategy()"
@@ -286,18 +280,6 @@
286280
"display_name": "Python 3 (ipykernel)",
287281
"language": "python",
288282
"name": "python3"
289-
},
290-
"language_info": {
291-
"codemirror_mode": {
292-
"name": "ipython",
293-
"version": 3
294-
},
295-
"file_extension": ".py",
296-
"mimetype": "text/x-python",
297-
"name": "python",
298-
"nbconvert_exporter": "python",
299-
"pygments_lexer": "ipython3",
300-
"version": "3.12.11"
301283
}
302284
},
303285
"nbformat": 4,

0 commit comments

Comments
 (0)