
Commit 2c92e7b

Move WORKDIR to storage disk in GPU CI & Use dynamo export for tests (#2313)
## Describe your changes

1. Remove the optimum-intel version pin from tests so we can test the latest transformers.
2. Use dynamo export by default in our tests, since the TorchScript exporter is no longer being maintained.
3. Fix a couple of tests for the latest transformers and the dynamo export change. Skip a couple of tests that need further investigation.
4. Move the GPU agent work dir and Docker work dir to the storage disk; otherwise the main disk runs out of space.
5. Add `register_dynamic_cache_export_support` to fix the DynamicCache registration issue.
6. Remove the default target_opset of 13 in tests.

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.

## (Optional) Issue link
1 parent a777b97

39 files changed: +190 −245 lines

.azure_pipelines/job_templates/olive-test-cpu-template.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -38,7 +38,7 @@ jobs:
   - script: |
       python -m pip install pytest
       python -m pip install -r $(Build.SourcesDirectory)/test/$(requirements_file)
-
+      python -m pip list
       coverage run --source=$(Build.SourcesDirectory)/olive -m pytest -v -s -p no:warnings --disable-warnings --log-cli-level=WARNING --junitxml=$(Build.SourcesDirectory)/logs/test-TestOlive.xml $(Build.SourcesDirectory)/test --basetemp $(PYTEST_BASETEMP)
       coverage xml
     displayName: Test Olive
```

.azure_pipelines/job_templates/olive-test-linux-gpu-template.yaml

Lines changed: 26 additions & 2 deletions
```diff
@@ -20,10 +20,34 @@ jobs:
   pool:
     name: ${{ parameters.pool}}
   variables:
-    PIP_CACHE_DIR: $(Pipeline.Workspace)/.cache/pip
-    HF_HOME: $(Pipeline.Workspace)/.cache/huggingface
+    PIP_CACHE_DIR: /mnt/storage/.cache/pip
+    HF_HOME: /mnt/storage/.cache/huggingface
 
   steps:
+    - script: |
+        set -euxo pipefail
+
+        # Move agent work directory to /mnt/storage via symlink
+        AGENT_ROOT=$(dirname "$(Agent.BuildDirectory)")
+        sudo mkdir -p /mnt/storage/vss_work
+        sudo chown -R $USER:$USER /mnt/storage/vss_work
+        sudo cp -a "$AGENT_ROOT"/* /mnt/storage/vss_work/ 2>/dev/null || true
+        sudo rm -rf "$AGENT_ROOT"
+        sudo ln -sf /mnt/storage/vss_work "$AGENT_ROOT"
+
+        # Move Docker and containerd to /mnt/storage
+        sudo systemctl stop docker containerd
+        sudo mkdir -p /mnt/storage/docker /mnt/storage/containerd /etc/containerd
+        echo '{"data-root": "/mnt/storage/docker"}' | sudo tee /etc/docker/daemon.json
+        containerd config default | sed 's|/var/lib/containerd|/mnt/storage/containerd|g' | sudo tee /etc/containerd/config.toml > /dev/null
+        sudo systemctl start containerd docker
+
+        # Move /tmp to /mnt/storage
+        sudo mkdir -p /mnt/storage/tmp
+        sudo chmod 1777 /mnt/storage/tmp
+        sudo mount --bind /mnt/storage/tmp /tmp
+      displayName: Move pipeline to /mnt/storage
+
     - template: build-docker-image-template.yaml
       parameters:
         python_version: ${{ parameters.python_version }}
```

.azure_pipelines/scripts/run_test.sh

Lines changed: 2 additions & 0 deletions
```diff
@@ -33,6 +33,8 @@ pip install -r "$4"
 pip install huggingface-hub
 hf auth login --token "$7"
 
+pip list
+
 # Step 4: Run tests with or without coverage tracking
 XML_PATH="/logs/TestOlive.xml"
 if [ "$6" = "true" ]; then
```

olive/cli/optimize.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -797,7 +797,7 @@ def _get_onnx_io_datatype_converter_pass_config(self) -> dict[str, Any]:
             {
                 "name": "wikitext2_train",
                 "type": "HuggingfaceContainer",
-                "load_dataset_config": {"data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train"},
+                "load_dataset_config": {"data_name": "Salesforce/wikitext", "subset": "wikitext-2-raw-v1", "split": "train"},
                 "pre_process_data_config": {
                     "strategy": "line-by-line",
                     "add_special_tokens": False,
```

olive/common/quant/hf_utils.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -146,6 +146,7 @@ class OliveHfQuantizer(HfQuantizer):
 
     # only support load and inference, no on-the-fly quantization
     requires_calibration = True
+    modules_to_not_convert: list[str] | None = None
 
     def _process_model_before_weight_loading(
         self, model: PreTrainedModel, keep_in_fp32_modules: list[str] | None = None, **kwargs
```

olive/data/component/sd_lora/dataloader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@ def __init__(
             seed: Random seed for reproducibility.
 
         """
-        super().__init__(dataset)
+        super().__init__()
         self.dataset = dataset
         self.batch_size = batch_size
         self.drop_last = drop_last
```

olive/data/template.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -39,14 +39,14 @@ def huggingface_data_config_template(model_name, task, **kwargs) -> DataConfig:
         **kwargs: dict
             Additional arguments:
                 - olive.data.component.load_dataset_config.huggingface_dataset
-                    - `data_name`: str, data name in huggingface dataset, e.g.: "glue", "squad"
+                    - `data_name`: str, data name in huggingface dataset, e.g.: "nyu-mll/glue", "squad"
                     - `subset`: str, subset of data, e.g.: "train", "validation", "test"
                     - `split`: str, split of data, e.g.: "train", "validation", "test"
                     - `data_files`: str | list | dict, path to source data file(s).
                         e.g.
                             load_dataset_config={
                                 "params": {
-                                    "data_name": "glue",
+                                    "data_name": "nyu-mll/glue",
                                     "subset": "train",
                                     "split": "train",
                                     "data_files": "whatever.pt"
```

olive/passes/onnx/conversion.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -212,6 +212,11 @@ def _export_pytorch_model(
                 "Please upgrade PyTorch to 2.6.0 or above."
             )
 
+        # Register DynamicCache export support
+        from transformers.integrations.executorch import register_dynamic_cache_export_support
+
+        register_dynamic_cache_export_support()
+
         if isinstance(dummy_inputs, dict):
             dummy_kwargs = dummy_inputs
             dummy_inputs = ()
@@ -236,7 +241,7 @@ def _export_pytorch_model(
                 dynamic_axes=io_config.dynamic_axes,
                 dynamic_shapes=io_config.dynamic_shapes,
                 dynamo=True,
-                fallback=True,
+                fallback=False,
                 optimize=config.optimize,
                 report=logger.isEnabledFor(logging.DEBUG),
             )
```
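For context, here is a minimal sketch (illustrative only, not the conversion pass itself) of what the dynamo-based export path looks like once DynamicCache export support has been registered. The tiny model id, the inputs, and the output path below are assumptions made for the example:

```python
# Minimal sketch (illustrative, not Olive's conversion pass): register
# DynamicCache export support so torch.export can trace transformers' cache
# objects, then run the dynamo-based ONNX exporter with the TorchScript
# fallback disabled so failures surface instead of silently regressing.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.integrations.executorch import register_dynamic_cache_export_support

model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"  # assumed tiny test model
model = AutoModelForCausalLM.from_pretrained(model_id).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

register_dynamic_cache_export_support()

inputs = tokenizer("hello world", return_tensors="pt")
torch.onnx.export(
    model,
    (),  # no positional args; everything is passed through kwargs
    "model.onnx",
    kwargs={"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]},
    dynamo=True,      # use the torch.export-based exporter (PyTorch 2.6+)
    fallback=False,   # do not silently fall back to the TorchScript exporter
)
```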

olive/passes/pytorch/sparsegpt_utils.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -90,12 +90,14 @@ def __init__(self, module):
         super().__init__()
         self.module = module
 
-    def forward(self, inputs, **kwargs):
+    def forward(self, *args, **kwargs):
+        # First positional argument is the hidden states (inputs)
+        layer_inputs = args[0] if args else kwargs.get("hidden_states")
         # handle batch dimension
-        for batch in range(inputs.shape[0]):
+        for batch in range(layer_inputs.shape[0]):
             if cache["i"] >= num_samples:
                 break
-            inputs[cache["i"]] = inputs[batch]
+            inputs[cache["i"]] = layer_inputs[batch]
             cache["i"] += 1
         cache["attention_mask"] = kwargs.get("attention_mask")
         for input_name in additional_input:
```
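The signature change matters because newer transformers releases may call decoder layers with `hidden_states` passed as a keyword rather than positionally, which breaks a wrapper that only accepts a positional `inputs` argument. A minimal sketch of that input-catcher pattern (hypothetical names, not Olive's exact class) looks like this:

```python
# Illustrative sketch of the calibration "catcher" pattern (hypothetical names):
# wrap the first decoder layer, record its inputs whether they arrive
# positionally or as hidden_states=..., then abort the rest of the forward pass.
import torch
from torch import nn


class InputCatcher(nn.Module):
    def __init__(self, module: nn.Module, storage: list):
        super().__init__()
        self.module = module
        self.storage = storage

    def forward(self, *args, **kwargs):
        # newer transformers releases may pass hidden states as a keyword argument
        hidden_states = args[0] if args else kwargs["hidden_states"]
        self.storage.append((hidden_states.detach(), kwargs.get("attention_mask")))
        raise ValueError("captured layer inputs")  # stop the model after capture
```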

olive/passes/pytorch/train_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -324,7 +324,7 @@ def get_calibration_data_config(
         model_name=model_name_or_path,
         task="text-generation",
         load_dataset_config={
-            "data_name": "wikitext",
+            "data_name": "Salesforce/wikitext",
             "subset": "wikitext-2-raw-v1",
             "split": split,
             "trust_remote_code": trust_remote_code,
```
