Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Please delete options that are not relevant.
## Checklist:
- [ ] Please add a link to the [**Integration Tests Executor** run](https://github.com/deepjavalibrary/djl-serving/actions/workflows/integration_execute.yml) for the related tests.
- [ ] Have you [manually built the docker image](https://github.com/deepjavalibrary/djl-serving/blob/master/serving/docker/README.md#build-docker-image) and verified the change?
- [ ] Have you run related tests? Check [how to set up the test environment here](https://github.com/deepjavalibrary/djl-serving/blob/master/.github/workflows/integration_execute.yml#L72); One example would be `pytest tests.py -k "TestCorrectnessLmiDist" -m "lmi_dist"`
- [ ] Have you run related tests? Check [how to set up the test environment here](https://github.com/deepjavalibrary/djl-serving/blob/master/.github/workflows/integration_execute.yml#L72)
- [ ] Have you added tests that prove your fix is effective or that this feature works?
- [ ] Has code been commented, particularly in hard-to-understand areas?
- [ ] Have you made corresponding changes to the documentation?
Expand Down
79 changes: 1 addition & 78 deletions .github/workflows/llm_integration_p4d.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,83 +29,6 @@ jobs:
outputs:
p4d_instance_id: ${{ steps.create_gpu_p4d.outputs.action_lmic_p4d_instance_id }}

lmi-dist-test:
if: contains(fromJson('["", "aiccl"]'), github.event.inputs.run_test)
runs-on: [ self-hosted, p4d ]
timeout-minutes: 120
needs: create-runners-p4d
steps:
- uses: actions/checkout@v4
- name: Clean env
run: |
yes | docker system prune -a --volumes
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
echo "wait dpkg lock..."
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install pytest requests "numpy<2" pillow huggingface_hub tqdm
- name: Build container name
run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }}
- name: Download models and dockers
working-directory: tests/integration
run: |
docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
- name: Test Mixtral-8x7B
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl mixtral-8x7b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl mixtral-8x7b-aiccl
./remove_container.sh
- name: Test Llama-2-70B
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl llama-2-70b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl llama-2-70b-aiccl
./remove_container.sh
- name: Test codellama/CodeLlama-34b-hf
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl codellama-34b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl codellama-34b-aiccl
./remove_container.sh
- name: Test tiiuae/falcon-40b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist_aiccl falcon-40b-aiccl
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_aiccl falcon-40b-aiccl
./remove_container.sh
- name: Remove models dir
working-directory: tests/integration
run: |
sudo rm -rf models
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
run: |
sudo rm -rf models
./remove_container.sh || true
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v4
with:
name: lmi-dist-aiccl-logs
path: tests/integration/logs/

trtllm-test:
runs-on: [ self-hosted, p4d ]
Expand Down Expand Up @@ -228,7 +151,7 @@ jobs:
stop-runners-p4d:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners-p4d, lmi-dist-test, trtllm-test, vllm-test ]
needs: [ create-runners-p4d, trtllm-test, vllm-test ]
steps:
- name: Stop all instances
run: |
Expand Down
115 changes: 0 additions & 115 deletions .github/workflows/lmi-dist-deps-build.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/sagemaker_llm_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
strategy:
fail-fast: false
matrix:
engine: [lmi-dist, trtllm]
engine: [trtllm]
steps:
- uses: actions/checkout@v4
- name: Set up Python3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def is_chat_completions_request(inputs: Dict) -> bool:
def parse_mistral_chat_request_inputs(messages, tokenizer):
# TODO: get rid of this mess of an integration
# Mistral has their own tokenizer with custom tokenization logic for chat type requests
# This dependency is only available in vllm/lmi-dist, so we import it here as necessary
# This dependency is only available in vllm, so we import it here as necessary
from mistral_common.protocol.instruct.request import ChatCompletionRequest
chat_request = ChatCompletionRequest(messages=messages)
# The tokenized object contains the converted prompt, token ids, and images
Expand Down Expand Up @@ -76,8 +76,8 @@ def parse_chat_completions_request(
images.extend(message.get_images())

# Less than ideal, but need a working solution for now
# is_mistral_tokenizer can only be true if lmi-dist or vllm
# mistral tokenization only works with these engines if we pass token ids directly, not text.
# is_mistral_tokenizer can only be true if vllm
# mistral tokenization only works with this engine if we pass token ids directly, not text.
# every other use case is designed for the actual string prompt being provided...
if is_mistral_tokenizer:
text_inputs = parse_mistral_chat_request_inputs(messages, tokenizer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def set_adapter_class(self):
def can_use_continuous_batching(self) -> bool:
"""
Set configuration for continuous batching, currently all vllm implementations are continuous batching
and batch size greater than 1 for tnx and lmi-dist support rolling batch.
and batch size greater than 1 for tnx support rolling batch.

:return: bool indicating if continuous batching can be used
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ def set_quantize_for_backward_compatibility(self):
self.quantize = "bitsandbytes8"

# TODO remove this after refactor of all handlers
# parsing bitsandbytes8, so it can be directly passed to lmi dist model loader.
# parsing bitsandbytes8, so it can be directly passed to vllm model loader.
if self.quantize == "bitsandbytes8" \
and self.rolling_batch == RollingBatchEnum.lmidist:
and self.rolling_batch == RollingBatchEnum.vllm:
self.quantize = "bitsandbytes"
return self

Expand Down Expand Up @@ -123,9 +123,8 @@ def construct_kwargs_quantize(self):
return self

# TODO remove this after refactor of all handlers
# device map is not required for lmi dist and vllm
# device map is not required for vllm
if self.rolling_batch in {
RollingBatchEnum.lmidist,
RollingBatchEnum.vllm,
}:
return self
Expand Down
Loading
Loading