EmbeddedLLM
diff --git a/‎.buildkite/check-wheel-size.py‎
Lines changed: 4 additions & 2 deletions b/‎.buildkite/check-wheel-size.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎.buildkite/generate_index.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/generate_index.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/lm-eval-harness/test_lm_eval_correctness.py‎
Lines changed: 1 addition & 0 deletions b/‎.buildkite/lm-eval-harness/test_lm_eval_correctness.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/download-tokenizer.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/download-tokenizer.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py‎
Lines changed: 2 additions & 0 deletions b/‎.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.buildkite/release-pipeline.yaml‎
Lines changed: 7 additions & 2 deletions b/‎.buildkite/release-pipeline.yaml‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎.buildkite/run-gh200-test.sh‎
Lines changed: 2 additions & 2 deletions b/‎.buildkite/run-gh200-test.sh‎
Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import os
 
 
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 from pathlib import Path
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 from transformers import AutoTokenizer
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from pathlib import Path
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from lmdeploy.serve.openai.api_client import APIClient
 
 api_client = APIClient("http://localhost:8000")
 
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import os
 
@@ -56,6 +56,11 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - input: "Provide Release version here"
+    fields:
+      - text: "What is the release version?"
+        key: "release-version"
+
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -66,7 +71,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic.py
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
 '
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+`
`1`	`3`	`import argparse`
`2`	`4`	`import os`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
`1`	`2`	`"""`
`2`	`3`	`LM eval harness on model to compare vs HF baseline computed offline.`
`3`	`4`	`Configs are found in configs/$MODEL.yaml`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+`
`1`	`3`	`import json`
`2`	`4`	`import os`
`3`	`5`	`from pathlib import Path`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+`
`1`	`3`	`from lmdeploy.serve.openai.api_client import APIClient`
`2`	`4`
`3`	`5`	`api_client = APIClient("http://localhost:8000")`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+`
`1`	`3`	`import datetime`
`2`	`4`	`import json`
`3`	`5`	`import os`
Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,6 @@ trap remove_docker_container EXIT`
`23`	`23`	`remove_docker_container`
`24`	`24`
`25`	`25`	`# Run the image and test offline inference`
`26`		`-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '`
`27`		`- python3 examples/offline_inference/basic.py`
	`26`	`+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '`
	`27`	`+ python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B`
`28`	`28`	`'`