Fix vLLM Gemma, add vLLM extra, fix getting throughput (#36451)

Amar3tto · web-flow · commit 518b118bcaf0 · 2025-10-24T14:55:09.000-04:00
diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Sentiment_Streaming_DistilBert_Base_Uncased.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Sentiment_Streaming_DistilBert_Base_Uncased.txt
@@ -31,5 +31,6 @@
 --device=CPU
 --input_file=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
 --runner=DataflowRunner
+--dataflow_service_options=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver
 --model_path=distilbert-base-uncased-finetuned-sst-2-english
---model_state_dict_path=gs://apache-beam-ml/models/huggingface.sentiment.distilbert-base-uncased.pth
+--model_state_dict_path=gs://apache-beam-ml/models/huggingface.sentiment.distilbert-base-uncased.pth
diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
@@ -20,7 +20,7 @@
 --input=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
 --machine_type=n1-standard-8
 --worker_zone=us-central1-b
---disk_size_gb=50
+--disk_size_gb=200
 --input_options={}
 --num_workers=8
 --max_num_workers=25
@@ -33,4 +33,4 @@
 --influx_measurement=gemma_vllm_batch
 --model_gcs_path=gs://apache-beam-ml/models/gemma-2b-it
 --dataflow_service_options=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver
---experiments=use_runner_v2
+--experiments=use_runner_v2
diff --git a/.github/workflows/refresh_looker_metrics.yml b/.github/workflows/refresh_looker_metrics.yml
@@ -19,19 +19,13 @@ name: Refresh Looker Performance Metrics
 
 on:
   workflow_dispatch:
-    inputs:
-      READ_ONLY:
-        description: 'Run in read-only mode'
-        required: false
-        default: 'true'
 
 env:
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   LOOKERSDK_BASE_URL: ${{ secrets.LOOKERSDK_BASE_URL }}
   LOOKERSDK_CLIENT_ID: ${{ secrets.LOOKERSDK_CLIENT_ID }}
   LOOKERSDK_CLIENT_SECRET: ${{ secrets.LOOKERSDK_CLIENT_SECRET }}
   GCS_BUCKET: 'public_looker_explores_us_a3853f40'
-  READ_ONLY: ${{ inputs.READ_ONLY }}
 
 jobs:
   refresh_looker_metrics:
diff --git a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
@@ -46,23 +46,17 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3 && \
     python3 -m pip install --upgrade pip setuptools wheel
 
 # 4) Copy the Beam SDK harness (for Dataflow workers)
-COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:2.68.0.dev \
+COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:latest \
      /opt/apache/beam /opt/apache/beam
 
 # 5) Make sure the harness is discovered first
 ENV PYTHONPATH=/opt/apache/beam:$PYTHONPATH
 
 # 6) Install the Beam dev SDK from the local source package.
 # This .tar.gz file will be created by GitHub Actions workflow
-# and copied into the build context.
+# and copied into the build context. The vllm extra below pulls in the vLLM dependencies.
 COPY ./sdks/python/build/apache-beam.tar.gz /tmp/beam.tar.gz
-RUN python3 -m pip install --no-cache-dir "/tmp/beam.tar.gz[gcp]"
-
-# 7) Install vLLM, and other dependencies
-RUN python3 -m pip install --no-cache-dir \
-      openai>=1.52.2 \
-      vllm>=0.6.3 \
-      triton>=3.1.0
+RUN python3 -m pip install --no-cache-dir "/tmp/beam.tar.gz[gcp,vllm]"
 
 # 8) Use the Beam boot script as entrypoint
-ENTRYPOINT ["/opt/apache/beam/boot"]
+ENTRYPOINT ["/opt/apache/beam/boot"]
diff --git a/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt b/sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt
@@ -19,4 +19,4 @@ torchvision>=0.8.2
 pillow>=8.0.0
 transformers>=4.18.0
 google-cloud-monitoring>=2.27.0
-openai>=1.52.2
+openai>=1.52.2
diff --git a/sdks/python/apache_beam/testing/benchmarks/inference/vllm_gemma_benchmarks.py b/sdks/python/apache_beam/testing/benchmarks/inference/vllm_gemma_benchmarks.py
@@ -26,7 +26,7 @@ def __init__(self):
     self.metrics_namespace = "BeamML_vLLM"
     super().__init__(
         metrics_namespace=self.metrics_namespace,
-        pcollection="WriteBQ.out0",
+        pcollection="FormatForBQ.out0",
     )
 
   def test(self):
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
@@ -616,7 +616,8 @@ def get_portability_package_data():
           ],
           'xgboost': ['xgboost>=1.6.0,<2.1.3', 'datatable==1.0.0'],
           'tensorflow-hub': ['tensorflow-hub>=0.14.0,<0.16.0'],
-          'milvus': milvus_dependency
+          'milvus': milvus_dependency,
+          'vllm': ['openai==1.107.1', 'vllm==0.10.1.1', 'triton==3.3.1']
       },
       zip_safe=False,
       # PyPI package information.
diff --git a/website/www/site/data/performance.yaml b/website/www/site/data/performance.yaml
@@ -238,15 +238,15 @@ looks:
     write:
       folder: 86
       cost:
-        - id: tJWFWW3cnF2CWpmK2zZdXGvWmtNnJgrC
+        - id: J5TtpRykjwPs4W6S88FnJ28Tr8sSHpqN
           title: RunTime and EstimatedCost
       date:
-        - id: J5TtpRykjwPs4W6S88FnJ28Tr8sSHpqN
+        - id: tJWFWW3cnF2CWpmK2zZdXGvWmtNnJgrC
           title: AvgThroughputBytesPerSec by Date
         - id: Jf6qGqN25Zf787DpkNDX5CBpGRvCGMXp
           title: AvgThroughputElementsPerSec by Date
       version:
-        - id: dKyJy5ZKhkBdSTXRY3wZR6fXzptSs2qm
-          title: AvgThroughputBytesPerSec by Version
         - id: Qwxm27qY4fqT4CxXsFfKm2g3734TFJNN
-          title: AvgThroughputElementsPerSec by Version
+          title: AvgThroughputBytesPerSec by Version
+        - id: dKyJy5ZKhkBdSTXRY3wZR6fXzptSs2qm
+          title: AvgThroughputElementsPerSec by Version

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ def __init__(self):`
`26`	`26`	`self.metrics_namespace = "BeamML_vLLM"`
`27`	`27`	`super().__init__(`
`28`	`28`	`metrics_namespace=self.metrics_namespace,`
`29`		`- pcollection="WriteBQ.out0",`
	`29`	`+ pcollection="FormatForBQ.out0",`
`30`	`30`	`)`
`31`	`31`
`32`	`32`	`def test(self):`