-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path cloudbuild.yaml
More file actions
56 lines (50 loc) · 1.63 KB
/
cloudbuild.yaml
File metadata and controls
56 lines (50 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Cloud Build pipeline: build the LLM inference Docker image, push it to
# Artifact Registry, then deploy it to Cloud Run with an NVIDIA L4 GPU.
# $PROJECT_ID is a built-in Cloud Build substitution.
---
steps:
  # Step 1: Build Docker image from docker/Dockerfile, tagged for
  # the Artifact Registry repo in asia-southeast1.
  - name: gcr.io/cloud-builders/docker
    id: build
    args:
      - build
      - -t
      - asia-southeast1-docker.pkg.dev/$PROJECT_ID/llm-cloud-inference-se1/llm-cloud-inference:latest
      - -f
      - docker/Dockerfile
      - .

  # Step 2: Push to Artifact Registry so Cloud Run can pull the image.
  - name: gcr.io/cloud-builders/docker
    id: push
    args:
      - push
      - asia-southeast1-docker.pkg.dev/$PROJECT_ID/llm-cloud-inference-se1/llm-cloud-inference:latest

  # Step 3: Deploy the pushed image to Cloud Run as service "llm-api".
  - name: gcr.io/cloud-builders/gcloud
    id: deploy
    args:
      - run
      - deploy
      - llm-api
      - --image=asia-southeast1-docker.pkg.dev/$PROJECT_ID/llm-cloud-inference-se1/llm-cloud-inference:latest
      - --region=asia-southeast1
      - --platform=managed
      # One NVIDIA L4 GPU per instance.
      - --gpu=1
      - --gpu-type=nvidia-l4
      - --cpu=8
      - --memory=32Gi
      # Request timeout: 3600 s = 1 hour, for long-running inference calls.
      - --timeout=3600
      # Scale to zero when idle; cap at a single GPU instance.
      - --min-instances=0
      - --max-instances=1
      - --port=8080
      # Extra CPU during startup, and CPU always allocated.
      # NOTE(review): Cloud Run GPU deployments require --no-cpu-throttling —
      # confirm against current gcloud run deploy docs before removing.
      - --cpu-boost
      - --no-cpu-throttling
      - --set-env-vars=MODEL_NAME=qwen3-8b-awq,GCS_BUCKET=$PROJECT_ID-models
      # Runtime identity of the Cloud Run service (e.g. for GCS model access).
      - --service-account=llm-inference@$PROJECT_ID.iam.gserviceaccount.com
      # Require IAM authentication on invocations — no public access.
      - --no-allow-unauthenticated

# Build timeout (2400 s = 40 mins for vLLM Docker build - large base image).
timeout: 2400s

# Store images in Artifact Registry (recorded in the build results).
images:
  - asia-southeast1-docker.pkg.dev/$PROJECT_ID/llm-cloud-inference-se1/llm-cloud-inference:latest

options:
  machineType: E2_HIGHCPU_8
  # NOTE(review): a user-specified build serviceAccount typically requires
  # CLOUD_LOGGING_ONLY (or an explicit logs bucket) — confirm if changing.
  logging: CLOUD_LOGGING_ONLY
  substitutionOption: 'ALLOW_LOOSE'

# Identity the build itself runs as (distinct from the Cloud Run runtime
# service account above, though the same account is reused here).
serviceAccount: 'projects/llm-inference-31155/serviceAccounts/llm-inference@llm-inference-31155.iam.gserviceaccount.com'