Skip to content

Commit 983f267

Browse files
authored
Merge branch 'main' into docs/milvus-kubernetes
2 parents 6e40b71 + 69ee0b3 commit 983f267

32 files changed

+1717
-136
lines changed

.github/workflows/docker-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ concurrency:
3636
jobs:
3737
# Build job for multi-architecture Docker images
3838
build_multiarch:
39-
if: github.repository == 'vllm-project/semantic-router'
39+
if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
4040
runs-on: ubuntu-latest
4141
permissions:
4242
contents: read

.github/workflows/helm-publish.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ name: Publish Helm Chart
22

33
on:
44
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review]
56
branches:
67
- main
78
paths:
@@ -22,6 +23,7 @@ env:
2223
jobs:
2324
validate-and-package:
2425
name: Validate and Package
26+
if: ${{ !github.event.pull_request.draft }}
2527
runs-on: ubuntu-latest
2628
permissions:
2729
contents: read

.github/workflows/integration-test-docker.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ name: Integration Test [Docker Compose]
22

33
on:
44
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review]
56
branches:
67
- main
78
paths-ignore:
@@ -21,7 +22,7 @@ concurrency:
2122

2223
jobs:
2324
test-quickstart:
24-
if: github.repository == 'vllm-project/semantic-router'
25+
if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
2526
runs-on: ubuntu-latest
2627
timeout-minutes: 30
2728

.github/workflows/integration-test-dynamic-config.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ name: Integration Test [Dynamic Config]
22

33
on:
44
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review]
56
branches:
67
- main
78
paths-ignore:
@@ -21,6 +22,7 @@ concurrency:
2122

2223
jobs:
2324
integration-test:
25+
if: ${{ !github.event.pull_request.draft }}
2426
runs-on: ubuntu-latest
2527
timeout-minutes: 180
2628

.github/workflows/integration-test-helm.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ on:
88
- 'website/**'
99
- '**/*.md'
1010
pull_request:
11+
types: [opened, synchronize, reopened, ready_for_review]
1112
branches:
1213
- main
1314
paths-ignore:
@@ -30,6 +31,7 @@ jobs:
3031
# Lint and validate Helm chart
3132
lint-chart:
3233
name: Lint Helm Chart
34+
if: ${{ !github.event.pull_request.draft }}
3335
runs-on: ubuntu-latest
3436
steps:
3537
- name: Checkout code

.github/workflows/integration-test-k8s.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ name: Integration Test [Kubernetes]
22

33
on:
44
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review]
56
branches:
67
- main
78
paths-ignore:
@@ -21,12 +22,13 @@ concurrency:
2122

2223
jobs:
2324
integration-test:
25+
if: ${{ !github.event.pull_request.draft }}
2426
runs-on: ubuntu-latest
2527
timeout-minutes: 75
2628
strategy:
2729
fail-fast: false # Continue testing other profiles even if one fails
2830
matrix:
29-
profile: [ai-gateway, aibrix, routing-strategies, llm-d, istio]
31+
profile: [ai-gateway, aibrix, routing-strategies, llm-d, istio, production-stack]
3032

3133
steps:
3234
- name: Check out the repo
@@ -75,6 +77,17 @@ jobs:
7577
run: |
7678
make build-e2e
7779
80+
- name: Free up disk space
81+
run: |
82+
# Remove unnecessary toolchains to free ~25GB disk space
83+
# This helps prevent "no space left on device" errors
84+
echo "Disk before cleanup:"
85+
df -h /
86+
# Note: Do NOT remove $AGENT_TOOLSDIRECTORY - it contains Go/Rust from setup actions
87+
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
88+
echo "Disk after cleanup:"
89+
df -h /
90+
7891
- name: Run Integration E2E tests (${{ matrix.profile }})
7992
id: e2e-test
8093
run: |

.github/workflows/owner-notification.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ name: Owner Notification
22

33
on:
44
pull_request_target:
5-
types: [assigned, opened, reopened, synchronize]
5+
types: [assigned, opened, reopened, synchronize, ready_for_review]
66

77
jobs:
88
notify-owners:
9-
if: ${{ github.repository == 'vllm-project/semantic-router' }}
9+
if: ${{ github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft }}
1010
runs-on: ubuntu-latest
1111
permissions:
1212
contents: read

.github/workflows/precommit-publish.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ on:
55
paths:
66
- 'Dockerfile.precommit'
77
pull_request:
8+
types: [opened, synchronize, reopened, ready_for_review]
89
paths:
910
- 'Dockerfile.precommit'
1011

@@ -14,7 +15,7 @@ concurrency:
1415

1516
jobs:
1617
build_and_push:
17-
if: github.repository == 'vllm-project/semantic-router'
18+
if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
1819
runs-on: ubuntu-latest
1920
permissions:
2021
contents: read

.github/workflows/test-and-build.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ name: Test And Build
22

33
on:
44
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review]
56
branches:
67
- main
78
push:
@@ -24,13 +25,14 @@ jobs:
2425
test-and-build:
2526
needs: changes
2627
if: >-
27-
${{ github.event_name == 'schedule'
28+
${{ !github.event.pull_request.draft
29+
&& (github.event_name == 'schedule'
2830
|| needs.changes.outputs.core == 'true'
2931
|| needs.changes.outputs.helm == 'true'
3032
|| needs.changes.outputs.e2e == 'true'
3133
|| needs.changes.outputs.docker == 'true'
3234
|| needs.changes.outputs.make == 'true'
33-
|| needs.changes.outputs.ci == 'true' }}
35+
|| needs.changes.outputs.ci == 'true') }}
3436
runs-on: ubuntu-latest
3537

3638
steps:

candle-binding/semantic-router_test.go

Lines changed: 34 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,59 +1478,55 @@ func TestGetEmbeddingSmart(t *testing.T) {
14781478
// Initialize embedding models first
14791479
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
14801480
if err != nil {
1481-
if isModelInitializationError(err) {
1482-
t.Skipf("Skipping GetEmbeddingSmart tests due to model initialization error: %v", err)
1483-
}
14841481
t.Fatalf("Failed to initialize embedding models: %v", err)
14851482
}
14861483

14871484
t.Run("ShortTextHighLatency", func(t *testing.T) {
1488-
// Short text with high latency priority should use Traditional BERT
1485+
// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
14891486
text := "Hello world"
14901487
embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)
14911488

14921489
if err != nil {
1493-
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
1494-
// This is expected since we're using placeholder implementation
1495-
return
1490+
t.Fatalf("GetEmbeddingSmart failed: %v", err)
14961491
}
14971492

1498-
if len(embedding) != 768 {
1499-
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
1493+
// Expect Qwen3 (1024) dimension since Gemma is not available
1494+
if len(embedding) != 1024 {
1495+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15001496
}
15011497

15021498
t.Logf("Short text embedding generated: dim=%d", len(embedding))
15031499
})
15041500

15051501
t.Run("MediumTextBalanced", func(t *testing.T) {
1506-
// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
1502+
// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
15071503
text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
15081504
embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)
15091505

15101506
if err != nil {
15111507
t.Fatalf("GetEmbeddingSmart failed: %v", err)
15121508
}
15131509

1514-
// Accept both Qwen3 (1024) and Gemma (768) dimensions
1515-
if len(embedding) != 768 && len(embedding) != 1024 {
1516-
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
1510+
// Expect Qwen3 (1024) dimension since Gemma is not available
1511+
if len(embedding) != 1024 {
1512+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15171513
}
15181514

15191515
t.Logf("Medium text embedding generated: dim=%d", len(embedding))
15201516
})
15211517

15221518
t.Run("LongTextHighQuality", func(t *testing.T) {
1523-
// Long text with high quality priority should use Qwen3
1519+
// Long text with high quality priority should use Qwen3 (1024)
15241520
text := strings.Repeat("This is a very long document that requires Qwen3's 32K context support. ", 50)
15251521
embedding, err := GetEmbeddingSmart(text, 0.9, 0.2)
15261522

15271523
if err != nil {
1528-
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
1529-
return
1524+
t.Fatalf("GetEmbeddingSmart failed: %v", err)
15301525
}
15311526

1532-
if len(embedding) != 768 {
1533-
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
1527+
// Expect Qwen3 (1024) dimension
1528+
if len(embedding) != 1024 {
1529+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15341530
}
15351531

15361532
t.Logf("Long text embedding generated: dim=%d", len(embedding))
@@ -1573,9 +1569,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
15731569
return
15741570
}
15751571

1576-
// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
1577-
if len(embedding) != 768 && len(embedding) != 1024 {
1578-
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
1572+
// Expect Qwen3 (1024) since Gemma is not available
1573+
if len(embedding) != 1024 {
1574+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15791575
}
15801576
t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
15811577
})
@@ -1598,9 +1594,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
15981594
continue
15991595
}
16001596

1601-
// Smart routing may select Qwen3 (1024) or Gemma (768)
1602-
if len(embedding) != 768 && len(embedding) != 1024 {
1603-
t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
1597+
// Expect Qwen3 (1024) since Gemma is not available
1598+
if len(embedding) != 1024 {
1599+
t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
16041600
}
16051601

16061602
// Verify no nil pointers
@@ -1639,11 +1635,12 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
16391635
}
16401636

16411637
// Test constants for embedding models (Phase 4.2)
1638+
// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
16421639
const (
16431640
Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
1644-
GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
1641+
GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
16451642
TestEmbeddingText = "This is a test sentence for embedding generation"
1646-
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
1643+
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3"
16471644
)
16481645

16491646
// Test constants for Qwen3 Multi-LoRA
@@ -1705,23 +1702,8 @@ func TestInitEmbeddingModels(t *testing.T) {
17051702
})
17061703

17071704
t.Run("InitGemmaOnly", func(t *testing.T) {
1708-
// Similar to InitBothModels, accept already-initialized state
1709-
err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
1710-
if err != nil {
1711-
t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)
1712-
1713-
// Verify functionality
1714-
_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
1715-
if testErr == nil {
1716-
t.Log("✓ ModelFactory is functional (already initialized)")
1717-
} else {
1718-
if isModelInitializationError(testErr) {
1719-
t.Skipf("Skipping test due to model unavailability: %v", testErr)
1720-
}
1721-
}
1722-
} else {
1723-
t.Log("✓ Gemma model initialized successfully")
1724-
}
1705+
// Gemma is a gated model requiring HF_TOKEN, skip in CI
1706+
t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
17251707
})
17261708

17271709
t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1739,9 +1721,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
17391721
// Initialize embedding models first
17401722
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
17411723
if err != nil {
1742-
if isModelInitializationError(err) {
1743-
t.Skipf("Skipping GetEmbeddingWithDim tests due to model initialization error: %v", err)
1744-
}
17451724
t.Fatalf("Failed to initialize embedding models: %v", err)
17461725
}
17471726

@@ -1806,16 +1785,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {
18061785

18071786
t.Run("OversizedDimension", func(t *testing.T) {
18081787
// Test graceful degradation when requested dimension exceeds model capacity
1809-
// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
1788+
// Qwen3: 1024, so 2048 should fall back to full dimension
18101789
embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
18111790
if err != nil {
18121791
t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
18131792
return
18141793
}
18151794

1816-
// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
1817-
if len(embedding) != 1024 && len(embedding) != 768 {
1818-
t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
1795+
// Should return full dimension (1024 for Qwen3)
1796+
if len(embedding) != 1024 {
1797+
t.Errorf("Expected full dimension (1024), got %d", len(embedding))
18191798
} else {
18201799
t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
18211800
}
@@ -1841,9 +1820,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
18411820
func TestEmbeddingConsistency(t *testing.T) {
18421821
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
18431822
if err != nil {
1844-
if isModelInitializationError(err) {
1845-
t.Skipf("Skipping consistency tests due to model initialization error: %v", err)
1846-
}
18471823
t.Fatalf("Failed to initialize embedding models: %v", err)
18481824
}
18491825

@@ -1911,12 +1887,11 @@ func TestEmbeddingConsistency(t *testing.T) {
19111887
func TestEmbeddingPriorityRouting(t *testing.T) {
19121888
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
19131889
if err != nil {
1914-
if isModelInitializationError(err) {
1915-
t.Skipf("Skipping priority routing tests due to model initialization error: %v", err)
1916-
}
19171890
t.Fatalf("Failed to initialize embedding models: %v", err)
19181891
}
19191892

1893+
// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
1894+
// The dimension is truncated from Qwen3's full 1024 dimensions
19201895
testCases := []struct {
19211896
name string
19221897
text string
@@ -1931,23 +1906,23 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
19311906
qualityPriority: 0.2,
19321907
latencyPriority: 0.9,
19331908
expectedDim: 768,
1934-
description: "Should prefer faster embedding model (Gemma > Qwen3)",
1909+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19351910
},
19361911
{
19371912
name: "HighQualityPriority",
19381913
text: strings.Repeat("Long context text ", 30),
19391914
qualityPriority: 0.9,
19401915
latencyPriority: 0.2,
19411916
expectedDim: 768,
1942-
description: "Should prefer quality model (Qwen3/Gemma)",
1917+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19431918
},
19441919
{
19451920
name: "BalancedPriority",
19461921
text: "Medium length text for embedding",
19471922
qualityPriority: 0.5,
19481923
latencyPriority: 0.5,
19491924
expectedDim: 768,
1950-
description: "Should select based on text length",
1925+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19511926
},
19521927
}
19531928

0 commit comments

Comments
 (0)