
Commit 2605059

Tokenization unit tests (#90)
* tokenizer unit tests
* add tokenizer pool tests
* lint
* test internals
* lint
* lint
* lint
* tidy

Signed-off-by: Sage Ahrac <[email protected]>
1 parent 107e3a9 commit 2605059
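
Note: every new test that loads the real google-bert/bert-base-uncased tokenizer guards itself with testing.Short(), so a go test -short run skips the network-dependent cases.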

File tree

3 files changed: 248 additions & 0 deletions


go.mod

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ require (
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/common v0.62.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
+	github.com/stretchr/objx v0.5.2 // indirect
 	github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	github.com/yuin/gopher-lua v1.1.1 // indirect
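
The single go.mod change is the new github.com/stretchr/objx entry, an indirect dependency pulled in by testify's mock package, which the new tests below rely on.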

pkg/tokenization/pool_test.go

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
/*
Copyright 2025 The llm-d Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

//nolint:testpackage // need to test internal types
package tokenization

import (
    "context"
    "testing"
    "time"

    "github.com/daulet/tokenizers"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/mock"
    "github.com/stretchr/testify/require"
)

// MockTokenizer implements the Tokenizer interface for testing.
type MockTokenizer struct {
    mock.Mock
}

func (m *MockTokenizer) Encode(input, modelName string) ([]uint32, []tokenizers.Offset, error) {
    args := m.Called(input, modelName)
    return args.Get(0).([]uint32), args.Get(1).([]tokenizers.Offset), args.Error(2) //nolint:errcheck // return mocked values
}

// MockIndexer implements the prefixstore.Indexer interface for testing.
type MockIndexer struct {
    mock.Mock
}

func (m *MockIndexer) AddTokenization(modelName, prompt string, tokens []uint32, offsets []tokenizers.Offset) error {
    args := m.Called(modelName, prompt, tokens, offsets)
    return args.Error(0)
}

func (m *MockIndexer) FindLongestContainedTokens(prompt, modelName string) []uint32 {
    args := m.Called(prompt, modelName)
    return args.Get(0).([]uint32) //nolint:errcheck // unused mock
}

func TestPool_ProcessTask(t *testing.T) {
    mockIndexer := &MockIndexer{}
    mockTokenizer := &MockTokenizer{}

    pool := &Pool{
        workers:   1,
        indexer:   mockIndexer,
        tokenizer: mockTokenizer,
    }

    task := Task{
        Prompt:    "hello world",
        ModelName: testModelName,
    }

    // Set up specific mock return values.
    expectedTokens := []uint32{12345, 67890, 11111}
    expectedOffsets := []tokenizers.Offset{{0, 5}, {6, 11}}

    mockTokenizer.On("Encode", task.Prompt, task.ModelName).Return(expectedTokens, expectedOffsets, nil)

    // Verify that the indexer receives exactly the tokens and offsets the tokenizer returned.
    mockIndexer.On("AddTokenization", task.ModelName, task.Prompt, expectedTokens, expectedOffsets).Return(nil)

    // Execute
    err := pool.processTask(task)

    // Assert
    assert.NoError(t, err)
    mockTokenizer.AssertExpectations(t)
    mockIndexer.AssertExpectations(t)
}

func TestPool_RunIntegration(t *testing.T) {
    if testing.Short() {
        t.Skip("Skipping tokenizer integration test in short mode")
    }

    mockIndexer := &MockIndexer{}

    prompts := []string{"hello world", "this is a test", "unicode test: 世界"}

    // Set up mock expectations for each prompt.
    for _, prompt := range prompts {
        mockIndexer.On("AddTokenization", testModelName, prompt,
            mock.Anything, mock.Anything).Return(nil).Once()
    }

    config := &Config{
        WorkersCount: 2,
        HFTokenizerConfig: &HFTokenizerConfig{
            TokenizersCacheDir: t.TempDir(),
        },
    }

    pool, err := NewTokenizationPool(config, mockIndexer)
    require.NoError(t, err)

    // Create a context for the pool.
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    for _, prompt := range prompts {
        pool.AddTask(prompt, testModelName)
    }

    // Run the pool.
    done := make(chan struct{})
    go func() {
        defer close(done)
        pool.Run(ctx)
    }()

    time.Sleep(2 * time.Second)
    cancel()
    <-done

    mockIndexer.AssertExpectations(t)
}
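
For orientation, the two mocks above imply interfaces along these lines. This is a sketch inferred from the mocks' method sets, not the package's actual declarations:

// Inferred sketch only: reconstructed from MockTokenizer and MockIndexer above.
// The real Tokenizer lives in this package and the real Indexer in a
// prefixstore package (per the mock's doc comment); declarations may differ.
type Tokenizer interface {
    Encode(input, modelName string) ([]uint32, []tokenizers.Offset, error)
}

type Indexer interface {
    AddTokenization(modelName, prompt string, tokens []uint32, offsets []tokenizers.Offset) error
    FindLongestContainedTokens(prompt, modelName string) []uint32
}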

pkg/tokenization/tokenizer_test.go

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
/*
Copyright 2025 The llm-d Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

//nolint:testpackage // need to test internal types
package tokenization

import (
    "testing"

    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

// Tests that load this real model are skipped in fast (-short) unit-test runs.
const testModelName = "google-bert/bert-base-uncased"

func TestCachedHFTokenizer_Encode(t *testing.T) {
    if testing.Short() {
        t.Skip("Skipping tokenizer integration test in short mode")
    }

    config := &HFTokenizerConfig{
        TokenizersCacheDir: t.TempDir(),
    }
    tokenizer, err := NewCachedHFTokenizer(config)
    require.NoError(t, err)
    require.NotNil(t, tokenizer)

    tests := []struct {
        name      string
        input     string
        modelName string
    }{
        {
            name:      "simple text",
            input:     "hello world",
            modelName: testModelName,
        },
        {
            name:      "empty string",
            input:     "",
            modelName: testModelName,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            tokenIds, offsets, err := tokenizer.Encode(tt.input, tt.modelName)

            assert.NoError(t, err)
            assert.GreaterOrEqual(t, len(tokenIds), 0)
            assert.Equal(t, len(tokenIds), len(offsets))
        })
    }
}

func TestCachedHFTokenizer_CacheTokenizer(t *testing.T) {
    if testing.Short() {
        t.Skip("Skipping tokenizer integration test in short mode")
    }

    tokenizer, err := NewCachedHFTokenizer(&HFTokenizerConfig{
        TokenizersCacheDir: t.TempDir(),
    })
    require.NoError(t, err)
    require.NotNil(t, tokenizer)

    // Test that the same model is cached.
    input := "test input"

    // First call loads the tokenizer.
    tokenIds1, offsets1, err1 := tokenizer.Encode(input, testModelName)
    require.NoError(t, err1)

    // Second call should use the cached tokenizer.
    tokenIds2, offsets2, err2 := tokenizer.Encode(input, testModelName)
    require.NoError(t, err2)

    // Results should be identical.
    assert.Equal(t, tokenIds1, tokenIds2)
    assert.Equal(t, offsets1, offsets2)
}

func TestCachedHFTokenizer_InvalidModel(t *testing.T) {
    if testing.Short() {
        t.Skip("Skipping tokenizer integration test in short mode")
    }

    tokenizer, err := NewCachedHFTokenizer(&HFTokenizerConfig{
        TokenizersCacheDir: t.TempDir(),
    })
    require.NoError(t, err)
    require.NotNil(t, tokenizer)

    // Test with a non-existent model.
    tokenIds, offsets, err := tokenizer.Encode("test", "non-existent/model")
    assert.Error(t, err)
    assert.Nil(t, tokenIds)
    assert.Nil(t, offsets)
}
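
Taken together, these tests exercise a small API surface. Below is a minimal usage sketch assuming only the constructor and Encode signature shown above; the package's import path is not part of this diff, so the snippet is written as if in the same package, and the cache directory is an arbitrary placeholder:

// Usage sketch based solely on the calls made in the tests above.
func exampleEncode() ([]uint32, error) {
    tok, err := NewCachedHFTokenizer(&HFTokenizerConfig{
        TokenizersCacheDir: "/tmp/tokenizers-cache", // placeholder: any writable directory
    })
    if err != nil {
        return nil, err
    }
    // Per TestCachedHFTokenizer_CacheTokenizer, the first Encode for a model
    // loads its tokenizer and later calls for the same model reuse it.
    ids, offsets, err := tok.Encode("hello world", "google-bert/bert-base-uncased")
    if err != nil {
        return nil, err
    }
    _ = offsets // offset ranges into the input, one per token ID
    return ids, nil
}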
