diff --git a/src/services/code-index/embedders/__tests__/openai-compatible.spec.ts b/src/services/code-index/embedders/__tests__/openai-compatible.spec.ts index e1e5c64cd5..5c7c44a634 100644 --- a/src/services/code-index/embedders/__tests__/openai-compatible.spec.ts +++ b/src/services/code-index/embedders/__tests__/openai-compatible.spec.ts @@ -1,4 +1,4 @@ -import { vitest, describe, it, expect, beforeEach, afterEach } from "vitest" +import { vitest, describe, it, expect, beforeEach, afterEach, vi } from "vitest" import type { MockedClass, MockedFunction } from "vitest" import { OpenAI } from "openai" import { OpenAICompatibleEmbedder } from "../openai-compatible" @@ -110,6 +110,7 @@ describe("OpenAICompatibleEmbedder", () => { expect(mockEmbeddingsCreate).toHaveBeenCalledWith({ input: testTexts, model: testModelId, + encoding_format: "base64", }) expect(result).toEqual({ embeddings: [[0.1, 0.2, 0.3]], @@ -130,6 +131,7 @@ describe("OpenAICompatibleEmbedder", () => { expect(mockEmbeddingsCreate).toHaveBeenCalledWith({ input: testTexts, model: testModelId, + encoding_format: "base64", }) expect(result).toEqual({ embeddings: [ @@ -154,6 +156,7 @@ describe("OpenAICompatibleEmbedder", () => { expect(mockEmbeddingsCreate).toHaveBeenCalledWith({ input: testTexts, model: customModel, + encoding_format: "base64", }) }) @@ -173,6 +176,97 @@ describe("OpenAICompatibleEmbedder", () => { }) }) + /** + * Test base64 conversion logic + */ + describe("base64 conversion", () => { + it("should convert base64 encoded embeddings to float arrays", async () => { + const testTexts = ["Hello world"] + + // Create a Float32Array with test values that can be exactly represented in Float32 + const testEmbedding = new Float32Array([0.25, 0.5, 0.75, 1.0]) + + // Convert to base64 string (simulating what OpenAI API returns) + const buffer = Buffer.from(testEmbedding.buffer) + const base64String = buffer.toString("base64") + + const mockResponse = { + data: [{ embedding: base64String }], // Base64 string instead of array + usage: { prompt_tokens: 10, total_tokens: 15 }, + } + mockEmbeddingsCreate.mockResolvedValue(mockResponse) + + const result = await embedder.createEmbeddings(testTexts) + + expect(mockEmbeddingsCreate).toHaveBeenCalledWith({ + input: testTexts, + model: testModelId, + encoding_format: "base64", + }) + + // Verify the base64 string was converted back to the original float array + expect(result).toEqual({ + embeddings: [[0.25, 0.5, 0.75, 1.0]], + usage: { promptTokens: 10, totalTokens: 15 }, + }) + }) + + it("should handle multiple base64 encoded embeddings", async () => { + const testTexts = ["Hello world", "Goodbye world"] + + // Create test embeddings with values that can be exactly represented in Float32 + const embedding1 = new Float32Array([0.25, 0.5, 0.75]) + const embedding2 = new Float32Array([1.0, 1.25, 1.5]) + + // Convert to base64 strings + const base64String1 = Buffer.from(embedding1.buffer).toString("base64") + const base64String2 = Buffer.from(embedding2.buffer).toString("base64") + + const mockResponse = { + data: [{ embedding: base64String1 }, { embedding: base64String2 }], + usage: { prompt_tokens: 20, total_tokens: 30 }, + } + mockEmbeddingsCreate.mockResolvedValue(mockResponse) + + const result = await embedder.createEmbeddings(testTexts) + + expect(result).toEqual({ + embeddings: [ + [0.25, 0.5, 0.75], + [1.0, 1.25, 1.5], + ], + usage: { promptTokens: 20, totalTokens: 30 }, + }) + }) + + it("should handle mixed base64 and array embeddings", async () => { + const testTexts = ["Hello world", "Goodbye world"] + + // Create one base64 embedding and one regular array (edge case) + const embedding1 = new Float32Array([0.25, 0.5, 0.75]) + const base64String1 = Buffer.from(embedding1.buffer).toString("base64") + + const mockResponse = { + data: [ + { embedding: base64String1 }, // Base64 string + { embedding: [1.0, 1.25, 1.5] }, // Regular array + ], + usage: { prompt_tokens: 20, total_tokens: 30 }, + } + mockEmbeddingsCreate.mockResolvedValue(mockResponse) + + const result = await embedder.createEmbeddings(testTexts) + + expect(result).toEqual({ + embeddings: [ + [0.25, 0.5, 0.75], + [1.0, 1.25, 1.5], + ], + usage: { promptTokens: 20, totalTokens: 30 }, + }) + }) + }) + /** * Test batching logic when texts exceed token limits */ @@ -249,11 +343,15 @@ describe("OpenAICompatibleEmbedder", () => { const testTexts = ["Hello world"] const rateLimitError = { status: 429, message: "Rate limit exceeded" } + // Create base64 encoded embedding for successful response + const testEmbedding = new Float32Array([0.25, 0.5, 0.75]) + const base64String = Buffer.from(testEmbedding.buffer).toString("base64") + mockEmbeddingsCreate .mockRejectedValueOnce(rateLimitError) .mockRejectedValueOnce(rateLimitError) .mockResolvedValueOnce({ - data: [{ embedding: [0.1, 0.2, 0.3] }], + data: [{ embedding: base64String }], usage: { prompt_tokens: 10, total_tokens: 15 }, }) @@ -268,7 +366,7 @@ describe("OpenAICompatibleEmbedder", () => { expect(mockEmbeddingsCreate).toHaveBeenCalledTimes(3) expect(console.warn).toHaveBeenCalledWith(expect.stringContaining("Rate limit hit, retrying in")) expect(result).toEqual({ - embeddings: [[0.1, 0.2, 0.3]], + embeddings: [[0.25, 0.5, 0.75]], usage: { promptTokens: 10, totalTokens: 15 }, }) }) @@ -360,5 +458,84 @@ describe("OpenAICompatibleEmbedder", () => { await expect(embedder.createEmbeddings(testTexts)).rejects.toThrow() }) }) + + /** + * Test to confirm OpenAI package bug with base64 encoding + * This test verifies that when we request encoding_format: "base64", + * the OpenAI package returns unparsed base64 strings as expected. + * This is the behavior we rely on in our workaround. + */ + describe("OpenAI package base64 behavior verification", () => { + it("should return unparsed base64 when encoding_format is base64", async () => { + const testTexts = ["Hello world"] + + // Create a real OpenAI instance to test the actual package behavior + const realOpenAI = new ((await vi.importActual("openai")) as any).OpenAI({ + baseURL: testBaseUrl, + apiKey: testApiKey, + }) + + // Create test embedding data as base64 using values that can be exactly represented in Float32 + const testEmbedding = new Float32Array([0.25, 0.5, 0.75, 1.0]) + const buffer = Buffer.from(testEmbedding.buffer) + const base64String = buffer.toString("base64") + + // Mock the raw API response that would come from OpenAI + const mockApiResponse = { + data: [ + { + object: "embedding", + embedding: base64String, // Raw base64 string from API + index: 0, + }, + ], + model: "text-embedding-3-small", + object: "list", + usage: { + prompt_tokens: 2, + total_tokens: 2, + }, + } + + // Mock the methodRequest method which is called by post() + const mockMethodRequest = vi.fn() + const mockAPIPromise = { + then: vi.fn().mockImplementation((callback) => { + return Promise.resolve(callback(mockApiResponse)) + }), + catch: vi.fn(), + finally: vi.fn(), + } + mockMethodRequest.mockReturnValue(mockAPIPromise) + + // Replace the methodRequest method on the client + ;(realOpenAI as any).post = vi.fn().mockImplementation((path, opts) => { + return mockMethodRequest("post", path, opts) + }) + + // Call the embeddings.create method with base64 encoding + const response = await realOpenAI.embeddings.create({ + input: testTexts, + model: "text-embedding-3-small", + encoding_format: "base64", + }) + + // Verify that the response contains the raw base64 string + // This confirms the OpenAI package doesn't parse base64 when explicitly requested + expect(response.data[0].embedding).toBe(base64String) + expect(typeof response.data[0].embedding).toBe("string") + + // Verify we can manually convert it back to the original float array + const returnedBuffer = Buffer.from(response.data[0].embedding as string, "base64") + const returnedFloat32Array = new Float32Array( + returnedBuffer.buffer, + returnedBuffer.byteOffset, + returnedBuffer.byteLength / 4, + ) + const returnedArray = Array.from(returnedFloat32Array) + + expect(returnedArray).toEqual([0.25, 0.5, 0.75, 1.0]) + }) + }) }) }) diff --git a/src/services/code-index/embedders/openai-compatible.ts b/src/services/code-index/embedders/openai-compatible.ts index 421cb7262c..b7e4079569 100644 --- a/src/services/code-index/embedders/openai-compatible.ts +++ b/src/services/code-index/embedders/openai-compatible.ts @@ -8,6 +8,19 @@ import { } from "../constants" import { getDefaultModelId } from "../../../shared/embeddingModels" +interface EmbeddingItem { + embedding: string | number[] + [key: string]: any +} + +interface OpenAIEmbeddingResponse { + data: EmbeddingItem[] + usage?: { + prompt_tokens?: number + total_tokens?: number + } +} + /** * OpenAI Compatible implementation of the embedder interface with batching and rate limiting. * This embedder allows using any OpenAI-compatible API endpoint by specifying a custom baseURL. @@ -108,13 +121,38 @@ export class OpenAICompatibleEmbedder implements IEmbedder { ): Promise<{ embeddings: number[][]; usage: { promptTokens: number; totalTokens: number } }> { for (let attempts = 0; attempts < MAX_RETRIES; attempts++) { try { - const response = await this.embeddingsClient.embeddings.create({ + const response = (await this.embeddingsClient.embeddings.create({ input: batchTexts, model: model, + // OpenAI package (as of v4.78.1) has a parsing issue that truncates embedding dimensions to 256 + // when processing numeric arrays, which breaks compatibility with models using larger dimensions. + // By requesting base64 encoding, we bypass the package's parser and handle decoding ourselves. + encoding_format: "base64", + })) as OpenAIEmbeddingResponse + + // Convert base64 embeddings to float32 arrays + const processedEmbeddings = response.data.map((item: EmbeddingItem) => { + if (typeof item.embedding === "string") { + const buffer = Buffer.from(item.embedding, "base64") + + // Create Float32Array view over the buffer + const float32Array = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 4) + + return { + ...item, + embedding: Array.from(float32Array), + } + } + return item }) + // Replace the original data with processed embeddings + response.data = processedEmbeddings + + const embeddings = response.data.map((item) => item.embedding as number[]) + return { - embeddings: response.data.map((item) => item.embedding), + embeddings: embeddings, usage: { promptTokens: response.usage?.prompt_tokens || 0, totalTokens: response.usage?.total_tokens || 0,