Skip to content

Commit 5039045

Browse files
VectorDB class migration (#4787)
* Migrate Astra to class (#4722)
  migrate astra to class
* Migrate LanceDB to class (#4721)
  migrate lancedb to class
* Migrate Pinecone to class (#4726)
  migrate pinecone to class
* Migrate Zilliz to class (#4729)
  migrate zilliz to class
* Migrate Weaviate to class (#4728)
  migrate weaviate to class
* Migrate Qdrant to class (#4727)
  migrate qdrant to class
* Migrate Milvus to class (#4725)
  migrate milvus to class
* Migrate Chroma to class (#4723)
  migrate chroma to class
* Migrate Chroma Cloud to class (#4724)
* migrate chroma to class
* migrate chroma cloud to class
* move limits to class field
---------
Co-authored-by: Timothy Carambat <[email protected]>
* Migrate PGVector to class (#4730)
* migrate pgvector to class
* patch pgvector test
* convert connectionString, tableName, and validateConnection to static methods
* move instance properties to class fields
---------
Co-authored-by: Timothy Carambat <[email protected]>
* Refactor Zilliz Cloud vector DB provider (#4749)
  simplify zilliz implementation by using milvus as base class
Co-authored-by: Timothy Carambat <[email protected]>
* VectorDatabase base class (#4738)
  create generic VectorDatabase base class
Co-authored-by: Timothy Carambat <[email protected]>
* Extend VectorDatabase base class to all providers (#4755)
  extend VectorDatabase base class to all providers
* patch lancedb import
* breakout name and add generic logger
* dev tag build
---------
Co-authored-by: Timothy Carambat <[email protected]>
1 parent 7c3b790 commit 5039045

File tree

14 files changed

+892
-860
lines changed

14 files changed

+892
-860
lines changed

.github/workflows/dev-build.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ concurrency:
66

77
on:
88
push:
9-
branches: ['4841-aws-bedrock-api-key'] # put your current branch to create a build. Core team only.
9+
branches: ['vectordb-class-migration'] # put your current branch to create a build. Core team only.
1010
paths-ignore:
1111
- '**.md'
1212
- 'cloud-deployments/*'

server/__tests__/utils/vectorDbProviders/pgvector/index.test.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
1+
const { PGVector: PGVectorClass } = require("../../../../utils/vectorDbProviders/pgvector");
2+
3+
const PGVector = new PGVectorClass();
24

35
describe("PGVector.sanitizeForJsonb", () => {
46
it("returns null/undefined as-is", () => {

server/utils/helpers/index.js

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -86,40 +86,40 @@ function getVectorDbClass(getExactly = null) {
8686
switch (vectorSelection) {
8787
case "pinecone":
8888
const { Pinecone } = require("../vectorDbProviders/pinecone");
89-
return Pinecone;
89+
return new Pinecone();
9090
case "chroma":
9191
const { Chroma } = require("../vectorDbProviders/chroma");
92-
return Chroma;
92+
return new Chroma();
9393
case "chromacloud":
9494
const { ChromaCloud } = require("../vectorDbProviders/chromacloud");
95-
return ChromaCloud;
95+
return new ChromaCloud();
9696
case "lancedb":
9797
const { LanceDb } = require("../vectorDbProviders/lance");
98-
return LanceDb;
98+
return new LanceDb();
9999
case "weaviate":
100100
const { Weaviate } = require("../vectorDbProviders/weaviate");
101-
return Weaviate;
101+
return new Weaviate();
102102
case "qdrant":
103103
const { QDrant } = require("../vectorDbProviders/qdrant");
104-
return QDrant;
104+
return new QDrant();
105105
case "milvus":
106106
const { Milvus } = require("../vectorDbProviders/milvus");
107-
return Milvus;
107+
return new Milvus();
108108
case "zilliz":
109109
const { Zilliz } = require("../vectorDbProviders/zilliz");
110-
return Zilliz;
110+
return new Zilliz();
111111
case "astra":
112112
const { AstraDB } = require("../vectorDbProviders/astra");
113-
return AstraDB;
113+
return new AstraDB();
114114
case "pgvector":
115115
const { PGVector } = require("../vectorDbProviders/pgvector");
116-
return PGVector;
116+
return new PGVector();
117117
default:
118118
console.error(
119119
`\x1b[31m[ENV ERROR]\x1b[0m No VECTOR_DB value found in environment! Falling back to LanceDB`
120120
);
121121
const { LanceDb: DefaultLanceDb } = require("../vectorDbProviders/lance");
122-
return DefaultLanceDb;
122+
return new DefaultLanceDb();
123123
}
124124
}
125125

server/utils/vectorDbProviders/astra/index.js

Lines changed: 74 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
55
const { v4: uuidv4 } = require("uuid");
66
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
77
const { sourceIdentifier } = require("../../chats");
8+
const { VectorDatabase } = require("../base");
89

910
const sanitizeNamespace = (namespace) => {
1011
// If namespace already starts with ns_, don't add it again
@@ -22,14 +23,21 @@ const collectionExists = async function (client, namespace) {
2223
return collections.includes(namespace);
2324
}
2425
} catch (error) {
25-
console.log("Astra::collectionExists check error", error?.message || error);
26+
this.logger("collectionExists check error", error?.message || error);
2627
return false; // Return false for any error to allow creation attempt
2728
}
2829
};
2930

30-
const AstraDB = {
31-
name: "AstraDB",
32-
connect: async function () {
31+
class AstraDB extends VectorDatabase {
32+
constructor() {
33+
super();
34+
}
35+
36+
get name() {
37+
return "AstraDB";
38+
}
39+
40+
async connect() {
3341
if (process.env.VECTOR_DB !== "astra")
3442
throw new Error("AstraDB::Invalid ENV settings");
3543

@@ -38,21 +46,24 @@ const AstraDB = {
3846
process?.env?.ASTRA_DB_ENDPOINT
3947
);
4048
return { client };
41-
},
42-
heartbeat: async function () {
49+
}
50+
51+
async heartbeat() {
4352
return { heartbeat: Number(new Date()) };
44-
},
53+
}
54+
4555
// Astra interface will return a valid collection object even if the collection
4656
// does not actually exist. So we run a simple check which will always throw
4757
// when the table truly does not exist. Faster than iterating all collections.
48-
isRealCollection: async function (astraCollection = null) {
58+
async isRealCollection(astraCollection = null) {
4959
if (!astraCollection) return false;
5060
return await astraCollection
5161
.countDocuments()
5262
.then(() => true)
5363
.catch(() => false);
54-
},
55-
totalVectors: async function () {
64+
}
65+
66+
async totalVectors() {
5667
const { client } = await this.connect();
5768
const collectionNames = await this.allNamespaces(client);
5869
var totalVectors = 0;
@@ -62,13 +73,15 @@ const AstraDB = {
6273
totalVectors += count ? count : 0;
6374
}
6475
return totalVectors;
65-
},
66-
namespaceCount: async function (_namespace = null) {
76+
}
77+
78+
async namespaceCount(_namespace = null) {
6779
const { client } = await this.connect();
6880
const namespace = await this.namespace(client, _namespace);
6981
return namespace?.vectorCount || 0;
70-
},
71-
namespace: async function (client, namespace = null) {
82+
}
83+
84+
async namespace(client, namespace = null) {
7285
if (!namespace) throw new Error("No namespace value provided.");
7386
const sanitizedNamespace = sanitizeNamespace(namespace);
7487
const collection = await client
@@ -77,7 +90,7 @@ const AstraDB = {
7790
if (!(await this.isRealCollection(collection))) return null;
7891

7992
const count = await collection.countDocuments().catch((e) => {
80-
console.error("Astra::namespaceExists", e.message);
93+
this.logger("namespaceExists", e.message);
8194
return null;
8295
});
8396

@@ -86,27 +99,31 @@ const AstraDB = {
8699
...collection,
87100
vectorCount: typeof count === "number" ? count : 0,
88101
};
89-
},
90-
hasNamespace: async function (namespace = null) {
102+
}
103+
104+
async hasNamespace(namespace = null) {
91105
if (!namespace) return false;
92106
const { client } = await this.connect();
93107
return await this.namespaceExists(client, namespace);
94-
},
95-
namespaceExists: async function (client, namespace = null) {
108+
}
109+
110+
async namespaceExists(client, namespace = null) {
96111
if (!namespace) throw new Error("No namespace value provided.");
97112
const sanitizedNamespace = sanitizeNamespace(namespace);
98113
const collection = await client.collection(sanitizedNamespace);
99114
return await this.isRealCollection(collection);
100-
},
101-
deleteVectorsInNamespace: async function (client, namespace = null) {
115+
}
116+
117+
async deleteVectorsInNamespace(client, namespace = null) {
102118
const sanitizedNamespace = sanitizeNamespace(namespace);
103119
await client.dropCollection(sanitizedNamespace);
104120
return true;
105-
},
121+
}
122+
106123
// AstraDB requires a dimension aspect for collection creation
107124
// we pass this in from the first chunk to infer the dimensions like other
108125
// providers do.
109-
getOrCreateCollection: async function (client, namespace, dimensions = null) {
126+
async getOrCreateCollection(client, namespace, dimensions = null) {
110127
const sanitizedNamespace = sanitizeNamespace(namespace);
111128
try {
112129
const exists = await collectionExists(client, sanitizedNamespace);
@@ -132,14 +149,12 @@ const AstraDB = {
132149

133150
return await client.collection(sanitizedNamespace);
134151
} catch (error) {
135-
console.error(
136-
"Astra::getOrCreateCollection error",
137-
error?.message || error
138-
);
152+
this.logger("getOrCreateCollection", error?.message || error);
139153
throw error;
140154
}
141-
},
142-
addDocumentToNamespace: async function (
155+
}
156+
157+
async addDocumentToNamespace(
143158
namespace,
144159
documentData = {},
145160
fullFilePath = null,
@@ -151,7 +166,7 @@ const AstraDB = {
151166
const { pageContent, docId, ...metadata } = documentData;
152167
if (!pageContent || pageContent.length == 0) return false;
153168

154-
console.log("Adding new vectorized document into namespace", namespace);
169+
this.logger("Adding new vectorized document into namespace", namespace);
155170
if (!skipCache) {
156171
const cacheResult = await cachedVectorInformation(fullFilePath);
157172
if (cacheResult.exists) {
@@ -210,7 +225,7 @@ const AstraDB = {
210225
});
211226
const textChunks = await textSplitter.splitText(pageContent);
212227

213-
console.log("Snippets created from document:", textChunks.length);
228+
this.logger("Snippets created from document:", textChunks.length);
214229
const documentVectors = [];
215230
const vectors = [];
216231
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@@ -246,7 +261,7 @@ const AstraDB = {
246261
if (vectors.length > 0) {
247262
const chunks = [];
248263

249-
console.log("Inserting vectorized chunks into Astra DB.");
264+
this.logger("Inserting vectorized chunks into Astra DB.");
250265

251266
// AstraDB has maximum upsert size of 20 records per-request so we have to use a lower chunk size here
252267
// in order to do the queries - this takes a lot more time than other providers but there
@@ -266,11 +281,12 @@ const AstraDB = {
266281
await DocumentVectors.bulkInsert(documentVectors);
267282
return { vectorized: true, error: null };
268283
} catch (e) {
269-
console.error("addDocumentToNamespace", e.message);
284+
this.logger("addDocumentToNamespace", e.message);
270285
return { vectorized: false, error: e.message };
271286
}
272-
},
273-
deleteDocumentFromNamespace: async function (namespace, docId) {
287+
}
288+
289+
async deleteDocumentFromNamespace(namespace, docId) {
274290
const { DocumentVectors } = require("../../../models/vectors");
275291
const { client } = await this.connect();
276292
namespace = sanitizeNamespace(namespace);
@@ -293,8 +309,9 @@ const AstraDB = {
293309
const indexes = knownDocuments.map((doc) => doc.id);
294310
await DocumentVectors.deleteIds(indexes);
295311
return true;
296-
},
297-
performSimilaritySearch: async function ({
312+
}
313+
314+
async performSimilaritySearch({
298315
namespace = null,
299316
input = "",
300317
LLMConnector = null,
@@ -336,8 +353,9 @@ const AstraDB = {
336353
sources: this.curateSources(sources),
337354
message: false,
338355
};
339-
},
340-
similarityResponse: async function ({
356+
}
357+
358+
async similarityResponse({
341359
client,
342360
namespace,
343361
queryVector,
@@ -367,8 +385,8 @@ const AstraDB = {
367385
responses.forEach((response) => {
368386
if (response.$similarity < similarityThreshold) return;
369387
if (filterIdentifiers.includes(sourceIdentifier(response.metadata))) {
370-
console.log(
371-
"AstraDB: A source was filtered from context as it's parent document is pinned."
388+
this.logger(
389+
"A source was filtered from context as it's parent document is pinned."
372390
);
373391
return;
374392
}
@@ -380,8 +398,9 @@ const AstraDB = {
380398
result.scores.push(response.$similarity);
381399
});
382400
return result;
383-
},
384-
allNamespaces: async function (client) {
401+
}
402+
403+
async allNamespaces(client) {
385404
try {
386405
let header = new Headers();
387406
header.append("Token", client?.httpClient?.applicationToken);
@@ -403,11 +422,12 @@ const AstraDB = {
403422
const collections = resp ? JSON.parse(resp)?.status?.collections : [];
404423
return collections;
405424
} catch (e) {
406-
console.error("Astra::AllNamespace", e);
425+
this.logger("AllNamespace", e);
407426
return [];
408427
}
409-
},
410-
"namespace-stats": async function (reqBody = {}) {
428+
}
429+
430+
async "namespace-stats"(reqBody = {}) {
411431
const { namespace = null } = reqBody;
412432
if (!namespace) throw new Error("namespace required");
413433
const { client } = await this.connect();
@@ -417,8 +437,9 @@ const AstraDB = {
417437
return stats
418438
? stats
419439
: { message: "No stats were able to be fetched from DB for namespace" };
420-
},
421-
"delete-namespace": async function (reqBody = {}) {
440+
}
441+
442+
async "delete-namespace"(reqBody = {}) {
422443
const { namespace = null } = reqBody;
423444
const { client } = await this.connect();
424445
if (!(await this.namespaceExists(client, namespace)))
@@ -431,8 +452,9 @@ const AstraDB = {
431452
details?.vectorCount || "all"
432453
} vectors.`,
433454
};
434-
},
435-
curateSources: function (sources = []) {
455+
}
456+
457+
curateSources(sources = []) {
436458
const documents = [];
437459
for (const source of sources) {
438460
if (Object.keys(source).length > 0) {
@@ -446,7 +468,7 @@ const AstraDB = {
446468
}
447469

448470
return documents;
449-
},
450-
};
471+
}
472+
}
451473

452474
module.exports.AstraDB = AstraDB;

0 commit comments

Comments
 (0)