Skip to content

Commit 686d486

Browse files
Merge branch 'main' into main
2 parents 1323e40 + c23f757 commit 686d486

File tree

4 files changed

+154
-12
lines changed

4 files changed

+154
-12
lines changed

astra-db-java/src/test/java/com/datastax/astra/test/TestConstants.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22

33
public interface TestConstants {
44

5-
String NAMESPACE_NS1 = "ns1";
6-
String DEFAULT_NAMESPACE = "default_keyspace";
7-
String DATABASE_NAME = "astra_db_client";
8-
String COLLECTION_SIMPLE = "collection_simple";
5+
String NAMESPACE_NS1 = "ns1";
6+
String DEFAULT_NAMESPACE = "default_keyspace";
7+
String DATABASE_NAME = "astra_db_client";
8+
String COLLECTION_SIMPLE = "collection_simple";
99
String COLLECTION_OBJECTID = "collection_objectid";
10-
String COLLECTION_UUID = "collection_uuid";
11-
String COLLECTION_UUID_V6 = "collection_uuidv6";
12-
String COLLECTION_UUID_V7 = "collection_uuidv7";
13-
String COLLECTION_VECTOR = "collection_vector";
14-
String COLLECTION_DENY = "collection_deny";
15-
String COLLECTION_ALLOW = "collection_allow";
10+
String COLLECTION_UUID = "collection_uuid";
11+
String COLLECTION_UUID_V6 = "collection_uuidv6";
12+
String COLLECTION_UUID_V7 = "collection_uuidv7";
13+
String COLLECTION_VECTOR = "collection_vector";
14+
String COLLECTION_DENY = "collection_deny";
15+
String COLLECTION_ALLOW = "collection_allow";
1616
}

astra-db-java/src/test/java/com/datastax/astra/test/integration/collection/AstraDevCollectionITTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ class AstraDevCollectionITTest extends AbstractCollectionITTest {
1313
/** {@inheritDoc} */
1414
@Override
1515
protected Database initDatabase() {
16-
return initAstraDatabase(AstraEnvironment.DEV, CloudProviderType.GCP, "europe-west4");
17-
// return initAstraDatabase(AstraEnvironment.TEST, CloudProviderType.GCP, "us-central1");
16+
//return initAstraDatabase(AstraEnvironment.DEV, CloudProviderType.GCP, "europe-west4");
17+
return initAstraDatabase(AstraEnvironment.DEV, CloudProviderType.GCP, "us-central1");
1818
// return initAstraDatabase(AstraEnvironment.TEST, CloudProviderType.AWS, "us-west-2");
1919
}
2020

tools/pom.xml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,31 @@
2222
<version>1.0.1-SNAPSHOT</version>
2323
</dependency>
2424

25+
26+
<dependency>
27+
<groupId>org.apache.commons</groupId>
28+
<artifactId>commons-csv</artifactId>
29+
<version>1.10.0</version>
30+
</dependency>
31+
32+
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
33+
<dependency>
34+
<groupId>org.jsoup</groupId>
35+
<artifactId>jsoup</artifactId>
36+
<version>1.17.2</version>
37+
</dependency>
38+
<dependency>
39+
<groupId>com.theokanning.openai-gpt3-java</groupId>
40+
<artifactId>client</artifactId>
41+
<version>0.18.2</version>
42+
</dependency>
43+
<dependency>
44+
<groupId>com.theokanning.openai-gpt3-java</groupId>
45+
<artifactId>service</artifactId>
46+
<version>0.18.2</version>
47+
</dependency>
48+
49+
2550
<dependency>
2651
<groupId>com.opencsv</groupId>
2752
<artifactId>opencsv</artifactId>
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
package com.datastax.astra.tool;
2+
3+
import com.datastax.astra.client.Collection;
4+
import com.datastax.astra.client.DataAPIClient;
5+
import com.datastax.astra.client.Database;
6+
import com.datastax.astra.client.model.Document;
7+
import com.datastax.astra.client.model.SimilarityMetric;
8+
import com.fasterxml.jackson.annotation.JsonProperty;
9+
import com.theokanning.openai.embedding.EmbeddingRequest;
10+
import com.theokanning.openai.embedding.EmbeddingResult;
11+
import com.theokanning.openai.service.OpenAiService;
12+
import org.apache.commons.csv.CSVFormat;
13+
import org.apache.commons.csv.CSVParser;
14+
import org.apache.commons.csv.CSVRecord;
15+
import org.jsoup.Jsoup;
16+
17+
import java.io.Reader;
18+
import java.nio.file.Files;
19+
import java.nio.file.Paths;
20+
import java.util.ArrayList;
21+
import java.util.List;
22+
23+
public class CsvLoaderCordell {
24+
25+
// CHANGE ME
26+
static String csvFilename = "/Users/cedricklunven/Downloads/knowledge-base.csv";
27+
28+
// CHANGE ME
29+
static OpenAiService service = new OpenAiService( "a valid key");
30+
31+
public static void main(String[] args) {
32+
DataAPIClient client = new DataAPIClient("a valid token");
33+
Database db = client.getDatabase("a valid url");
34+
35+
List<FileContent> ok = readFileContent(csvFilename);
36+
for (FileContent fileContent : ok) {
37+
System.out.println(fileContent.getDocumentId() + " " + fileContent.getTextContent() + " " + fileContent.getVector());
38+
}
39+
40+
Collection<FileContent> collection = db
41+
.createCollection("knowledge_base", 1536, SimilarityMetric.COSINE, FileContent.class);
42+
collection.insertMany(ok);
43+
}
44+
45+
private static float[] embedded(String textContent) {
46+
EmbeddingRequest request = EmbeddingRequest.builder()
47+
.model("text-embedding-ada-002") // Specify the model
48+
.input(List.of("HELLO")) //FIXME
49+
.build();
50+
EmbeddingResult result = service.createEmbeddings(request);
51+
List<Double> doubles = result.getData().get(0).getEmbedding();
52+
float[] floats = new float[doubles.size()];
53+
for (int i = 0; i < doubles.size(); i++) {
54+
floats[i] = doubles.get(i).floatValue();
55+
}
56+
return floats;
57+
}
58+
59+
public static List<FileContent> readFileContent(String filePath) {
60+
List<FileContent> contents = new ArrayList<>();
61+
try (Reader reader = Files.newBufferedReader(Paths.get(filePath));
62+
CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT
63+
.withFirstRecordAsHeader()
64+
.withIgnoreHeaderCase()
65+
.withTrim()
66+
.withIgnoreEmptyLines()
67+
.withIgnoreSurroundingSpaces())) {
68+
69+
for (CSVRecord record : csvParser) {
70+
if (record.isConsistent()) {
71+
String htmlContent = record.get("OVERVIEW__C");
72+
String textContent = Jsoup.parse(htmlContent).text();
73+
float[] vector = embedded(textContent);
74+
String documentId = record.get("ARTICLECREATEDBYID");
75+
contents.add(new FileContent(documentId, textContent, vector));
76+
} else {
77+
System.err.println("Skipping inconsistent record: " + record);
78+
}
79+
}
80+
} catch (Exception e) {
81+
System.err.println("Error processing CSV file: " + e.getMessage());
82+
e.printStackTrace();
83+
}
84+
return contents;
85+
}
86+
87+
public static class FileContent {
88+
89+
@JsonProperty("_id")
90+
private String documentId;
91+
92+
@JsonProperty("content")
93+
private String textContent;
94+
95+
@JsonProperty("$vector")
96+
private float[] vector;
97+
98+
public FileContent(String documentId, String textContent, float[] vector) {
99+
this.documentId = documentId;
100+
this.textContent = textContent;
101+
this.vector = vector;
102+
}
103+
104+
public String getDocumentId() {
105+
return documentId;
106+
}
107+
108+
public String getTextContent() {
109+
return textContent;
110+
}
111+
112+
public float[] getVector() {
113+
return vector;
114+
}
115+
}
116+
}
117+

0 commit comments

Comments
 (0)