Skip to content

Commit fe1613e

Browse files
committed
bicep infra and java indexer process updated
1 parent 5c12f35 commit fe1613e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3539
-958
lines changed

.devcontainer/devcontainer.json

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"name": "Java 17 and maven 3.8.8 DevContainer to build Java RAG example with Azure AI",
3-
"image": "mcr.microsoft.com/devcontainers/python:0-3.10",
3+
"image": "mcr.microsoft.com/devcontainers/java:1-17-bullseye",
44
"features": {
55
"azure-cli": "latest",
66
"ghcr.io/azure/azure-dev/azd:latest": {},
77
"ghcr.io/devcontainers/features/java:1": {
8-
"version": "17",
8+
"version": "none",
99
"installMaven": true,
1010
"mavenVersion": "3.8.8"
1111
},
@@ -25,7 +25,6 @@
2525
"ms-azuretools.azure-dev",
2626
"ms-azuretools.vscode-bicep",
2727
"vscjava.vscode-java-pack",
28-
"ms-python.python",
2928
"amodio.tsl-problem-matcher"
3029
]
3130
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3+
<parent>
4+
<artifactId>indexer-parent</artifactId>
5+
<groupId>com.microsoft.openai.samples</groupId>
6+
<version>1.0-SNAPSHOT</version>
7+
</parent>
8+
<modelVersion>4.0.0</modelVersion>
9+
<artifactId>indexer-cli</artifactId>
10+
<build>
11+
<plugins>
12+
<plugin>
13+
<artifactId>maven-shade-plugin</artifactId>
14+
<version>3.5.1</version>
15+
<executions>
16+
<execution>
17+
<phase>package</phase>
18+
<goals>
19+
<goal>shade</goal>
20+
</goals>
21+
<configuration>
22+
<transformers>
  <!--
    Writes Main-Class into the shaded jar's manifest so the jar is runnable.
    The implementation attribute is required: <transformer> alone does not tell
    the shade plugin which ResourceTransformer class to instantiate.
    NOTE(review): this POM looks like a shade-generated dependency-reduced POM;
    if so, it is regenerated on every build and should not be hand-maintained.
  -->
  <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
    <mainClass>com.microsoft.openai.samples.indexer.CLI</mainClass>
  </transformer>
</transformers>
27+
<filters>
28+
<filter>
29+
<artifact>*:*</artifact>
30+
<excludes>
31+
<exclude>META-INF/*.SF</exclude>
32+
<exclude>META-INF/*.DSA</exclude>
33+
<exclude>META-INF/*.RSA</exclude>
34+
</excludes>
35+
</filter>
36+
</filters>
37+
<finalName>cli</finalName>
38+
</configuration>
39+
</execution>
40+
</executions>
41+
</plugin>
42+
</plugins>
43+
</build>
44+
<dependencies>
45+
<dependency>
46+
<groupId>org.junit.jupiter</groupId>
47+
<artifactId>junit-jupiter-api</artifactId>
48+
<version>5.8.1</version>
49+
<scope>test</scope>
50+
<exclusions>
51+
<exclusion>
52+
<artifactId>opentest4j</artifactId>
53+
<groupId>org.opentest4j</groupId>
54+
</exclusion>
55+
<exclusion>
56+
<artifactId>junit-platform-commons</artifactId>
57+
<groupId>org.junit.platform</groupId>
58+
</exclusion>
59+
<exclusion>
60+
<artifactId>apiguardian-api</artifactId>
61+
<groupId>org.apiguardian</groupId>
62+
</exclusion>
63+
</exclusions>
64+
</dependency>
65+
<dependency>
66+
<groupId>org.mockito</groupId>
67+
<artifactId>mockito-core</artifactId>
68+
<version>5.8.0</version>
69+
<scope>test</scope>
70+
<exclusions>
71+
<exclusion>
72+
<artifactId>byte-buddy</artifactId>
73+
<groupId>net.bytebuddy</groupId>
74+
</exclusion>
75+
<exclusion>
76+
<artifactId>byte-buddy-agent</artifactId>
77+
<groupId>net.bytebuddy</groupId>
78+
</exclusion>
79+
<exclusion>
80+
<artifactId>objenesis</artifactId>
81+
<groupId>org.objenesis</groupId>
82+
</exclusion>
83+
</exclusions>
84+
</dependency>
85+
</dependencies>
86+
</project>

app/indexer/cli/pom.xml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
<!--
  indexer-cli: command-line front end for the indexer.
  Packages a single runnable jar (target/cli.jar) via the shade plugin.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>com.microsoft.openai.samples</groupId>
        <artifactId>indexer-parent</artifactId>
        <version>1.0-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>indexer-cli</artifactId>

    <dependencies>
        <!-- Command-line parsing; ${picocli.version} is not defined in this file
             (presumably in the parent POM). -->
        <dependency>
            <groupId>info.picocli</groupId>
            <artifactId>picocli</artifactId>
            <version>${picocli.version}</version>
        </dependency>
        <!-- Sibling module: track this module's own (inherited) version instead of
             repeating the literal, so a version bump in the parent cannot leave the
             two modules out of sync. -->
        <dependency>
            <groupId>com.microsoft.openai.samples</groupId>
            <artifactId>indexer-core</artifactId>
            <version>${project.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Builds the self-contained, runnable jar during the package phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.5.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <!-- Sets Main-Class in the manifest so "java -jar cli.jar" works. -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.microsoft.openai.samples.indexer.CLI</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <!-- Drop signature files from every merged artifact: they no
                                     longer match the repackaged jar contents. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <finalName>cli</finalName>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
package com.microsoft.openai.samples.indexer;
2+
3+
import java.nio.file.Files;
4+
import java.nio.file.Path;
5+
import java.util.concurrent.Callable;
6+
7+
import com.microsoft.openai.samples.indexer.embeddings.AzureOpenAIEmbeddingService;
8+
import com.microsoft.openai.samples.indexer.embeddings.TextEmbeddingsService;
9+
import com.microsoft.openai.samples.indexer.index.AzureSearchClientFactory;
10+
import com.microsoft.openai.samples.indexer.index.SearchIndexManager;
11+
import com.microsoft.openai.samples.indexer.parser.DocumentIntelligencePDFParser;
12+
import com.microsoft.openai.samples.indexer.parser.ItextPDFParser;
13+
import com.microsoft.openai.samples.indexer.parser.TextSplitter;
14+
import com.microsoft.openai.samples.indexer.storage.BlobManager;
15+
import org.slf4j.Logger;
16+
import org.slf4j.LoggerFactory;
17+
18+
import com.azure.core.credential.TokenCredential;
19+
import com.azure.identity.AzureCliCredentialBuilder;
20+
21+
import picocli.CommandLine;
22+
import picocli.CommandLine.Command;
23+
import picocli.CommandLine.Option;
24+
import picocli.CommandLine.Parameters;
25+
26+
27+
public class CLI implements Callable<Integer> {
28+
29+
private static final Logger logger = LoggerFactory.getLogger(CLI.class);
30+
31+
@Option(names = {"--storageaccount"}, required = true)
32+
private String storageaccount;
33+
34+
@Option(names = {"--container"}, required = true)
35+
private String container;
36+
37+
@Option(names = {"--searchservice"}, required = true)
38+
private String searchservice;
39+
40+
@Option(names = {"--searchanalyzername"}, required = false, defaultValue = " en.microsoft")
41+
private String searchanalyzername;
42+
43+
@Option(names = {"--index"}, required = true)
44+
private String index;
45+
46+
@Option(names = {"--openai-emb-deployment"}, required = true)
47+
private String openaiEmbdeployment;
48+
49+
@Option(names = {"--openai-service-name"}, required = true)
50+
private String openaiServiceName;
51+
52+
@Option(names = {"--formrecognizerservice"}, required = true)
53+
private String formrecognizerservice;
54+
55+
@Option(names = {"-v","--verbose"}, required = true)
56+
private boolean verbose;
57+
58+
@Option(names = {"-c","--category"}, required = false, defaultValue = "default")
59+
private String category;
60+
61+
@Parameters(index = "0")
62+
private Path dataFolderPath;
63+
64+
public static void main(String[] args) {
65+
int exitCode = new CommandLine(new CLI()).execute(args);
66+
System.exit(exitCode);
67+
}
68+
69+
@Override
70+
public Integer call() throws Exception {
71+
System.out.println(" use add command");
72+
return 0;
73+
}
74+
75+
@Command(name = "add")
76+
public void addCommand() {
77+
TokenCredential tokenCredential = new AzureCliCredentialBuilder().build();
78+
TextEmbeddingsService textEmbeddingsService = new AzureOpenAIEmbeddingService(openaiServiceName, openaiEmbdeployment, tokenCredential, verbose);
79+
AzureSearchClientFactory azureSearchClientFactory = new AzureSearchClientFactory(searchservice, tokenCredential, index, verbose);
80+
SearchIndexManager searchIndexManager = new SearchIndexManager(azureSearchClientFactory,searchanalyzername,textEmbeddingsService);
81+
82+
searchIndexManager.createIndex();
83+
84+
//DocumentProcessor documentProcessor = new DocumentProcessor(searchIndexManager, new ItextPDFParser(), new TextSplitter(verbose));
85+
DocumentProcessor documentProcessor = new DocumentProcessor(searchIndexManager, new DocumentIntelligencePDFParser(formrecognizerservice,tokenCredential,verbose), new TextSplitter(verbose));
86+
BlobManager blobManager = new BlobManager(storageaccount, container, tokenCredential, verbose);
87+
88+
if(Files.isDirectory(dataFolderPath))
89+
processDirectory(documentProcessor, blobManager, dataFolderPath);
90+
else
91+
processFile(documentProcessor, blobManager, dataFolderPath);
92+
93+
}
94+
95+
private void processDirectory(DocumentProcessor documentProcessor, BlobManager blobManager, Path directory) {
96+
logger.debug("Processing directory {}", directory);
97+
try {
98+
Files.newDirectoryStream(directory).forEach(path -> {
99+
processFile(documentProcessor, blobManager, path);
100+
});
101+
logger.debug("All files in directory {} processed", directory.toRealPath().toString());
102+
} catch (Exception e) {
103+
throw new RuntimeException("Error processing folder ",e);
104+
}
105+
}
106+
107+
private void processFile(DocumentProcessor documentProcessor, BlobManager blobManager, Path path) {
108+
try {
109+
String absoluteFilePath = path.toRealPath().toString();
110+
documentProcessor.indexDocumentfromFile(absoluteFilePath,category);
111+
logger.debug("file {} indexed", absoluteFilePath);
112+
blobManager.uploadBlob(path.toFile());
113+
logger.debug("file {} uploaded", absoluteFilePath);
114+
} catch (Exception e) {
115+
throw new RuntimeException("Error processing file ",e);
116+
}
117+
}
118+
119+
120+
}
121+
122+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<configuration>
    <!-- Single console appender. Line layout: time, thread, level (padded to 5),
         logger name (shortened to at most 36 characters), message. -->
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- Baseline for all loggers: errors only, written to the console. -->
    <root level="ERROR">
        <appender-ref ref="STDOUT"/>
    </root>

    <!-- The indexer's own packages log from INFO upwards; DEBUG statements in
         that package stay silent at this level. -->
    <logger name="com.microsoft.openai.samples.indexer" level="INFO"/>
</configuration>

app/indexer/core/pom.xml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
<!--
  indexer-core: library module of the indexer.
  Dependencies declared without a <version> rely on dependency management that is
  not part of this file (presumably the indexer-parent POM); the ${...} version
  properties are likewise defined elsewhere.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>com.microsoft.openai.samples</groupId>
        <artifactId>indexer-parent</artifactId>
        <version>1.0-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>indexer-core</artifactId>

    <dependencies>
        <!-- Azure SDK: core and identity -->
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-core</artifactId>
        </dependency>
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-identity</artifactId>
        </dependency>

        <!-- Azure SDK: Blob Storage -->
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-storage-blob</artifactId>
        </dependency>

        <!-- Azure SDK: search, OpenAI and Form Recognizer clients -->
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-search-documents</artifactId>
        </dependency>
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-ai-openai</artifactId>
            <version>${azure-openai.version}</version>
        </dependency>
        <dependency>
            <groupId>com.azure</groupId>
            <artifactId>azure-ai-formrecognizer</artifactId>
        </dependency>

        <!-- Non-Azure libraries: jtokkit, iText PDF, Apache Commons Text -->
        <dependency>
            <groupId>com.knuddels</groupId>
            <artifactId>jtokkit</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>${itextpdf.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-text</artifactId>
            <version>${apache.common.text}</version>
        </dependency>
    </dependencies>

</project>

0 commit comments

Comments
 (0)