Skip to content

Commit 78b4685

Browse files
committed
adding further preliminary work for functions support
1 parent bd63b26 commit 78b4685

File tree

12 files changed

+436
-17
lines changed

12 files changed

+436
-17
lines changed

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package com.microsoft.openai.samples.indexer;
22

33
import java.io.File;
4+
import java.io.IOException;
5+
import java.nio.file.Files;
6+
import java.nio.file.Path;
47
import java.util.List;
58
import java.util.stream.Collectors;
69

@@ -26,26 +29,30 @@ public DocumentProcessor(SearchIndexManager searchIndexManager, PDFParser pdfPar
2629
this.textSplitter = textSplitter;
2730
}
2831

29-
public void indexDocumentfromFile(String filename, String category){
32+
public void indexDocumentfromFile(String filepath, String category) throws IOException {
33+
byte[] bytes = Files.readAllBytes(Path.of(filepath));
34+
indexDocumentFromBytes(filepath, category, bytes);
35+
36+
}
37+
38+
public void indexDocumentFromBytes(String filename, String category, byte[] content){
3039
logger.debug("Indexing file {}", filename);
31-
File file = new File(filename);
40+
List<Page> pages = pdfParser.parse(content);
41+
logger.info("Found {} pages in file {}", pages.size(), filename);
42+
43+
3244

33-
List<Page> pages = pdfParser.parse(file);
34-
logger.info("Found {} pages in file {}", pages.size(), file.getName());
35-
36-
37-
3845
List<SplitPage> splitPages = textSplitter.splitPages(pages);
39-
logger.info("file {} splitted in {} sections", file.getName(), splitPages.size());
46+
logger.info("file {} splitted in {} sections", filename, splitPages.size());
4047

4148
List<Section> sections = splitPages.stream()
4249
.map(splitPage -> {
43-
return new Section(splitPage, file.getName(), category);
50+
return new Section(splitPage, filename, category);
4451
})
4552
.collect(Collectors.toList());
46-
53+
4754
searchIndexManager.updateContent(sections);
48-
logger.info("File {} indexed with {} splitted sections", file.getName(),sections.size());
55+
logger.info("File {} indexed with {} splitted sections", filename,sections.size());
4956

5057
}
5158

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,26 @@ public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCr
4747
this.verbose = verbose;
4848
}
4949

50+
5051
@Override
5152
public List<Page> parse(File file) {
52-
if (verbose) {
53-
logger.info("Extracting text from {} using Azure Document Intelligence",file.getName());
53+
if (verbose) {
54+
logger.info("Extracting text from {} using Azure Document Intelligence", file.getName());
5455
}
5556

56-
List<Page> pages = new ArrayList<>();
57-
5857
Path filePath = file.toPath();
5958
BinaryData fileData = BinaryData.fromFile(filePath, (int) file.length());
59+
return parse(fileData);
60+
}
6061

62+
@Override
63+
public List<Page> parse(byte[] content) {
64+
BinaryData fileData = BinaryData.fromBytes(content);
65+
return parse(fileData);
66+
}
67+
68+
private List<Page> parse(BinaryData fileData) {
69+
List<Page> pages = new ArrayList<>();
6170
SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
6271
client.beginAnalyzeDocument(this.modelId, fileData);
6372

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,30 @@ public List<Page> parse(File file) {
3232
}
3333
return pages;
3434
}
35+
36+
@Override
37+
public List<Page> parse(byte[] content) {
38+
List<Page> pages = new ArrayList<>();
39+
PdfReader reader = null;
40+
41+
try {
42+
reader = new PdfReader(content);
43+
Integer offset = 0;
44+
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
45+
String pageText = PdfTextExtractor.getTextFromPage(reader, i);
46+
Page page = new Page(i, offset, pageText);
47+
offset += pageText.length();
48+
pages.add(page);
49+
}
50+
} catch (IOException e) {
51+
throw new RuntimeException(e);
52+
} finally {
53+
if (reader != null) {
54+
reader.close();
55+
}
56+
}
57+
return pages;
58+
}
59+
3560
}
3661

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/PDFParser.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@
77
public interface PDFParser {
88

99
public List<Page> parse(File file);
10+
public List<Page> parse(byte[] content);
1011
}

app/indexer/functions/pom.xml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
<java.version>17</java.version>
2121
<azure.functions.maven.plugin.version>1.29.0</azure.functions.maven.plugin.version>
2222
<azure.functions.java.library.version>3.0.0</azure.functions.java.library.version>
23-
<functionAppName>indexer-blob-processor</functionAppName>
23+
<functionAppName>indexer-function</functionAppName>
2424
</properties>
2525

2626
<dependencies>
@@ -29,6 +29,11 @@
2929
<artifactId>azure-functions-java-library</artifactId>
3030
<version>${azure.functions.java.library.version}</version>
3131
</dependency>
32+
<dependency>
33+
<groupId>com.microsoft.openai.samples</groupId>
34+
<artifactId>indexer-core</artifactId>
35+
<version>1.0-SNAPSHOT</version>
36+
</dependency>
3237
</dependencies>
3338

3439
<build>

app/indexer/functions/src/main/java/com/microsoft/openai/samples/indexer/functions/BlobProcessorFunction.java

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,94 @@
11
package com.microsoft.openai.samples.indexer.functions;
22

3+
import com.azure.core.credential.TokenCredential;
4+
import com.azure.identity.AzureDeveloperCliCredentialBuilder;
5+
import com.azure.identity.ManagedIdentityCredentialBuilder;
36
import com.microsoft.azure.functions.ExecutionContext;
47
import com.microsoft.azure.functions.HttpMethod;
58
import com.microsoft.azure.functions.HttpRequestMessage;
69
import com.microsoft.azure.functions.HttpResponseMessage;
710
import com.microsoft.azure.functions.HttpStatus;
811
import com.microsoft.azure.functions.annotation.*;
12+
import com.microsoft.openai.samples.indexer.DocumentProcessor;
13+
import com.microsoft.openai.samples.indexer.embeddings.AzureOpenAIEmbeddingService;
14+
import com.microsoft.openai.samples.indexer.index.AzureSearchClientFactory;
15+
import com.microsoft.openai.samples.indexer.index.SearchIndexManager;
16+
import com.microsoft.openai.samples.indexer.parser.DocumentIntelligencePDFParser;
17+
import com.microsoft.openai.samples.indexer.parser.TextSplitter;
18+
import com.microsoft.openai.samples.indexer.storage.BlobManager;
919

20+
import java.nio.file.Path;
1021
import java.util.Optional;
1122

1223
/**
1324
* Azure Functions with Blob Trigger.
1425
*/
1526
public class BlobProcessorFunction {
27+
28+
29+
private String storageaccount;
30+
31+
32+
private String container;
33+
34+
35+
private String searchservice;
36+
37+
38+
private String index;
39+
40+
41+
private String openaiEmbdeployment;
42+
43+
44+
private String openaiServiceName;
45+
46+
47+
private String formrecognizerservice;
48+
49+
private String searchanalyzername = " en.microsoft";
50+
51+
private boolean verbose = true;
52+
53+
private boolean locaDev = false;
54+
55+
56+
DocumentProcessor documentProcessor;
57+
SearchIndexManager searchIndexManager;
58+
59+
public BlobProcessorFunction() {
60+
this.storageaccount = System.getenv("AZURE_STORAGE_ACCOUNT");
61+
this.container = System.getenv("AZURE_STORAGE_CONTAINER");
62+
this.openaiServiceName = System.getenv("AZURE_OPENAI_SERVICE");
63+
this.openaiEmbdeployment = System.getenv("AZURE_OPENAI_EMB_DEPLOYMENT");
64+
this.searchservice = System.getenv("AZURE_SEARCH_SERVICE");
65+
this.index = System.getenv("AZURE_SEARCH_INDEX");
66+
this.formrecognizerservice = System.getenv("AZURE_FORMRECOGNIZER_SERVICE");
67+
String localDev = System.getenv("LOCAL_DEV");
68+
String userAssignedManagedIdentity = System.getenv("USER_ASSIGNED_MANAGED_IDENTITY");
69+
70+
TokenCredential tokenCredential;
71+
72+
if (localDev != null && localDev.equals("true")) {
73+
tokenCredential = new AzureDeveloperCliCredentialBuilder().build();
74+
}
75+
else {
76+
if (userAssignedManagedIdentity != null && !userAssignedManagedIdentity.isEmpty()) {
77+
tokenCredential = new ManagedIdentityCredentialBuilder().clientId(userAssignedManagedIdentity).build();
78+
} else {
79+
tokenCredential = new ManagedIdentityCredentialBuilder().build();
80+
}
81+
}
82+
83+
this.searchIndexManager = new SearchIndexManager(
84+
new AzureSearchClientFactory(searchservice, tokenCredential, index, verbose),
85+
searchanalyzername,
86+
new AzureOpenAIEmbeddingService(openaiServiceName, openaiEmbdeployment, tokenCredential, verbose));
87+
88+
//DocumentProcessor documentProcessor = new DocumentProcessor(searchIndexManager, new ItextPDFParser(), new TextSplitter(verbose));
89+
this.documentProcessor = new DocumentProcessor(searchIndexManager, new DocumentIntelligencePDFParser(formrecognizerservice,tokenCredential,verbose), new TextSplitter(verbose));
90+
// BlobManager blobManager = new BlobManager(storageaccount, container, tokenCredential, verbose);
91+
}
1692
@FunctionName("BlobEventGridProcessor")
1793
/**
1894
* This function will be invoked when a new or updated blob is detected at the specified path. The blob contents are provided as input to this function.
@@ -23,6 +99,12 @@ public void run(
2399
@BindingName("filename") String filename,
24100
final ExecutionContext context
25101
) {
26-
context.getLogger().info("Java Blob trigger function processed a blob. Name: " + filename + "\n Size: " + content.length + " Bytes");
102+
context.getLogger().info("Processing document " + filename + "\n Size: " + content.length + " Bytes");
103+
documentProcessor.indexDocumentFromBytes(filename,"",content);
104+
context.getLogger().info("Document " + filename + " successufully indexed");
105+
27106
}
107+
108+
109+
28110
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<configuration>
2+
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
3+
<encoder>
4+
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
5+
</encoder>
6+
</appender>
7+
8+
<root level="ERROR">
9+
<appender-ref ref="STDOUT" />
10+
</root>
11+
<logger name="com.microsoft.openai.samples.indexer" level="INFO" />
12+
</configuration>

azure.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,22 @@ services:
2020
run: cd ../frontend;npm install;npm run build
2121
interactive: true
2222
continueOnError: false
23+
indexer:
24+
project: ./app/indexer/functions
25+
language: java
26+
host: function
27+
hooks:
28+
prepackage:
29+
windows:
30+
shell: pwsh
31+
run: cd ..;mvn install -pl :indexer-core
32+
interactive: true
33+
continueOnError: false
34+
posix:
35+
shell: sh
36+
run: cd ..;mvn install -pl :indexer-core
37+
interactive: true
38+
continueOnError: false
2339
hooks:
2440
postprovision:
2541
windows:

infra/core/event/eventgrid.json

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3+
"contentVersion": "1.0.0.0",
4+
"parameters": {
5+
"systemTopics_egt_documents_s24ysc6s4pxb6_name": {
6+
"defaultValue": "egt-documents-s24ysc6s4pxb6",
7+
"type": "String"
8+
},
9+
"storageAccounts_sts24ysc6s4pxb6_externalid": {
10+
"defaultValue": "/subscriptions/8b82fc4d-aabe-4658-88f2-5674bf49eec0/resourceGroups/rg-test-local-full-java/providers/Microsoft.Storage/storageAccounts/sts24ysc6s4pxb6",
11+
"type": "String"
12+
}
13+
},
14+
"variables": {},
15+
"resources": [
16+
{
17+
"type": "Microsoft.EventGrid/systemTopics",
18+
"apiVersion": "2023-12-15-preview",
19+
"name": "[parameters('systemTopics_egt_documents_s24ysc6s4pxb6_name')]",
20+
"location": "eastus2",
21+
"properties": {
22+
"source": "[parameters('storageAccounts_sts24ysc6s4pxb6_externalid')]",
23+
"topicType": "Microsoft.Storage.StorageAccounts"
24+
}
25+
},
26+
{
27+
"type": "Microsoft.EventGrid/systemTopics/eventSubscriptions",
28+
"apiVersion": "2023-12-15-preview",
29+
"name": "[concat(parameters('systemTopics_egt_documents_s24ysc6s4pxb6_name'), '/documents-upload-listener')]",
30+
"dependsOn": [
31+
"[resourceId('Microsoft.EventGrid/systemTopics', parameters('systemTopics_egt_documents_s24ysc6s4pxb6_name'))]"
32+
],
33+
"properties": {
34+
"destination": {
35+
"properties": {
36+
"maxEventsPerBatch": 1,
37+
"preferredBatchSizeInKilobytes": 64
38+
},
39+
"endpointType": "WebHook"
40+
},
41+
"filter": {
42+
"subjectBeginsWith": "/blobServices/default/containers/content",
43+
"includedEventTypes": [
44+
"Microsoft.Storage.BlobCreated"
45+
],
46+
"enableAdvancedFilteringOnArrays": true
47+
},
48+
"labels": [],
49+
"eventDeliverySchema": "EventGridSchema",
50+
"retryPolicy": {
51+
"maxDeliveryAttempts": 30,
52+
"eventTimeToLiveInMinutes": 1440
53+
}
54+
}
55+
}
56+
]
57+
}

0 commit comments

Comments
 (0)