Commit 13eda4f

add code documentation for indexer

1 parent b23bff9 commit 13eda4f

6 files changed: +99 −14 lines changed
app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java

Lines changed: 10 additions & 0 deletions

@@ -14,6 +14,15 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+/**
+ * The DocumentProcessor class is responsible for processing and indexing documents.
+ * It takes a document as input, either as a file or as a byte array, and processes it for indexing.
+ * The processing involves:
+ * 1. parsing the document into pages,
+ * 2. splitting the pages into sections,
+ * 3. indexing these sections in Azure AI Search, also adding embeddings so that semantic similarity search can be used.
+ * The class uses a SearchIndexManager to manage the indexing, a PDFParser to parse the document into pages, and a TextSplitter to split the pages into sections.
+ */
 public class DocumentProcessor {
 
     private static final Logger logger = LoggerFactory.getLogger(DocumentProcessor.class);
@@ -38,6 +47,7 @@ public void indexDocumentfromFile(String filepath, String category) throws IOExc
 
     public void indexDocumentFromBytes(String filename, String category, byte[] content){
         logger.debug("Indexing file {}", filename);
+        //TODO: add support for other file types (docx, pptx, txt, md, html, etc.)
         List<Page> pages = pdfParser.parse(content);
         logger.info("Found {} pages in file {}", pages.size(), filename);
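The Javadoc above describes a three-step pipeline. As a rough illustration of how those steps could fit together (a hedged sketch, not the committed method body; createSections is a hypothetical helper name):

    // Hedged sketch of the documented pipeline; createSections is a hypothetical helper.
    public void indexDocumentFromBytes(String filename, String category, byte[] content) {
        List<Page> pages = pdfParser.parse(content);                 // 1. parse the document into pages
        List<SplitPage> splitPages = textSplitter.splitPages(pages); // 2. split the pages into sections
        List<Section> sections = createSections(filename, category, splitPages);
        searchIndexManager.updateContent(sections);                  // 3. index the sections with embeddings
    }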

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java

Lines changed: 7 additions & 0 deletions

@@ -20,6 +20,13 @@
 import com.knuddels.jtokkit.api.ModelType;
 import reactor.util.retry.Retry;
 
+
+/**
+ * This class provides a base implementation for creating text embeddings, which are then stored in vector databases during the batch indexing process.
+ * For performance reasons it splits the input text into batches and creates the embeddings batch by batch.
+ * It also includes fields for configuring the batch size, the token limit, and other options.
+ * The class uses the OpenAI client to create the embeddings and handles retries in case of HTTP response exceptions.
+ */
 public abstract class AbstractTextEmbeddingsService implements TextEmbeddingsService {
     protected String openAiDeploymentName;
     protected boolean disableBatch;
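The batching behaviour described in the new Javadoc can be pictured with a small sketch. This is illustrative only, assuming hypothetical splitIntoBatches and countTokens helpers; the class's actual batching API may differ:

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative sketch: close a batch when either the item count or the token budget would be exceeded.
    static List<List<String>> splitIntoBatches(List<String> texts, int maxBatchSize, int tokenLimit) {
        List<List<String>> batches = new ArrayList<>();
        List<String> current = new ArrayList<>();
        int tokensInBatch = 0;
        for (String text : texts) {
            int tokens = countTokens(text); // assumed helper, e.g. backed by a jtokkit encoding for the ModelType
            if (!current.isEmpty() && (current.size() >= maxBatchSize || tokensInBatch + tokens > tokenLimit)) {
                batches.add(current);
                current = new ArrayList<>();
                tokensInBatch = 0;
            }
            current.add(text);
            tokensInBatch += tokens;
        }
        if (!current.isEmpty()) {
            batches.add(current);
        }
        return batches;
    }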

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java

Lines changed: 26 additions & 3 deletions

@@ -31,6 +31,11 @@
 
 
 
+/**
+ * The SearchIndexManager class is responsible for managing the Azure Search index.
+ * It provides functionality to create the index, update the content of the index,
+ * and manage the embeddings of the sections.
+ */
 public class SearchIndexManager {
     private AzureSearchClientFactory azureSearchClientFactory;
     private String searchAnalyzerName;
@@ -44,6 +49,12 @@ public SearchIndexManager(AzureSearchClientFactory azureSearchClientFactory, Str
         this.embeddingsService = embeddingsService;
     }
 
+    /**
+     * Creates a new index with specific fields and configurations. It also sets up the semantic search and vector search
+     * configurations for the index.
+     * In general this is not used at runtime, but only during environment setup.
+     * However, it is idempotent: it checks whether the index already exists and creates it only if it does not.
+     */
     public void createIndex() {
         if (azureSearchClientFactory.isVerbose()) {
             logger.debug("Ensuring search index {} exists", azureSearchClientFactory.getIndexName());
@@ -128,6 +139,12 @@ public void createIndex() {
         logger.info("Created index {}", azureSearchClientFactory.getIndexName());
     }
 
+    /**
+     * Updates the content of the index. It divides the sections into batches; for each batch it creates a list of documents,
+     * where each document is a map containing the section details.
+     * It also creates embeddings for each section and adds them to the corresponding document. Finally, it uploads the documents to the search client.
+     * @param sections the sections to be indexed
+     */
     public void updateContent(List<Section> sections) {
         int MAX_BATCH_SIZE = 1000;
         List<List<Section>> sectionBatches = new ArrayList<>();
@@ -161,7 +178,7 @@ public void updateContent(List<Section> sections) {
             documents.get(i).put("embedding", embeddings.get(i));
         }
 
-
+        // Finally, upload the documents, including the embeddings vector, to the index
         searchClient.uploadDocuments(documents);
     }
 
@@ -200,8 +217,14 @@ public void removeContent(String path) {
     }
 
 */
-
-
+
+
+    /**
+     * Derives the source page name for a given file page.
+     * @param filename the name of the source file
+     * @param page the zero-based page number
+     * @return the source page: if the file is a PDF, the page number is appended to the filename; otherwise, the filename is returned unchanged.
+     */
     private String getSourcePageFromFilePage(String filename, int page) {
         if (filename.toLowerCase().endsWith(".pdf")) {
             return filename + "#page=" + (page + 1);
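A hedged usage sketch of the documented flow; the analyzer name and the surrounding wiring are placeholders, not values from this commit:

    // Placeholder wiring; the constructor arguments follow the fields shown above but are assumptions.
    SearchIndexManager indexManager =
            new SearchIndexManager(azureSearchClientFactory, "en.microsoft", embeddingsService);
    indexManager.createIndex();           // idempotent: creates the index only if it does not already exist
    indexManager.updateContent(sections); // batches the sections, adds embeddings, uploads the documents

For the page-name helper, a call such as getSourcePageFromFilePage("report.pdf", 0) would return "report.pdf#page=1", while a non-PDF filename is returned unchanged.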

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java

Lines changed: 39 additions & 7 deletions

@@ -32,13 +32,23 @@
 import org.slf4j.LoggerFactory;
 
 
+/**
+ * This is an implementation of a PDF parser using Azure's Document Intelligence service.
+ * It is designed to extract text and table data from PDF files and convert them into a structured format.
+ *
+ * It initializes an instance of DocumentAnalysisClient from Azure's Document Intelligence service in the constructor.
+ * It provides two parse methods, one accepting a File object and another accepting a byte array. Both methods convert the input into BinaryData and pass it to a private parse method.
+ * The private parse method sends the BinaryData to Azure's Document Intelligence service for analysis. It then processes the analysis result, extracting text and table data from each page of the PDF. Tables are converted into HTML format.
+ * The tableToHtml method is used to convert a DocumentTable object into an HTML table. It handles row and column spans and escapes any HTML characters in the cell content.
+ */
 public class DocumentIntelligencePDFParser implements PDFParser {
     private static final Logger logger = LoggerFactory.getLogger(DocumentIntelligencePDFParser.class);
 
     private final DocumentAnalysisClient client;
     private boolean verbose = false;
     private String modelId = "prebuilt-layout";
 
+
     public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCredential, Boolean verbose) {
         this.client = new DocumentAnalysisClientBuilder()
                 .endpoint("https://%s.cognitiveservices.azure.com/".formatted(serviceName))
@@ -66,64 +76,86 @@ public List<Page> parse(byte[] content) {
     }
 
     private List<Page> parse(BinaryData fileData) {
+        // Create a list to store the pages of the PDF
         List<Page> pages = new ArrayList<>();
+
+        // Begin the document analysis process using Azure's Document Intelligence service
         SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
-            client.beginAnalyzeDocument(this.modelId, fileData);
+                client.beginAnalyzeDocument(this.modelId, fileData);
 
+        // Get the final result of the document analysis
         AnalyzeResult analyzeLayoutResult = analyzeLayoutResultPoller.getFinalResult();
 
         int offset = 0;
+        // Loop through each page in the analyzed document
        for (int page_num = 0; page_num < analyzeLayoutResult.getPages().size(); page_num++) {
            DocumentPage page = analyzeLayoutResult.getPages().get(page_num);
+
+            // Create a list to store the tables on the current page
            List<DocumentTable> tables_on_page = new ArrayList<>();
 
-            if(analyzeLayoutResult.getTables() != null){
+            // If there are tables in the analyzed document, add the tables on the current page to the list
+            if (analyzeLayoutResult.getTables() != null) {
                for (DocumentTable table : analyzeLayoutResult.getTables()) {
                    BoundingRegion boundingRegion = table.getBoundingRegions().get(0);
                    if (boundingRegion.getPageNumber() == page_num + 1) {
                        tables_on_page.add(table);
                    }
                }
            }
-
+
            DocumentSpan pageSpan = page.getSpans().get(0);
            int pageOffset = pageSpan.getOffset();
            int pageLength = pageSpan.getLength();
+
+            // Create an array to store the characters in the tables on the current page
            int[] tableChars = new int[pageLength];
            Arrays.fill(tableChars, -1);
 
+            // Loop through each table on the current page
            for (int tableId = 0; tableId < tables_on_page.size(); tableId++) {
                DocumentTable table = tables_on_page.get(tableId);
-
+
+                // Loop through each span in the current table and mark the characters in the table
                for (DocumentSpan span : table.getSpans()) {
                    for (int i = 0; i < span.getLength(); i++) {
                        int idx = span.getOffset() - pageOffset + i;
+                        // If the character is in the current table, store the table ID in the array
                        if (idx >= 0 && idx < pageLength) {
                            tableChars[idx] = tableId;
                        }
                    }
                }
            }
 
+            // Create a StringBuilder to store the text of the current page
            StringBuilder pageText = new StringBuilder();
+
+            // Create a set to store the IDs of the tables that have been added to the page text
            Set<Integer> addedTables = new HashSet<>();
+
+            // Loop through each character in the array
            for (int idx = 0; idx < tableChars.length; idx++) {
                int tableId = tableChars[idx];
                if (tableId == -1) {
+                    // If the character is not in a table, add it to the page text
                    pageText.append(analyzeLayoutResult.getContent().substring(pageOffset + idx, pageOffset + idx + 1));
                } else if (!addedTables.contains(tableId)) {
+                    // If the character is in a table and the table has not been added to the page text, add the table to the page text
                    DocumentTable table = tables_on_page.get(tableId);
                    pageText.append(tableToHtml(table));
                    addedTables.add(tableId);
                }
            }
 
-            pages.add( new Page(page_num, offset, pageText.toString()));
+            // Add the current page to the list of pages
+            pages.add(new Page(page_num, offset, pageText.toString()));
+
            offset += pageText.length();
 
-        }
+        }
        return pages;
-    }
+    }
 
 
     private String tableToHtml(DocumentTable table) {
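A hedged usage sketch; the service name and file path are placeholders, and the credential comes from azure-identity's DefaultAzureCredentialBuilder:

    import com.azure.identity.DefaultAzureCredentialBuilder;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.List;

    // Placeholder values: the service name and the PDF path are illustrative only.
    PDFParser parser = new DocumentIntelligencePDFParser(
            "my-doc-intelligence",
            new DefaultAzureCredentialBuilder().build(),
            false);
    List<Page> pages = parser.parse(Files.readAllBytes(Path.of("report.pdf")));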

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java

Lines changed: 5 additions & 0 deletions

@@ -8,6 +8,11 @@
 import java.util.List;
 import java.util.ArrayList;
 
+/**
+ * This is an implementation of a PDF parser using the open source iText library.
+ * It can only handle text within the PDF.
+ * It can't extract data from tables or images. See {@link DocumentIntelligencePDFParser} for that.
+ */
 public class ItextPDFParser implements PDFParser {
     @Override
     public List<Page> parse(File file) {
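A hedged sketch of how a caller might choose between the two PDFParser implementations; the configuration variable is an assumption, not something introduced by this commit:

    // Hypothetical wiring: prefer the Azure-based parser when a Document Intelligence
    // service name is configured, otherwise fall back to the local iText implementation.
    PDFParser parser = (documentIntelligenceServiceName != null)
            ? new DocumentIntelligencePDFParser(documentIntelligenceServiceName, credential, false)
            : new ItextPDFParser();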

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java

Lines changed: 12 additions & 4 deletions

@@ -6,6 +6,11 @@
 import com.microsoft.openai.samples.indexer.SplitPage;
 
 
+/**
+ * This class is responsible for splitting the text content of a list of pages into smaller sections.
+ * It does this by identifying sentence endings and word breaks, and then using these to determine where to split the text.
+ * The class also has a maximum section length, a sentence search limit, and a section overlap, which are used to fine-tune the splitting process.
+ */
 public class TextSplitter {
     private List<String> sentenceEndings;
     private List<String> wordBreaks;
@@ -15,6 +20,10 @@ public class TextSplitter {
     private boolean verbose;
 
     public TextSplitter(boolean verbose) {
+        this(verbose, 1000, 100, 100);
+    }
+
+    public TextSplitter(boolean verbose, int maxSectionLength, int sentenceSearchLimit, int sectionOverlap) {
         this.sentenceEndings = new ArrayList<>();
         this.sentenceEndings.add(".");
         this.sentenceEndings.add("。");
@@ -41,12 +50,11 @@ public TextSplitter(boolean verbose) {
         this.wordBreaks.add("\t");
         this.wordBreaks.add("\n");
 
-        this.maxSectionLength = 1000;
-        this.sentenceSearchLimit = 100;
-        this.sectionOverlap = 100;
+        this.maxSectionLength = maxSectionLength;
+        this.sentenceSearchLimit = sentenceSearchLimit;
+        this.sectionOverlap = sectionOverlap;
         this.verbose = verbose;
     }
-
     public List<SplitPage> splitPages(List<Page> pages) {
         List<SplitPage> splitPages = new ArrayList<>();
         StringBuilder allText = new StringBuilder();
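A short usage sketch of the new overloaded constructor; the custom values are examples only, not recommendations from this commit:

    // The one-argument constructor keeps the previous defaults (1000 / 100 / 100).
    TextSplitter defaultSplitter = new TextSplitter(false);

    // The new constructor makes section length, sentence search limit, and overlap tunable.
    TextSplitter customSplitter = new TextSplitter(false, 1500, 150, 200);
    List<SplitPage> sections = customSplitter.splitPages(pages);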
