add code documentation for indexer

dantelmomsft · dantelmomsft · commit 13eda4fc3839 · 2024-08-05T12:01:09.000+02:00
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/DocumentProcessor.java
@@ -14,6 +14,15 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+/**
+ * The DocumentProcessor class is responsible for processing and indexing documents.
+ * It takes a document as input, either as a file or as a byte array, and processes it for indexing.
+ * The processing involves
+ * 1. parsing the document into pages
+ * 2. splitting the pages into sections
+ * 3. indexing these sections in Azure AI Search, together with their embeddings, so that semantic similarity search can be used.
+ * The class uses a SearchIndexManager to manage the indexing, a PDFParser to parse the document into pages, and a TextSplitter to split the pages into sections.
+ */
 public class DocumentProcessor {
     
     private static final Logger logger = LoggerFactory.getLogger(DocumentProcessor.class);
@@ -38,6 +47,7 @@ public void indexDocumentfromFile(String filepath, String category) throws IOExc
 
     public void indexDocumentFromBytes(String filename, String category, byte[] content){
         logger.debug("Indexing file {}", filename);
+        //TODO add support for other file types (docx, pptx, txt, md, html, etc)
         List<Page> pages = pdfParser.parse(content);
         logger.info("Found {} pages in file {}", pages.size(), filename);
 
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/embeddings/AbstractTextEmbeddingsService.java
@@ -20,6 +20,13 @@
 import com.knuddels.jtokkit.api.ModelType;
 import reactor.util.retry.Retry;
 
+
+/**
+ * This class provides a base implementation for creating text embeddings, which are then stored in vector databases during the batch indexing process.
+ * It splits the text into batches and creates the embeddings batch by batch for performance reasons.
+ * It also includes fields for configuring batch size, token limit, and other configurations.
+ * The class uses OpenAI client to create the embeddings and handles retries in case of HTTP response exceptions.
+ */
 public abstract class AbstractTextEmbeddingsService implements TextEmbeddingsService{
     protected String openAiDeploymentName;
     protected boolean disableBatch;
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/index/SearchIndexManager.java
@@ -31,6 +31,11 @@
 
 
 
+/**
+ * The SearchIndexManager class is responsible for managing the Azure Search Index.
+ * It provides functionalities to create an index, update the content of the index,
+ * and manage the embeddings of the sections.
+ */
 public class SearchIndexManager {
     private AzureSearchClientFactory azureSearchClientFactory;
     private String searchAnalyzerName;
@@ -44,6 +49,12 @@ public SearchIndexManager(AzureSearchClientFactory azureSearchClientFactory, Str
         this.embeddingsService = embeddingsService;
     }
 
+    /**
+     *  It creates a new index with specific fields and configurations. It also sets up semantic search and vector search
+     *  configurations for the index.
+     *  This is in general not used during runtime, but only during env setup.
+     *  However, it's idempotent as it checks if the index already exists. If not it creates it.
+     */
     public void createIndex() {
         if (azureSearchClientFactory.isVerbose()) {
                   logger.debug("Ensuring search index {} exists", azureSearchClientFactory.getIndexName());
@@ -128,6 +139,12 @@ public void createIndex() {
         logger.info("Created index {}", azureSearchClientFactory.getIndexName());
     }
 
+    /**
+     *  It updates the content of the index. It divides the sections into batches and for each batch, it creates a list of documents. Each document
+     *  is a map containing the section details.
+     *  It also creates embeddings for each section and adds them to the corresponding document. Finally, it uploads the documents to the search client.
+     * @param sections the sections to add to the index
+     */
     public void updateContent(List<Section> sections) {
         int MAX_BATCH_SIZE = 1000;
         List<List<Section>> sectionBatches = new ArrayList<>();
@@ -161,7 +178,7 @@ public void updateContent(List<Section> sections) {
                 documents.get(i).put("embedding", embeddings.get(i));
             }
         
-
+            // Finally, upload the documents, including their embedding vectors, to the index
             searchClient.uploadDocuments(documents);
         }
         
@@ -200,8 +217,14 @@ public void removeContent(String path) {
     }
 
     */
- 
- 
+
+
+    /**
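+     * Builds the source page reference for a given page of a file.
+     * @param filename the name of the file
+     * @param page the page index; 1 is added to it when building the "#page=" suffix
+     * @return the source page from the file page. If the file is a PDF, it appends the page number to the filename. Otherwise, it just returns the filename.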
+     */
         private  String getSourcePageFromFilePage(String filename, int page) {
             if (filename.toLowerCase().endsWith(".pdf")) {
                 return filename + "#page=" + (page + 1);
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/DocumentIntelligencePDFParser.java
@@ -32,13 +32,23 @@
 import org.slf4j.LoggerFactory;
 
 
+/**
+ * This is an implementation of a PDF parser using Azure's Document Intelligence service.
+ * It is designed to extract text and table data from PDF files and convert them into a structured format.
+ *
+ * It initializes an instance of DocumentAnalysisClient from Azure's Document Intelligence service in the constructor.
+ * It provides two parse methods, one accepting a File object and another accepting a byte array. Both methods convert the input into BinaryData and pass it to a private parse method.
+ * The private parse method sends the BinaryData to Azure's Document Intelligence service for analysis. It then processes the analysis result, extracting text and table data from each page of the PDF. Tables are converted into HTML format.
+ * The tableToHtml method is used to convert a DocumentTable object into an HTML table. It handles row and column spans and escapes any HTML characters in the cell content.
+ */
 public class DocumentIntelligencePDFParser implements PDFParser {
        private static final Logger logger = LoggerFactory.getLogger(DocumentIntelligencePDFParser.class); 
 
     private final DocumentAnalysisClient  client;
     private boolean verbose = false;
     private String modelId = "prebuilt-layout";
 
+
     public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCredential, Boolean verbose) {
         this.client = new DocumentAnalysisClientBuilder()
                 .endpoint("https://%s.cognitiveservices.azure.com/".formatted(serviceName))
@@ -66,64 +76,86 @@ public List<Page> parse(byte[] content) {
     }
 
     private List<Page> parse(BinaryData fileData) {
+        // Create a list to store the pages of the PDF
         List<Page> pages = new ArrayList<>();
+
+        // Begin the document analysis process using Azure's Document Intelligence service
         SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
-            client.beginAnalyzeDocument(this.modelId, fileData);
+                client.beginAnalyzeDocument(this.modelId, fileData);
 
+        // Get the final result of the document analysis
         AnalyzeResult analyzeLayoutResult = analyzeLayoutResultPoller.getFinalResult();
 
         int offset = 0;
+        // Loop through each page in the analyzed document
         for (int page_num = 0; page_num < analyzeLayoutResult.getPages().size(); page_num++) {
             DocumentPage page = analyzeLayoutResult.getPages().get(page_num);
+
+            // Create a list to store the tables on the current page
             List<DocumentTable> tables_on_page = new ArrayList<>();
 
-            if(analyzeLayoutResult.getTables() != null){
+            // If there are tables in the analyzed document, add the tables on the current page to the list
+            if (analyzeLayoutResult.getTables() != null) {
                 for (DocumentTable table : analyzeLayoutResult.getTables()) {
                     BoundingRegion boundingRegion = table.getBoundingRegions().get(0);
                     if (boundingRegion.getPageNumber() == page_num + 1) {
                         tables_on_page.add(table);
                     }
                 }
             }
-            
+
             DocumentSpan pageSpan = page.getSpans().get(0);
             int pageOffset = pageSpan.getOffset();
             int pageLength = pageSpan.getLength();
+
+            // Map each character position on the current page to the ID of the table that contains it (-1 if it is not in a table)
             int[] tableChars = new int[pageLength];
             Arrays.fill(tableChars, -1);
 
+            // Loop through each table on the current page
             for (int tableId = 0; tableId < tables_on_page.size(); tableId++) {
                 DocumentTable table = tables_on_page.get(tableId);
-                
+
+                // Loop through each span in the current table and mark the characters in the table
                 for (DocumentSpan span : table.getSpans()) {
                     for (int i = 0; i < span.getLength(); i++) {
                         int idx = span.getOffset() - pageOffset + i;
+                        // Only mark positions that fall within the current page's span
                         if (idx >= 0 && idx < pageLength) {
                             tableChars[idx] = tableId;
                         }
                     }
                 }
             }
 
+            // Create a StringBuilder to store the text of the current page
             StringBuilder pageText = new StringBuilder();
+
+            // Create a set to store the IDs of the tables that have been added to the page text
             Set<Integer> addedTables = new HashSet<>();
+
+            // Loop through each character in the array
             for (int idx = 0; idx < tableChars.length; idx++) {
                 int tableId = tableChars[idx];
                 if (tableId == -1) {
+                    // If the character is not in a table, add it to the page text
                     pageText.append(analyzeLayoutResult.getContent().substring(pageOffset + idx, pageOffset + idx + 1));
                 } else if (!addedTables.contains(tableId)) {
+                    // If the character is in a table and the table has not been added to the page text, add the table to the page text
                     DocumentTable table = tables_on_page.get(tableId);
                     pageText.append(tableToHtml(table));
                     addedTables.add(tableId);
                 }
             }
 
-            pages.add( new Page(page_num, offset, pageText.toString()));
+            // Add the current page to the list of pages
+            pages.add(new Page(page_num, offset, pageText.toString()));
+
             offset += pageText.length();
 
-                            }
+        }
         return pages;
-                        }
+    }
                     
 
     private String tableToHtml(DocumentTable table) {
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/ItextPDFParser.java
@@ -8,6 +8,11 @@
 import java.util.List;
 import java.util.ArrayList;
 
+/**
+ *  This is an implementation of a PDF parser using the open source iText library.
+ *  It can only handle text within a PDF.
+ *  It can't extract data from tables within images. See {@link DocumentIntelligencePDFParser} for that.
+ */
 public class ItextPDFParser implements PDFParser {
     @Override
     public List<Page> parse(File file) {
diff --git a/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java b/app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java
@@ -6,6 +6,11 @@
 import com.microsoft.openai.samples.indexer.SplitPage;
 
 
+/**
+ * It's responsible for splitting the text content of a list of pages into smaller sections.
+ * It does this by identifying sentence endings and word breaks, and then using these to determine where to split the text.
+ * The class also has a maximum section length, a sentence search limit, and a section overlap, which are used to fine-tune the splitting process.
+ */
 public class TextSplitter {
     private List<String> sentenceEndings;
     private List<String> wordBreaks;
@@ -15,6 +20,10 @@ public class TextSplitter {
     private boolean verbose;
 
     public TextSplitter(boolean verbose) {
+        this(verbose, 1000, 100, 100); // forward the caller's verbose flag; defaults: maxSectionLength=1000, sentenceSearchLimit=100, sectionOverlap=100
+    }
+
+    public TextSplitter(boolean verbose, int maxSectionLength, int sentenceSearchLimit, int sectionOverlap) {
         this.sentenceEndings = new ArrayList<>();
         this.sentenceEndings.add(".");
         this.sentenceEndings.add("。");
@@ -41,12 +50,11 @@ public TextSplitter(boolean verbose) {
         this.wordBreaks.add("\t");
         this.wordBreaks.add("\n");
 
-        this.maxSectionLength = 1000;
-        this.sentenceSearchLimit = 100;
-        this.sectionOverlap = 100;
+        this.maxSectionLength = maxSectionLength;
+        this.sentenceSearchLimit = sentenceSearchLimit;
+        this.sectionOverlap = sectionOverlap;
         this.verbose = verbose;
     }
-
     public List<SplitPage> splitPages(List<Page> pages) {
         List<SplitPage> splitPages = new ArrayList<>();
         StringBuilder allText = new StringBuilder();