|
32 | 32 | import org.slf4j.LoggerFactory;
|
33 | 33 |
|
34 | 34 |
|
| 35 | +/** |
| 36 | + * This is an implementation of a PDF parser using Azure's Document Intelligence service. |
| 37 | + * It is designed to extract text and table data from PDF files and convert them into a structured format. |
| 38 | + * |
| 39 | + * It initializes an instance of DocumentAnalysisClient from Azure's Document Intelligence service in the constructor. |
| 40 | + * It provides two parse methods, one accepting a File object and another accepting a byte array. Both methods convert the input into BinaryData and pass it to a private parse method. |
| 41 | + * The private parse method sends the BinaryData to Azure's Document Intelligence service for analysis. It then processes the analysis result, extracting text and table data from each page of the PDF. Tables are converted into HTML format. |
| 42 | + * The tableToHtml method is used to convert a DocumentTable object into an HTML table. It handles row and column spans and escapes any HTML characters in the cell content. |
| 43 | + */ |
35 | 44 | public class DocumentIntelligencePDFParser implements PDFParser {
|
36 | 45 | private static final Logger logger = LoggerFactory.getLogger(DocumentIntelligencePDFParser.class);
|
37 | 46 |
|
38 | 47 | private final DocumentAnalysisClient client;
|
39 | 48 | private boolean verbose = false;
|
40 | 49 | private String modelId = "prebuilt-layout";
|
41 | 50 |
|
| 51 | + |
42 | 52 | public DocumentIntelligencePDFParser(String serviceName, TokenCredential tokenCredential, Boolean verbose) {
|
43 | 53 | this.client = new DocumentAnalysisClientBuilder()
|
44 | 54 | .endpoint("https://%s.cognitiveservices.azure.com/".formatted(serviceName))
|
@@ -66,64 +76,86 @@ public List<Page> parse(byte[] content) {
|
66 | 76 | }
|
67 | 77 |
|
68 | 78 | private List<Page> parse(BinaryData fileData) {
|
| 79 | + // Create a list to store the pages of the PDF |
69 | 80 | List<Page> pages = new ArrayList<>();
|
| 81 | + |
| 82 | + // Begin the document analysis process using Azure's Document Intelligence service |
70 | 83 | SyncPoller<OperationResult, AnalyzeResult> analyzeLayoutResultPoller =
|
71 |
| - client.beginAnalyzeDocument(this.modelId, fileData); |
| 84 | + client.beginAnalyzeDocument(this.modelId, fileData); |
72 | 85 |
|
| 86 | + // Get the final result of the document analysis |
73 | 87 | AnalyzeResult analyzeLayoutResult = analyzeLayoutResultPoller.getFinalResult();
|
74 | 88 |
|
75 | 89 | int offset = 0;
|
| 90 | + // Loop through each page in the analyzed document |
76 | 91 | for (int page_num = 0; page_num < analyzeLayoutResult.getPages().size(); page_num++) {
|
77 | 92 | DocumentPage page = analyzeLayoutResult.getPages().get(page_num);
|
| 93 | + |
| 94 | + // Create a list to store the tables on the current page |
78 | 95 | List<DocumentTable> tables_on_page = new ArrayList<>();
|
79 | 96 |
|
80 |
| - if(analyzeLayoutResult.getTables() != null){ |
| 97 | + // If there are tables in the analyzed document, add the tables on the current page to the list |
| 98 | + if (analyzeLayoutResult.getTables() != null) { |
81 | 99 | for (DocumentTable table : analyzeLayoutResult.getTables()) {
|
82 | 100 | BoundingRegion boundingRegion = table.getBoundingRegions().get(0);
|
83 | 101 | if (boundingRegion.getPageNumber() == page_num + 1) {
|
84 | 102 | tables_on_page.add(table);
|
85 | 103 | }
|
86 | 104 | }
|
87 | 105 | }
|
88 |
| - |
| 106 | + |
89 | 107 | DocumentSpan pageSpan = page.getSpans().get(0);
|
90 | 108 | int pageOffset = pageSpan.getOffset();
|
91 | 109 | int pageLength = pageSpan.getLength();
|
| 110 | + |
| 111 | + // Create an array to store the characters in the tables on the current page |
92 | 112 | int[] tableChars = new int[pageLength];
|
93 | 113 | Arrays.fill(tableChars, -1);
|
94 | 114 |
|
| 115 | + // Loop through each table on the current page |
95 | 116 | for (int tableId = 0; tableId < tables_on_page.size(); tableId++) {
|
96 | 117 | DocumentTable table = tables_on_page.get(tableId);
|
97 |
| - |
| 118 | + |
| 119 | + // Loop through each span in the current table and mark the characters in the table |
98 | 120 | for (DocumentSpan span : table.getSpans()) {
|
99 | 121 | for (int i = 0; i < span.getLength(); i++) {
|
100 | 122 | int idx = span.getOffset() - pageOffset + i;
|
| 123 | + // If the character is in the current table, store the table ID in the array |
101 | 124 | if (idx >= 0 && idx < pageLength) {
|
102 | 125 | tableChars[idx] = tableId;
|
103 | 126 | }
|
104 | 127 | }
|
105 | 128 | }
|
106 | 129 | }
|
107 | 130 |
|
| 131 | + // Create a StringBuilder to store the text of the current page |
108 | 132 | StringBuilder pageText = new StringBuilder();
|
| 133 | + |
| 134 | + // Create a set to store the IDs of the tables that have been added to the page text |
109 | 135 | Set<Integer> addedTables = new HashSet<>();
|
| 136 | + |
| 137 | + // Loop through each character in the array |
110 | 138 | for (int idx = 0; idx < tableChars.length; idx++) {
|
111 | 139 | int tableId = tableChars[idx];
|
112 | 140 | if (tableId == -1) {
|
| 141 | + // If the character is not in a table, add it to the page text |
113 | 142 | pageText.append(analyzeLayoutResult.getContent().substring(pageOffset + idx, pageOffset + idx + 1));
|
114 | 143 | } else if (!addedTables.contains(tableId)) {
|
| 144 | + // If the character is in a table and the table has not been added to the page text, add the table to the page text |
115 | 145 | DocumentTable table = tables_on_page.get(tableId);
|
116 | 146 | pageText.append(tableToHtml(table));
|
117 | 147 | addedTables.add(tableId);
|
118 | 148 | }
|
119 | 149 | }
|
120 | 150 |
|
121 |
| - pages.add( new Page(page_num, offset, pageText.toString())); |
| 151 | + // Add the current page to the list of pages |
| 152 | + pages.add(new Page(page_num, offset, pageText.toString())); |
| 153 | + |
122 | 154 | offset += pageText.length();
|
123 | 155 |
|
124 |
| - } |
| 156 | + } |
125 | 157 | return pages;
|
126 |
| - } |
| 158 | + } |
127 | 159 |
|
128 | 160 |
|
129 | 161 | private String tableToHtml(DocumentTable table) {
|
|
0 commit comments