Skip to content

Commit a028d94

Browse files
authored
feat: Add file-based operations for conversion and chunking (#229)
- Introduced support for file-based conversion and chunking workflows.
- Added synchronous and asynchronous methods for handling file paths in the API and client layers.
- Enhanced error handling and validation for file-based operations.
- Updated tests to cover new file-based use cases, including edge cases and null checks.

Fixes #227

Signed-off-by: Eric Deandrea <[email protected]>
1 parent 2a7048b commit a028d94

File tree

11 files changed

+742
-67
lines changed

11 files changed

+742
-67
lines changed

docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeChunkApi.java

Lines changed: 164 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
package ai.docling.serve.api;
22

3-
import java.util.concurrent.CompletableFuture;
3+
import java.nio.file.Path;
4+
import java.util.Optional;
5+
import java.util.concurrent.CompletionStage;
6+
7+
import org.jspecify.annotations.Nullable;
48

59
import ai.docling.serve.api.chunk.request.HierarchicalChunkDocumentRequest;
610
import ai.docling.serve.api.chunk.request.HybridChunkDocumentRequest;
711
import ai.docling.serve.api.chunk.response.ChunkDocumentResponse;
12+
import ai.docling.serve.api.util.FileUtils;
13+
import ai.docling.serve.api.util.ValidationUtils;
814

915
/**
1016
* Represents the Docling Serve Chunk API, providing methods for processing document sources
@@ -19,21 +25,120 @@ public interface DoclingServeChunkApi {
1925
ChunkDocumentResponse chunkSourceWithHierarchicalChunker(HierarchicalChunkDocumentRequest request);
2026

2127
/**
22-
* Converts and chunks the provided document source(s) into a processed document based on the specified options
23-
* and using a hybrid chunker for splitting the document into smaller chunks.
28+
* Processes and chunks the specified files into smaller, structured pieces
29+
* using a hierarchical chunker. This method internally delegates the processing
30+
* to another overloaded method with default options for the hierarchical chunker.
31+
*
32+
* @param files the files to be processed and chunked using the hierarchical chunker
33+
* @return a {@link ChunkDocumentResponse} containing the processed chunks, optionally the
34+
* converted documents, and associated metadata
35+
*/
36+
default ChunkDocumentResponse chunkFilesWithHierarchicalChunker(Path... files) {
37+
return chunkFilesWithHierarchicalChunker(null, files);
38+
}
39+
40+
/**
41+
* Processes and chunks the specified files into smaller, structured pieces
42+
* using a hierarchical chunker. This method utilizes a provided hierarchical
43+
* chunk request, applying additional configurations if needed during the
44+
* creation of the chunking request.
45+
*
46+
* @param request the request containing configurations and options for hierarchical
47+
* chunking. It may include settings for conversion, chunking parameters,
48+
* and optional output specifications. Can be null to use default options.
49+
* @param files the files to be processed and chunked using the hierarchical chunker.
50+
* @return a {@link ChunkDocumentResponse} containing the processed chunks, optionally the
51+
* converted documents, and associated metadata.
52+
*/
53+
default ChunkDocumentResponse chunkFilesWithHierarchicalChunker(@Nullable HierarchicalChunkDocumentRequest request, Path... files) {
54+
return chunkSourceWithHierarchicalChunker(createHierarchicalChunkRequest(request, files));
55+
}
56+
57+
/**
58+
* Processes and chunks the provided document source(s) into smaller documents
59+
* using a hybrid chunking strategy. The method utilizes the specified hybrid
60+
* chunker options to split and process the input request.
61+
*
62+
* @param request the request containing the document source(s), conversion options, hybrid
63+
* chunker configurations, and optional specifications for output targets
64+
* @return a {@link ChunkDocumentResponse} containing the processed chunks, optionally the
65+
* converted document, and other relevant metadata
2466
*/
2567
ChunkDocumentResponse chunkSourceWithHybridChunker(HybridChunkDocumentRequest request);
2668

69+
/**
70+
* Processes and chunks the specified files into smaller, structured pieces
71+
* using a hybrid chunking strategy. This method delegates the processing
72+
* to another overloaded method with default options for the hybrid chunker.
73+
*
74+
* @param files the files to be processed and chunked using the hybrid chunker
75+
* @return a {@link ChunkDocumentResponse} containing the processed chunks,
76+
* optionally the converted documents, and associated metadata
77+
*/
78+
default ChunkDocumentResponse chunkFilesWithHybridChunker(Path... files) {
79+
return chunkFilesWithHybridChunker(null, files);
80+
}
81+
82+
/**
83+
* Processes and chunks the specified files into smaller, structured pieces
84+
* using a hybrid chunking strategy. The method converts the input files into
85+
* a hybrid chunk request and processes them to generate a structured representation
86+
* of the content.
87+
*
88+
* @param request the request containing configurations for processing, including
89+
* conversion options, hybrid chunking parameters, and optional
90+
* specifications for output targets. Can be null to use default options.
91+
* @param files the files to be processed and chunked using the hybrid chunking strategy.
92+
* @return a {@link ChunkDocumentResponse} containing the processed chunks, optionally the
93+
* converted documents, and associated metadata.
94+
*/
95+
default ChunkDocumentResponse chunkFilesWithHybridChunker(@Nullable HybridChunkDocumentRequest request, Path... files) {
96+
return chunkSourceWithHybridChunker(createHybridChunkRequest(request, files));
97+
}
98+
2799
/**
28100
* Asynchronously processes the provided document source(s) by converting and chunking them
29101
* into smaller documents using the hierarchical chunker. This operation allows for handling
30102
* large document processing tasks without blocking the caller thread.
31103
*
32104
* @param request the request containing the document source(s) and options for hierarchical chunking
33-
* @return a CompletableFuture that resolves to a {@link ChunkDocumentResponse}, which contains
105+
* @return a {@link CompletionStage} that resolves to a {@link ChunkDocumentResponse}, which contains
34106
* the processed chunks, optionally the converted document, and processing metadata
35107
*/
36-
CompletableFuture<ChunkDocumentResponse> chunkSourceWithHierarchicalChunkerAsync(HierarchicalChunkDocumentRequest request);
108+
CompletionStage<ChunkDocumentResponse> chunkSourceWithHierarchicalChunkerAsync(HierarchicalChunkDocumentRequest request);
109+
110+
/**
111+
* Asynchronously processes and chunks the specified files into smaller, structured pieces
112+
* using a hierarchical chunker. This method delegates the processing to another overloaded
113+
* method with default options for the hierarchical chunker, leveraging non-blocking
114+
* asynchronous execution.
115+
*
116+
* @param files the files to be processed and chunked using the hierarchical chunker
117+
* @return a {@link CompletionStage} resolving to a {@link ChunkDocumentResponse}, which
118+
* includes the processed chunks, optionally the converted documents, and associated
119+
* metadata
120+
*/
121+
default CompletionStage<ChunkDocumentResponse> chunkFilesWithHierarchicalChunkerAsync(Path... files) {
122+
return chunkFilesWithHierarchicalChunkerAsync(null, files);
123+
}
124+
125+
/**
126+
* Asynchronously processes and chunks the specified files into smaller, structured pieces
127+
* using a hierarchical chunker. This method allows for non-blocking execution by delegating
128+
* the processing to an underlying method that handles hierarchical chunking configurations
129+
* and file chunking.
130+
*
131+
* @param request the request object containing configurations, options for hierarchical
132+
* chunking, and optional specifications for output targets. Can be null
133+
* to use default options for processing.
134+
* @param files the files to be processed and chunked using the hierarchical chunker.
135+
* @return a {@link CompletionStage} that resolves to a {@link ChunkDocumentResponse},
136+
* which includes the processed chunks, optionally the converted documents,
137+
* and associated metadata.
138+
*/
139+
default CompletionStage<ChunkDocumentResponse> chunkFilesWithHierarchicalChunkerAsync(@Nullable HierarchicalChunkDocumentRequest request, Path... files) {
140+
return chunkSourceWithHierarchicalChunkerAsync(createHierarchicalChunkRequest(request, files));
141+
}
37142

38143
/**
39144
* Asynchronously processes the provided document source(s) by converting and chunking them
@@ -42,8 +147,60 @@ public interface DoclingServeChunkApi {
42147
*
43148
* @param request the request containing the document source(s), options for conversion,
44149
* hybrid chunking parameters, and optional specifications for output targets
45-
* @return a CompletableFuture that resolves to a {@link ChunkDocumentResponse}, which includes
150+
* @return a {@link CompletionStage} that resolves to a {@link ChunkDocumentResponse}, which includes
46151
* the processed chunks, optionally the converted document, and relevant processing metadata
47152
*/
48-
CompletableFuture<ChunkDocumentResponse> chunkSourceWithHybridChunkerAsync(HybridChunkDocumentRequest request);
153+
CompletionStage<ChunkDocumentResponse> chunkSourceWithHybridChunkerAsync(HybridChunkDocumentRequest request);
154+
155+
/**
156+
* Asynchronously processes and chunks the provided files using a hybrid chunking strategy.
157+
*
158+
* @param files An array of file paths to be processed and chunked. Each path should represent
159+
* a valid file location.
160+
* @return A {@link CompletionStage} that, when completed, holds a {@link ChunkDocumentResponse} containing
161+
* the results of the chunking operation.
162+
*/
163+
default CompletionStage<ChunkDocumentResponse> chunkFilesWithHybridChunkerAsync(Path... files) {
164+
return chunkFilesWithHybridChunkerAsync(null, files);
165+
}
166+
167+
/**
168+
* Asynchronously processes and chunks the given files using a hybrid chunking mechanism.
169+
*
170+
* @param request An optional {@code HybridChunkDocumentRequest} containing configuration details for chunking.
171+
* If {@code null}, default settings will be applied.
172+
* @param files A varargs array of {@code Path} objects representing the files to be chunked.
173+
* Must not be null or empty.
174+
* @return A {@code CompletionStage<ChunkDocumentResponse>} that completes with the resulting
175+
* {@code ChunkDocumentResponse} once the chunking operation is finished.
176+
*/
177+
default CompletionStage<ChunkDocumentResponse> chunkFilesWithHybridChunkerAsync(@Nullable HybridChunkDocumentRequest request, Path... files) {
178+
return chunkSourceWithHybridChunkerAsync(createHybridChunkRequest(request, files));
179+
}
180+
181+
private HierarchicalChunkDocumentRequest createHierarchicalChunkRequest(@Nullable HierarchicalChunkDocumentRequest request, Path... files) {
182+
ValidationUtils.ensureNotEmpty(files, "files");
183+
184+
var builder = Optional.ofNullable(request)
185+
.map(HierarchicalChunkDocumentRequest::toBuilder)
186+
.orElseGet(HierarchicalChunkDocumentRequest::builder);
187+
188+
FileUtils.createFileSources(files)
189+
.forEach(builder::source);
190+
191+
return builder.build();
192+
}
193+
194+
private HybridChunkDocumentRequest createHybridChunkRequest(@Nullable HybridChunkDocumentRequest request, Path... files) {
195+
ValidationUtils.ensureNotEmpty(files, "files");
196+
197+
var builder = Optional.ofNullable(request)
198+
.map(HybridChunkDocumentRequest::toBuilder)
199+
.orElseGet(HybridChunkDocumentRequest::builder);
200+
201+
FileUtils.createFileSources(files)
202+
.forEach(builder::source);
203+
204+
return builder.build();
205+
}
49206
}

docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeConvertApi.java

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
package ai.docling.serve.api;
22

3-
import java.util.concurrent.CompletableFuture;
3+
import java.nio.file.Path;
4+
import java.util.Optional;
5+
import java.util.concurrent.CompletionStage;
6+
7+
import org.jspecify.annotations.Nullable;
48

59
import ai.docling.serve.api.convert.request.ConvertDocumentRequest;
610
import ai.docling.serve.api.convert.response.ConvertDocumentResponse;
11+
import ai.docling.serve.api.util.FileUtils;
12+
import ai.docling.serve.api.util.ValidationUtils;
713

814
/**
915
* Interface representing the Docling Serve Convert API.
@@ -19,9 +25,34 @@ public interface DoclingServeConvertApi {
1925
*/
2026
ConvertDocumentResponse convertSource(ConvertDocumentRequest request);
2127

28+
/**
29+
* Converts the specified files into a processed document using default options.
30+
* This is a convenience method that delegates to {@link #convertFiles(ConvertDocumentRequest, Path...)}
31+
* with a null request.
32+
*
33+
* @param files an array of {@link Path} objects representing the file paths to be converted
34+
* @return a {@link ConvertDocumentResponse} containing the processed document data, metadata, and any errors
35+
*/
36+
default ConvertDocumentResponse convertFiles(Path... files) {
37+
return convertFiles(null, files);
38+
}
39+
40+
/**
41+
* Converts the specified files into a processed document based on the options provided in the request.
42+
* If the request is null, default conversion options are applied.
43+
*
44+
* @param request an optional {@link ConvertDocumentRequest} specifying conversion settings and parameters
45+
* @param files an array of {@link Path} objects representing the file paths to be converted
46+
* @return a {@link ConvertDocumentResponse} containing the processed document data, any errors encountered,
47+
* and additional processing metadata
48+
*/
49+
default ConvertDocumentResponse convertFiles(@Nullable ConvertDocumentRequest request, Path... files) {
50+
return convertSource(createRequest(request, files));
51+
}
52+
2253
/**
2354
* Initiates an asynchronous conversion of the provided document source(s) and returns a
24-
* {@link CompletableFuture} that completes when the conversion is done.
55+
* {@link CompletionStage} that completes when the conversion is done.
2556
*
2657
* <p>This method starts the conversion, polls the status in the background, and completes
2758
* the returned stage with the result when the conversion finishes.
@@ -35,9 +66,51 @@ public interface DoclingServeConvertApi {
3566
* }</pre>
3667
*
3768
* @param request the {@link ConvertDocumentRequest} containing the source(s) and conversion options.
38-
* @return a {@link CompletableFuture} that completes with the {@link ConvertDocumentResponse}
69+
* @return a {@link CompletionStage} that completes with the {@link ConvertDocumentResponse}
3970
* when the conversion is finished, or completes exceptionally if the conversion fails
4071
* or times out.
4172
*/
42-
CompletableFuture<ConvertDocumentResponse> convertSourceAsync(ConvertDocumentRequest request);
73+
CompletionStage<ConvertDocumentResponse> convertSourceAsync(ConvertDocumentRequest request);
74+
75+
/**
76+
* Initiates an asynchronous conversion of the provided files into a processed document
77+
* using default conversion options.
78+
*
79+
* @param files an array of {@link Path} objects representing the file paths to be converted
80+
* @return a {@link CompletionStage} that completes with the {@link ConvertDocumentResponse}
81+
* when the conversion finishes, or completes exceptionally if the conversion fails
82+
* or times out
83+
*/
84+
default CompletionStage<ConvertDocumentResponse> convertFilesAsync(Path... files) {
85+
return convertFilesAsync(null, files);
86+
}
87+
88+
/**
89+
* Initiates an asynchronous conversion of the specified files into a processed document
90+
* using the provided conversion request options. If the request is null, default conversion
91+
* options are applied.
92+
*
93+
* @param request an optional {@link ConvertDocumentRequest} containing conversion settings
94+
* and parameters, or null to use default options
95+
* @param files an array of {@link Path} objects representing the file paths to be converted
96+
* @return a {@link CompletionStage} that completes with the {@link ConvertDocumentResponse}
97+
* when the conversion finishes, or completes exceptionally if the conversion fails
98+
* or times out
99+
*/
100+
default CompletionStage<ConvertDocumentResponse> convertFilesAsync(@Nullable ConvertDocumentRequest request, Path... files) {
101+
return convertSourceAsync(createRequest(request, files));
102+
}
103+
104+
private ConvertDocumentRequest createRequest(@Nullable ConvertDocumentRequest request, Path... files) {
105+
ValidationUtils.ensureNotEmpty(files, "files");
106+
107+
var builder = Optional.ofNullable(request)
108+
.map(ConvertDocumentRequest::toBuilder)
109+
.orElseGet(ConvertDocumentRequest::builder);
110+
111+
FileUtils.createFileSources(files)
112+
.forEach(builder::source);
113+
114+
return builder.build();
115+
}
43116
}

0 commit comments

Comments
 (0)