Skip to content

Commit fcf586f

Browse files
authored
feat: Add S3-based source and target support with enhanced extensibility (#256)
* feat: Add S3-based source and target support with enhanced extensibility - Introduced `S3Source` and `S3Target` for S3-based document processing. - Refactored chunking request hierarchy for better reusability. - Added test coverage for S3 workflow validation. - Updated `DoclingServeApi` to support S3 credentials and configuration. - Enhanced documentation to include S3-based examples. Fixes #254 Signed-off-by: Eric Deandrea <[email protected]> * feat: Add S3-based source and target support with enhanced extensibility - Introduced `S3Source` and `S3Target` for S3-based document processing. - Refactored chunking request hierarchy for better reusability. - Added test coverage for S3 workflow validation. - Updated `DoclingServeApi` to support S3 credentials and configuration. - Enhanced documentation to include S3-based examples. Fixes #254 Signed-off-by: Eric Deandrea <[email protected]> --------- Signed-off-by: Eric Deandrea <[email protected]>
1 parent d4c676d commit fcf586f

File tree

20 files changed

+630
-148
lines changed

20 files changed

+630
-148
lines changed

docling-serve/docling-serve-api/src/main/java/ai/docling/serve/api/DoclingServeChunkApi.java

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
package ai.docling.serve.api;
22

33
import java.nio.file.Path;
4-
import java.util.Optional;
54
import java.util.concurrent.CompletionStage;
65

76
import org.jspecify.annotations.Nullable;
87

8+
import ai.docling.serve.api.chunk.request.ChunkType;
99
import ai.docling.serve.api.chunk.request.HierarchicalChunkDocumentRequest;
1010
import ai.docling.serve.api.chunk.request.HybridChunkDocumentRequest;
1111
import ai.docling.serve.api.chunk.response.ChunkDocumentResponse;
@@ -41,6 +41,20 @@ default ChunkDocumentResponse chunkFilesWithHierarchicalChunker(Path... files) {
4141
return chunkFilesWithHierarchicalChunker(null, files);
4242
}
4343

44+
/**
45+
* Splits the given files into smaller chunks by delegating to the chunker that matches the specified chunking type: {@code HYBRID} calls {@code chunkFilesWithHybridChunker(files)} and {@code HIERARCHICAL} calls {@code chunkFilesWithHierarchicalChunker(files)}.
46+
*
47+
* @param chunkType the type of chunking to be applied, determining the chunking strategy; must not be {@code null}, because switching on a {@code null} enum value throws {@link NullPointerException}
48+
* @param files the array of file paths to be processed and chunked; passed unchanged to the selected chunker method
49+
* @return a ChunkDocumentResponse containing the results of the chunking operation, exactly as returned by the selected chunker method
50+
*/
51+
default ChunkDocumentResponse chunkFiles(ChunkType chunkType, Path... files) {
52+
return switch(chunkType) {
53+
case HYBRID -> chunkFilesWithHybridChunker(files);
54+
case HIERARCHICAL -> chunkFilesWithHierarchicalChunker(files);
55+
};
56+
}
57+
4458
/**
4559
* Processes and chunks the specified files into smaller, structured pieces
4660
* using a hierarchical chunker. This method utilizes a provided hierarchical
@@ -151,6 +165,20 @@ default CompletionStage<ChunkDocumentResponse> chunkFilesWithHierarchicalChunker
151165
return chunkSourceWithHierarchicalChunkerAsync(createHierarchicalChunkRequest(request, files));
152166
}
153167

168+
/**
169+
* Asynchronously chunks the provided files by delegating to the async chunker that matches the specified chunk type: {@code HYBRID} calls {@code chunkFilesWithHybridChunkerAsync(files)} and {@code HIERARCHICAL} calls {@code chunkFilesWithHierarchicalChunkerAsync(files)}.
170+
*
171+
* @param chunkType the type of chunking to apply, determining the chunking strategy; must not be {@code null}, because the switch throws {@link NullPointerException} directly from this method (not through the returned stage)
172+
* @param files the array of file paths to be chunked; passed unchanged to the selected async chunker method
173+
* @return a CompletionStage that, when completed, contains the result of the chunking operation as a ChunkDocumentResponse; this is the stage returned by the selected async chunker method
174+
*/
175+
default CompletionStage<ChunkDocumentResponse> chunkFilesAsync(ChunkType chunkType, Path... files) {
176+
return switch(chunkType) {
177+
case HYBRID -> chunkFilesWithHybridChunkerAsync(files);
178+
case HIERARCHICAL -> chunkFilesWithHierarchicalChunkerAsync(files);
179+
};
180+
}
181+
154182
/**
155183
* Asynchronously processes the provided document source(s) by converting and chunking them
156184
* into smaller documents using the hybrid chunker. This operation facilitates non-blocking
@@ -195,9 +223,9 @@ default CompletionStage<ChunkDocumentResponse> chunkFilesWithHybridChunkerAsync(
195223
private HierarchicalChunkDocumentRequest createHierarchicalChunkRequest(@Nullable HierarchicalChunkDocumentRequest request, Path... files) {
196224
ValidationUtils.ensureNotEmpty(files, "files");
197225

198-
var builder = Optional.ofNullable(request)
199-
.map(HierarchicalChunkDocumentRequest::toBuilder)
200-
.orElseGet(HierarchicalChunkDocumentRequest::builder);
226+
var builder = (request != null) ?
227+
request.toBuilder() :
228+
HierarchicalChunkDocumentRequest.builder();
201229

202230
FileUtils.createFileSources(files)
203231
.forEach(builder::source);
@@ -208,9 +236,9 @@ private HierarchicalChunkDocumentRequest createHierarchicalChunkRequest(@Nullabl
208236
private HybridChunkDocumentRequest createHybridChunkRequest(@Nullable HybridChunkDocumentRequest request, Path... files) {
209237
ValidationUtils.ensureNotEmpty(files, "files");
210238

211-
var builder = Optional.ofNullable(request)
212-
.map(HybridChunkDocumentRequest::toBuilder)
213-
.orElseGet(HybridChunkDocumentRequest::builder);
239+
var builder = (request != null) ?
240+
request.toBuilder() :
241+
HybridChunkDocumentRequest.builder();
214242

215243
FileUtils.createFileSources(files)
216244
.forEach(builder::source);
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package ai.docling.serve.api.chunk.request;
2+
3+
import java.util.List;
4+
5+
import org.jspecify.annotations.Nullable;
6+
7+
import com.fasterxml.jackson.annotation.JsonInclude;
8+
import com.fasterxml.jackson.annotation.JsonProperty;
9+
import com.fasterxml.jackson.annotation.JsonSetter;
10+
import com.fasterxml.jackson.annotation.Nulls;
11+
12+
import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions;
13+
import ai.docling.serve.api.convert.request.source.Source;
14+
import ai.docling.serve.api.convert.request.target.Target;
15+
16+
@JsonInclude(JsonInclude.Include.NON_EMPTY)
17+
@tools.jackson.databind.annotation.JsonDeserialize(builder = ChunkDocumentRequest.ChunkDocumentRequestBuilder.class)
18+
@lombok.experimental.SuperBuilder(toBuilder = true)
19+
@lombok.Getter
20+
@lombok.ToString
21+
public sealed abstract class ChunkDocumentRequest permits HierarchicalChunkDocumentRequest, HybridChunkDocumentRequest {
22+
/**
23+
* List of input document sources to process.
24+
*
25+
* @param sources the list of document sources
26+
* @return the list of document sources
27+
*/
28+
@JsonProperty("sources")
29+
@JsonSetter(nulls = Nulls.AS_EMPTY)
30+
@lombok.Singular
31+
private List<Source> sources;
32+
33+
/**
34+
* Conversion options.
35+
*
36+
* @param options the conversion options
37+
* @return the conversion options
38+
*/
39+
@JsonProperty("convert_options")
40+
@lombok.NonNull
41+
@lombok.Builder.Default
42+
private ConvertDocumentOptions options = ConvertDocumentOptions.builder().build();
43+
44+
/**
45+
* Specification for the type of output target.
46+
*
47+
* @param target the output target specification, or null if not specified
48+
* @return the output target specification, or null if not specified
49+
*/
50+
@JsonProperty("target")
51+
@Nullable
52+
private Target target;
53+
54+
/**
55+
* If true, the output will include both the chunks and the converted document.
56+
*
57+
* @param includeConvertedDoc true if the converted document should be included, false otherwise
58+
* @return true if the converted document should be included, false otherwise
59+
*/
60+
@JsonProperty("include_converted_doc")
61+
private boolean includeConvertedDoc;
62+
63+
@tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "")
64+
public static abstract class ChunkDocumentRequestBuilder<C extends ChunkDocumentRequest, B extends ChunkDocumentRequestBuilder<C, B>> {
65+
// Lombok's @SuperBuilder generates the actual implementation
66+
}
67+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package ai.docling.serve.api.chunk.request;
2+
3+
/**
4+
* Defines the types of chunking mechanisms supported for document processing.
5+
*
6+
* <p>The enum provides options for selecting distinct strategies to parse and divide
7+
* documents into manageable segments for further processing or analysis.
8+
*/
9+
public enum ChunkType {
10+
/**
11+
* Represents a hybrid chunking type, which combines features of multiple
12+
* chunking strategies to process documents in a flexible manner.
13+
*
14+
* <p>Used in scenarios where the benefits of both sequential and hierarchical
15+
* chunking approaches are desired, enabling customization based on specific
16+
* processing requirements.
17+
*/
// NOTE(review): the "sequential and hierarchical" wording above cannot be confirmed from this file — verify it against the upstream Docling hybrid chunker documentation.
18+
HYBRID,
19+
20+
/**
21+
* Represents a hierarchical chunking type, which processes documents by
22+
* breaking them into a nested structure of smaller chunks.
23+
*
24+
* <p>Used in scenarios where maintaining a clear hierarchy within the document
25+
* structure is critical, enabling processing at multiple levels of granularity.
26+
*/
27+
HIERARCHICAL
28+
}
Lines changed: 5 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,21 @@
11
package ai.docling.serve.api.chunk.request;
22

3-
import java.util.List;
4-
5-
import org.jspecify.annotations.Nullable;
6-
73
import com.fasterxml.jackson.annotation.JsonInclude;
84
import com.fasterxml.jackson.annotation.JsonProperty;
9-
import com.fasterxml.jackson.annotation.JsonSetter;
10-
import com.fasterxml.jackson.annotation.Nulls;
115

126
import ai.docling.serve.api.chunk.request.options.HierarchicalChunkerOptions;
13-
import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions;
14-
import ai.docling.serve.api.convert.request.source.Source;
15-
import ai.docling.serve.api.convert.request.target.Target;
167

178
/**
189
* Represents a request to convert a document and chunk it into smaller documents
1910
* using the Docling hierarchical chunker.
2011
*/
2112
@JsonInclude(JsonInclude.Include.NON_EMPTY)
22-
@tools.jackson.databind.annotation.JsonDeserialize(builder = HierarchicalChunkDocumentRequest.Builder.class)
13+
@tools.jackson.databind.annotation.JsonDeserialize(builder = HierarchicalChunkDocumentRequest.BuilderImpl.class)
2314
@lombok.extern.jackson.Jacksonized
24-
@lombok.Builder(toBuilder = true)
15+
@lombok.experimental.SuperBuilder(toBuilder = true)
2516
@lombok.Getter
26-
@lombok.ToString
27-
public class HierarchicalChunkDocumentRequest {
28-
29-
/**
30-
* List of input document sources to process.
31-
*
32-
* @param sources the list of document sources
33-
* @return the list of document sources
34-
*/
35-
@JsonProperty("sources")
36-
@JsonSetter(nulls = Nulls.AS_EMPTY)
37-
@lombok.Singular
38-
private List<Source> sources;
39-
40-
/**
41-
* Conversion options.
42-
*
43-
* @param options the conversion options
44-
* @return the conversion options
45-
*/
46-
@JsonProperty("convert_options")
47-
@lombok.NonNull
48-
@lombok.Builder.Default
49-
private ConvertDocumentOptions options = ConvertDocumentOptions.builder().build();
50-
51-
/**
52-
* Specification for the type of output target.
53-
*
54-
* @param target the output target specification, or null if not specified
55-
* @return the output target specification, or null if not specified
56-
*/
57-
@JsonProperty("target")
58-
@Nullable
59-
private Target target;
60-
61-
/**
62-
* If true, the output will include both the chunks and the converted document.
63-
*
64-
* @param includeConvertedDoc true if the converted document should be included, false otherwise
65-
* @return true if the converted document should be included, false otherwise
66-
*/
67-
@JsonProperty("include_converted_doc")
68-
private boolean includeConvertedDoc;
69-
17+
@lombok.ToString(callSuper = true)
18+
public final class HierarchicalChunkDocumentRequest extends ChunkDocumentRequest {
7019
/**
7120
* Options specific to the chunker.
7221
*
@@ -93,6 +42,5 @@ public class HierarchicalChunkDocumentRequest {
9342
* </ul>
9443
*/
9544
@tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "")
96-
public static class Builder { }
97-
45+
public static abstract class HierarchicalChunkDocumentRequestBuilder extends ChunkDocumentRequest.ChunkDocumentRequestBuilder<HierarchicalChunkDocumentRequest, HierarchicalChunkDocumentRequestBuilder> { }
9846
}
Lines changed: 5 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,21 @@
11
package ai.docling.serve.api.chunk.request;
22

3-
import java.util.List;
4-
5-
import org.jspecify.annotations.Nullable;
6-
73
import com.fasterxml.jackson.annotation.JsonInclude;
84
import com.fasterxml.jackson.annotation.JsonProperty;
9-
import com.fasterxml.jackson.annotation.JsonSetter;
10-
import com.fasterxml.jackson.annotation.Nulls;
115

126
import ai.docling.serve.api.chunk.request.options.HybridChunkerOptions;
13-
import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions;
14-
import ai.docling.serve.api.convert.request.source.Source;
15-
import ai.docling.serve.api.convert.request.target.Target;
167

178
/**
189
* Represents a request to convert a document and chunk it into smaller documents
1910
* using the Docling hybrid chunker.
2011
*/
2112
@JsonInclude(JsonInclude.Include.NON_EMPTY)
22-
@tools.jackson.databind.annotation.JsonDeserialize(builder = HybridChunkDocumentRequest.Builder.class)
13+
@tools.jackson.databind.annotation.JsonDeserialize(builder = HybridChunkDocumentRequest.BuilderImpl.class)
2314
@lombok.extern.jackson.Jacksonized
24-
@lombok.Builder(toBuilder = true)
15+
@lombok.experimental.SuperBuilder(toBuilder = true)
2516
@lombok.Getter
26-
@lombok.ToString
27-
public class HybridChunkDocumentRequest {
28-
29-
/**
30-
* List of input document sources to process.
31-
*
32-
* @param sources the list of document sources
33-
* @return the list of document sources
34-
*/
35-
@JsonProperty("sources")
36-
@JsonSetter(nulls = Nulls.AS_EMPTY)
37-
@lombok.Singular
38-
private List<Source> sources;
39-
40-
/**
41-
* Conversion options.
42-
*
43-
* @param options the conversion options
44-
* @return the conversion options
45-
*/
46-
@JsonProperty("convert_options")
47-
@lombok.NonNull
48-
@lombok.Builder.Default
49-
private ConvertDocumentOptions options = ConvertDocumentOptions.builder().build();
50-
51-
/**
52-
* Specification for the type of output target.
53-
*
54-
* @param target the output target specification, or null if not specified
55-
* @return the output target specification, or null if not specified
56-
*/
57-
@JsonProperty("target")
58-
@Nullable
59-
private Target target;
60-
61-
/**
62-
* If true, the output will include both the chunks and the converted document.
63-
*
64-
* @param includeConvertedDoc true if the converted document should be included, false otherwise
65-
* @return true if the converted document should be included, false otherwise
66-
*/
67-
@JsonProperty("include_converted_doc")
68-
private boolean includeConvertedDoc;
69-
17+
@lombok.ToString(callSuper = true)
18+
public final class HybridChunkDocumentRequest extends ChunkDocumentRequest {
7019
/**
7120
* Options specific to the chunker.
7221
*
@@ -93,6 +42,5 @@ public class HybridChunkDocumentRequest {
9342
* </ul>
9443
*/
9544
@tools.jackson.databind.annotation.JsonPOJOBuilder(withPrefix = "")
96-
public static class Builder { }
97-
45+
public static abstract class HybridChunkDocumentRequestBuilder extends ChunkDocumentRequest.ChunkDocumentRequestBuilder<HybridChunkDocumentRequest, HybridChunkDocumentRequestBuilder> { }
9846
}

0 commit comments

Comments
 (0)