
Commit 0adbbbf

Merge pull request #239 from johnoliver/rag-chunker
Add plugin for document text splitting
2 parents 8f43b96 + b756361 commit 0adbbbf

File tree

28 files changed: +1361 -2 lines

samples/semantickernel-concepts/semantickernel-syntax-examples/pom.xml

Lines changed: 12 additions & 1 deletion
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>com.microsoft.semantic-kernel</groupId>
@@ -100,6 +101,16 @@
             <groupId>com.microsoft.semantic-kernel</groupId>
             <artifactId>semantickernel-aiservices-google</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.microsoft.semantic-kernel</groupId>
+            <artifactId>semantickernel-text-splitter-plugin</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>3.0.3</version>
+        </dependency>

         <dependency>
             <groupId>com.google.cloud</groupId>

DocumentSplittingExample.java (new file)

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantickernel.samples.syntaxexamples.rag;

import com.microsoft.semantic.kernel.rag.splitting.Chunk;
import com.microsoft.semantic.kernel.rag.splitting.Document;
import com.microsoft.semantic.kernel.rag.splitting.Splitter;
import com.microsoft.semantic.kernel.rag.splitting.TextSplitter;
import com.microsoft.semantic.kernel.rag.splitting.document.TextDocument;
import com.microsoft.semantic.kernel.rag.splitting.overlap.NoOverlapCondition;
import com.microsoft.semantic.kernel.rag.splitting.splitconditions.CountSplitCondition;
import com.microsoft.semantic.kernel.rag.splitting.splitconditions.SplitPoint;
import com.microsoft.semantickernel.implementation.EmbeddedResourceLoader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

public class DocumentSplittingExample {

    private static String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf";

    private static class PDFDocument implements Document {

        private final byte[] pdf;

        private PDFDocument(byte[] pdf) {
            this.pdf = pdf;
        }

        @Override
        public Flux<String> getContent() {
            try {
                PDFParser parser = new PDFParser(
                    RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf)));
                PDDocument document = parser.parse();
                String text = new PDFTextStripper().getText(document);

                return Flux.just(text);
            } catch (IOException e) {
                return Flux.error(e);
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        useCustomChunker();
        useInbuiltChunker();
    }

    private static void useInbuiltChunker() throws IOException, InterruptedException {
        byte[] pdfBytes = getPdfDoc();
        PDFDocument pdfDoc = new PDFDocument(pdfBytes);

        Splitter splitter = Splitter
            .builder()
            .maxParagraphsPerChunk(4)
            .overlapNPercent(30.0f)
            .trimWhitespace()
            .build();

        List<Chunk> chunks = splitter
            .splitDocument(pdfDoc)
            .collectList()
            .block();

        chunks
            .forEach(chunk -> {
                System.out.println("=========");
                System.out.println(chunk.getContents());
            });
    }

    public static void useCustomChunker() throws IOException, InterruptedException {

        String example = EmbeddedResourceLoader.readFile("example.md",
            DocumentSplittingExample.class);

        // Define how we are splitting tokens, in this case we are splitting on headers of an md file
        // i.e <new line> followed by one or more # characters
        TextSplitter textSplitter = (doc, numTokens) -> {
            // Split on headers
            Pattern pattern = Pattern.compile("(\\r?\\n|\\r)\s*#+", Pattern.MULTILINE);

            Flux<Integer> splitPoints = Flux.fromStream(pattern.matcher(doc).results())
                .map(window -> window.start());

            return createWindows(doc, splitPoints);
        };

        // Split into single sections
        CountSplitCondition condition = new CountSplitCondition(1, textSplitter);

        Splitter splitter = Splitter
            .builder()
            .addChunkEndCondition(condition)
            // No overlap
            .setOverlapCondition(NoOverlapCondition.build())
            // Tidy up the text
            .trimWhitespace()
            .build();

        String chunks = splitter
            .splitDocument(new TextDocument(example))
            .collectList()
            .map(it -> it.stream()
                .map(chunk -> chunk.getContents())
                .collect(Collectors.joining("\n============\n")))
            .block();

        System.out.println(chunks);
    }

    /*
     * Transforms: [ 2, 10, 20, 100 ] -> [ (0, 2), (2, 10), (10, 20), (20, 100), (100, <doc length>) ]
     */
    private static List<SplitPoint> createWindows(String doc, Flux<Integer> splitPoints) {
        return Flux.concat(
            Flux.just(0),
            splitPoints,
            Flux.just(doc.length()))
            .window(2, 1)
            .concatMap(window -> {
                return window.collectList()
                    .flatMap(list -> {
                        if (list.size() <= 1) {
                            return Mono.empty();
                        }
                        return Mono.just(
                            new SplitPoint(list.get(0), list.get(1)));
                    });
            })
            .collectList()
            .block();
    }

    private static byte[] getPdfDoc() throws IOException, InterruptedException {
        HttpResponse<byte[]> doc = HttpClient.newHttpClient()
            .send(HttpRequest.newBuilder()
                .GET()
                .uri(URI.create(BENEFITS_DOC))
                .build(),
                BodyHandlers.ofByteArray());
        return doc.body();
    }
}

example.md (new file)

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
## Section 1

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna
aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis
aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

## Section 2

Another section.

### Subsection 1

1, 2, 3, 4, 5, 6, 7, 8, 9, 10.

# Section 3

This is the last section.

```
some code
```

samples/semantickernel-sample-plugins/pom.xml

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" ?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>com.microsoft.semantic-kernel</groupId>
@@ -15,5 +16,6 @@
     <modules>
         <module>semantickernel-openapi-plugin</module>
         <module>semantickernel-presidio-plugin</module>
+        <module>semantickernel-text-splitter-plugin</module>
     </modules>
 </project>

semantickernel-text-splitter-plugin/pom.xml (new file)

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.microsoft.semantic-kernel</groupId>
        <artifactId>semantickernel-sample-plugins</artifactId>
        <version>1.3.1-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>semantickernel-text-splitter-plugin</artifactId>
    <name>semantickernel-text-splitter-plugin</name>
    <packaging>jar</packaging>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.microsoft.semantic-kernel</groupId>
                <artifactId>semantickernel-bom</artifactId>
                <version>${project.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>com.microsoft.semantic-kernel</groupId>
            <artifactId>semantickernel-api</artifactId>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j2-impl</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>

Chunk.java (new file)

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

public class Chunk {

    private final String chunk;

    public Chunk(String chunk) {
        this.chunk = chunk;
    }

    public String getContents() {
        return chunk;
    }

}

ChunkEndCondition.java (new file)

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines the condition that should be met for a chunk to be considered full.
 */
public interface ChunkEndCondition {

    /**
     * Accepts a string and returns the number of characters that should be considered as the end of
     * the FIRST chunk within the string. This method will be subsequently called until all pages
     * are found.
     * <p>
     * Return -1 if the value does not contain enough characters to be considered as a full chunk.
     *
     * @param value the value to be checked
     * @return the index of the character that should be considered as the end of the first chunk in
     *         the string
     */
    public int getEndOfNextChunk(String value);

}

ChunkPostProcessor.java (new file)

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * A post processor that processes a chunk after it has been split.
 */
public interface ChunkPostProcessor {
    Chunk process(Chunk chunk);
}

Document.java (new file)

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

import reactor.core.publisher.Flux;

/**
 * A document to be read and split into chunks.
 */
public interface Document {
    Flux<String> getContent();
}

OverlapCondition.java (new file)

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines how much overlap is allowed between two pages.
 */
public interface OverlapCondition {

    /**
     * Returns the index of the first character that should be considered as the beginning of the
     * overlap.
     *
     * @param chunk the chunk to be checked
     * @return the index of the first character that should be considered as the beginning of the
     *         overlap
     */
    public int getOverlapIndex(String chunk);

}
