Skip to content

Commit b9d935a

Browse files
committed
Add plugin for document text splitting
1 parent 8bee95b commit b9d935a

File tree

27 files changed

+1254
-2
lines changed

27 files changed

+1254
-2
lines changed

samples/semantickernel-concepts/semantickernel-syntax-examples/pom.xml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version="1.0" encoding="UTF-8"?>
2-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
34
<modelVersion>4.0.0</modelVersion>
45
<parent>
56
<groupId>com.microsoft.semantic-kernel</groupId>
@@ -81,6 +82,16 @@
8182
<groupId>com.microsoft.semantic-kernel</groupId>
8283
<artifactId>semantickernel-aiservices-google</artifactId>
8384
</dependency>
85+
<dependency>
86+
<groupId>com.microsoft.semantic-kernel</groupId>
87+
<artifactId>semantickernel-text-splitter-plugin</artifactId>
88+
<version>${project.version}</version>
89+
</dependency>
90+
<dependency>
91+
<groupId>org.apache.pdfbox</groupId>
92+
<artifactId>pdfbox</artifactId>
93+
<version>3.0.3</version>
94+
</dependency>
8495

8596
<dependency>
8697
<groupId>com.google.cloud</groupId>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantickernel.samples.syntaxexamples.rag;
3+
4+
import com.microsoft.semantic.kernel.rag.splitting.Chunk;
5+
import com.microsoft.semantic.kernel.rag.splitting.Document;
6+
import com.microsoft.semantic.kernel.rag.splitting.Splitter;
7+
import java.io.ByteArrayInputStream;
8+
import java.io.IOException;
9+
import java.net.URI;
10+
import java.net.http.HttpClient;
11+
import java.net.http.HttpRequest;
12+
import java.net.http.HttpResponse;
13+
import java.net.http.HttpResponse.BodyHandlers;
14+
import java.util.List;
15+
import org.apache.pdfbox.io.RandomAccessReadBuffer;
16+
import org.apache.pdfbox.pdfparser.PDFParser;
17+
import org.apache.pdfbox.pdmodel.PDDocument;
18+
import org.apache.pdfbox.text.PDFTextStripper;
19+
import reactor.core.publisher.Flux;
20+
21+
public class DocumentSplittingExample {
22+
23+
private static String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf";
24+
25+
private static class PDFDocument implements Document {
26+
27+
private final byte[] pdf;
28+
29+
private PDFDocument(byte[] pdf) {
30+
this.pdf = pdf;
31+
}
32+
33+
@Override
34+
public Flux<String> getContent() {
35+
try {
36+
PDFParser parser = new PDFParser(
37+
RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf)));
38+
PDDocument document = parser.parse();
39+
String text = new PDFTextStripper().getText(document);
40+
41+
return Flux.just(text);
42+
} catch (IOException e) {
43+
return Flux.error(e);
44+
}
45+
}
46+
}
47+
48+
public static void main(String[] args) throws IOException, InterruptedException {
49+
byte[] pdfBytes = getPdfDoc();
50+
PDFDocument pdfDoc = new PDFDocument(pdfBytes);
51+
52+
Splitter splitter = Splitter
53+
.builder()
54+
.maxParagraphsPerChunk(4)
55+
.overlapNPercent(30.0f)
56+
.trimWhitespace()
57+
.build();
58+
59+
List<Chunk> chunks = splitter
60+
.splitDocument(pdfDoc)
61+
.collectList()
62+
.block();
63+
64+
chunks
65+
.forEach(chunk -> {
66+
System.out.println("=========");
67+
System.out.println(chunk.getContents());
68+
});
69+
}
70+
71+
private static byte[] getPdfDoc() throws IOException, InterruptedException {
72+
HttpResponse<byte[]> doc = HttpClient.newHttpClient()
73+
.send(HttpRequest.newBuilder()
74+
.GET()
75+
.uri(URI.create(BENEFITS_DOC))
76+
.build(),
77+
BodyHandlers.ofByteArray());
78+
return doc.body();
79+
}
80+
81+
}

samples/semantickernel-sample-plugins/pom.xml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version="1.0" encoding="UTF-8" ?>
2-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
34
<modelVersion>4.0.0</modelVersion>
45
<parent>
56
<groupId>com.microsoft.semantic-kernel</groupId>
@@ -15,5 +16,6 @@
1516
<modules>
1617
<module>semantickernel-openapi-plugin</module>
1718
<module>semantickernel-presidio-plugin</module>
19+
<module>semantickernel-text-splitter-plugin</module>
1820
</modules>
1921
</project>
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4+
<modelVersion>4.0.0</modelVersion>
5+
<parent>
6+
<groupId>com.microsoft.semantic-kernel</groupId>
7+
<artifactId>semantickernel-sample-plugins</artifactId>
8+
<version>1.3.1-SNAPSHOT</version>
9+
<relativePath>../pom.xml</relativePath>
10+
</parent>
11+
12+
<artifactId>semantickernel-text-splitter-plugin</artifactId>
13+
<name>semantickernel-text-splitter-plugin</name>
14+
<packaging>jar</packaging>
15+
16+
<dependencyManagement>
17+
<dependencies>
18+
<dependency>
19+
<groupId>com.microsoft.semantic-kernel</groupId>
20+
<artifactId>semantickernel-bom</artifactId>
21+
<version>${project.version}</version>
22+
<type>pom</type>
23+
<scope>import</scope>
24+
</dependency>
25+
</dependencies>
26+
</dependencyManagement>
27+
28+
<dependencies>
29+
<dependency>
30+
<groupId>com.microsoft.semantic-kernel</groupId>
31+
<artifactId>semantickernel-api</artifactId>
32+
</dependency>
33+
34+
<dependency>
35+
<groupId>org.apache.logging.log4j</groupId>
36+
<artifactId>log4j-api</artifactId>
37+
<scope>runtime</scope>
38+
</dependency>
39+
<dependency>
40+
<groupId>org.apache.logging.log4j</groupId>
41+
<artifactId>log4j-core</artifactId>
42+
<scope>runtime</scope>
43+
</dependency>
44+
<dependency>
45+
<groupId>org.apache.logging.log4j</groupId>
46+
<artifactId>log4j-slf4j2-impl</artifactId>
47+
<scope>runtime</scope>
48+
</dependency>
49+
<dependency>
50+
<groupId>com.fasterxml.jackson.core</groupId>
51+
<artifactId>jackson-databind</artifactId>
52+
<scope>compile</scope>
53+
</dependency>
54+
<dependency>
55+
<groupId>com.fasterxml.jackson.core</groupId>
56+
<artifactId>jackson-core</artifactId>
57+
<scope>compile</scope>
58+
</dependency>
59+
<dependency>
60+
<groupId>org.junit.jupiter</groupId>
61+
<artifactId>junit-jupiter-api</artifactId>
62+
<scope>test</scope>
63+
</dependency>
64+
</dependencies>
65+
</project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantic.kernel.rag.splitting;
3+
4+
/**
 * Holds the text of a single chunk.
 */
public class Chunk {

    private final String contents;

    /**
     * Creates a chunk wrapping the given text.
     *
     * @param chunk the text held by this chunk
     */
    public Chunk(String chunk) {
        contents = chunk;
    }

    /**
     * Returns the text held by this chunk.
     *
     * @return the text this chunk was created with
     */
    public String getContents() {
        return this.contents;
    }

}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantic.kernel.rag.splitting;
3+
4+
/**
 * Defines the condition that must be met for a chunk to be considered full.
 */
public interface ChunkEndCondition {

    /**
     * Examines the given text and reports where the FIRST chunk within it ends. This method
     * is called repeatedly until all chunks have been found.
     * <p>
     * Returns -1 if the text does not contain enough characters to form a full chunk.
     *
     * @param value the text to be checked
     * @return the index of the character that marks the end of the first chunk in the text,
     *         or -1 if the text does not yet hold a full chunk
     */
    int getEndOfNextChunk(String value);

}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantic.kernel.rag.splitting;
3+
4+
/**
5+
* A post processor that processes a chunk after it has been split.
6+
*/
7+
public interface ChunkPostProcessor {
8+
Chunk process(Chunk chunk);
9+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantic.kernel.rag.splitting;
3+
4+
import reactor.core.publisher.Flux;
5+
6+
/**
7+
* A document to be read and split into chunks.
8+
*/
9+
public interface Document {
10+
Flux<String> getContent();
11+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
package com.microsoft.semantic.kernel.rag.splitting;
3+
4+
/**
 * Defines how much overlap is allowed between two chunks.
 */
public interface OverlapCondition {

    /**
     * Determines where the overlap begins within the given chunk.
     *
     * @param chunk the chunk to be checked
     * @return the index of the first character that should be considered the beginning of
     *         the overlap
     */
    int getOverlapIndex(String chunk);

}

0 commit comments

Comments
 (0)