Skip to content

Commit ba29dc5

Browse files
committed
merged main
2 parents 2fccef4 + cf7a2bd commit ba29dc5

File tree

4 files changed

+347
-306
lines changed

4 files changed

+347
-306
lines changed

.github/workflows/nightly-jobs.yaml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
name: Nightly Jobs
1+
name: Nightly and CICD Jobs
22

33
on:
44
pull_request:
@@ -61,6 +61,12 @@ jobs:
6161
java-version: '17'
6262
cache: 'maven'
6363

64+
- name: Verify Indexer project
65+
run: |
66+
echo "Testing indexer project."
67+
cd ./app/indexer
68+
./mvnw test
69+
6470
- name: Build Spring Boot App
6571
run: |
6672
echo "Building Spring Boot app."

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,18 @@ public class TextSplitter {
1717
public TextSplitter(boolean verbose) {
1818
this.sentenceEndings = new ArrayList<>();
1919
this.sentenceEndings.add(".");
20+
this.sentenceEndings.add("。");
21+
this.sentenceEndings.add("．");
2022
this.sentenceEndings.add("!");
2123
this.sentenceEndings.add("?");
24+
this.sentenceEndings.add("‼");
25+
this.sentenceEndings.add("⁇");
26+
this.sentenceEndings.add("⁈");
27+
this.sentenceEndings.add("⁉");
2228

2329
this.wordBreaks = new ArrayList<>();
2430
this.wordBreaks.add(",");
31+
this.wordBreaks.add("、");
2532
this.wordBreaks.add(";");
2633
this.wordBreaks.add(":");
2734
this.wordBreaks.add(" ");
@@ -49,6 +56,12 @@ public List<SplitPage> splitPages(List<Page> pages) {
4956
int length = allText.length();
5057
int start = 0;
5158
int end = length;
59+
60+
if (length <= maxSectionLength) {
61+
splitPages.add(new SplitPage(findPage(start, pages), allText.toString()));
62+
return splitPages;
63+
}
64+
5265
while (start + sectionOverlap < length) {
5366
int lastWord = -1;
5467
end = start + maxSectionLength;

app/indexer/core/src/test/java/com/microsoft/openai/samples/indexer/parser/TextSplitterTest.java

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
package com.microsoft.openai.samples.indexer.parser;
2+
3+
import com.microsoft.openai.samples.indexer.SplitPage;
4+
import org.junit.jupiter.api.Test;
5+
6+
import java.util.List;
7+
8+
import static org.junit.jupiter.api.Assertions.*;
9+
10+
class TextSplitterTest {
11+
12+
@Test
13+
void testSplitTinyPages() {
14+
List<Page> testPages = List.of(new Page[]{
15+
new Page(1, 0, "hello, world")
16+
});
17+
TextSplitter splitter = new TextSplitter(false);
18+
List<SplitPage> result = splitter.splitPages(testPages);
19+
assertEquals(1, result.size());
20+
assertEquals("hello, world", result.get(0).getText());
21+
}
22+
}

0 commit comments

Comments
 (0)