Skip to content

Commit af0839d

Browse files
authored
Merge pull request #77 from Azure-Samples/tonybaloney-patch-1
Add ideographic and full-width punctuation to splitter
2 parents 20138ff + 166ffda commit af0839d

File tree

1 file changed

+7
-0
lines changed
  • app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser

1 file changed

+7
-0
lines changed

app/indexer/core/src/main/java/com/microsoft/openai/samples/indexer/parser/TextSplitter.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,18 @@ public class TextSplitter {
1717
public TextSplitter(boolean verbose) {
1818
this.sentenceEndings = new ArrayList<>();
1919
this.sentenceEndings.add(".");
20+
this.sentenceEndings.add("。");
21+
this.sentenceEndings.add("．");
2022
this.sentenceEndings.add("!");
2123
this.sentenceEndings.add("?");
24+
this.sentenceEndings.add("‼");
25+
this.sentenceEndings.add("⁇");
26+
this.sentenceEndings.add("⁈");
27+
this.sentenceEndings.add("⁉");
2228

2329
this.wordBreaks = new ArrayList<>();
2430
this.wordBreaks.add(",");
31+
this.wordBreaks.add("、");
2532
this.wordBreaks.add(";");
2633
this.wordBreaks.add(":");
2734
this.wordBreaks.add(" ");

0 commit comments

Comments
 (0)