Skip to content

Commit 1802c46

Browse files
authored
feat: add ideographic and fullwidth punctuation to splitter (#192)
1 parent 523f583 commit 1802c46

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

packages/indexer/src/lib/document-processor.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ import { type BaseLogger } from 'pino';
22
import { getBlobNameFromFile } from './blob-storage.js';
33
import { type ContentPage, type ContentSection, type Section } from './document.js';
44

5-
const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
6-
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
5+
const SENTENCE_ENDINGS = new Set(['.', '。', '.', '!', '?', '‼', '⁇', '⁈', '⁉']);
6+
const WORD_BREAKS = new Set([',', '、', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
77
const MAX_SECTION_LENGTH = 1000;
88
const SENTENCE_SEARCH_LIMIT = 100;
99
const SECTION_OVERLAP = 100;

0 commit comments

Comments
 (0)