We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent ccd457b commit 09a29eaCopy full SHA for 09a29ea
src/semchunk/semchunk.py
@@ -23,10 +23,12 @@
23
"""A map of token counters to their memoized versions."""
24
25
_NON_WHITESPACE_SEMANTIC_SPLITTERS = (
26
+ # Sentence terminators.
27
".",
28
"?",
29
"!",
- "*", # Sentence terminators.
30
+ "*",
31
+ # Clause separators.
32
";",
33
",",
34
"(",
@@ -39,15 +41,17 @@
39
41
"’",
40
42
"'",
43
'"',
- "`", # Clause separators.
44
+ "`",
45
+ # Sentence interrupters.
46
":",
47
"—",
- "…", # Sentence interrupters.
48
+ "…",
49
+ # Word joiners.
50
"/",
51
"\\",
52
"–",
53
"&",
- "-", # Word joiners.
54
+ "-",
55
)
56
"""A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""
57
0 commit comments