Skip to content

Commit b327603

Browse files
committed
fix: improve splitText efficiency
1 parent 96e632d commit b327603

File tree

2 files changed

+174
-23
lines changed

2 files changed

+174
-23
lines changed

src/splitText.ts

Lines changed: 116 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,90 @@ export function splitText<const S extends string[]>(text: string, separators: S)
77
if (separators.length === 0)
88
return [text];
99

10-
const [firstSeparator, ...otherSeparators] = separators;
10+
const separatorsFindTree = separatorsToFindTree(separators);
11+
const activeChecks: SeparatorCheck[] = [];
12+
const textIndexToLowestSeparatorIndex = new Map<TextIndex, SeparatorIndex>();
13+
const finalPartRanges: Array<{
14+
textStartIndex: TextIndex,
15+
separatorIndex: SeparatorIndex,
16+
conflictedBy: TextIndex[]
17+
}> = [];
1118

12-
let res: (string | Separator<S[number]>)[] = [];
19+
for (let i = 0; i < text.length; i++) {
20+
const char = text[i];
1321

14-
let lastSplitIndex = 0;
15-
let splitIndex = text.indexOf(firstSeparator);
22+
activeChecks.unshift({
23+
currentNode: separatorsFindTree,
24+
startIndex: i,
25+
conflictedBy: []
26+
});
27+
let rangeToAdd: (typeof finalPartRanges[number]) | null = null;
1628

17-
while (splitIndex >= 0) {
18-
res.push(text.slice(lastSplitIndex, splitIndex));
19-
res.push(Separator._create(firstSeparator));
29+
for (let j = 0; j < activeChecks.length; j++) {
30+
const activeCheck = activeChecks[j];
31+
const nextNode = activeCheck.currentNode.next.get(char);
2032

21-
lastSplitIndex = splitIndex + firstSeparator.length;
22-
splitIndex = text.indexOf(firstSeparator, lastSplitIndex);
33+
if (nextNode == null) {
34+
activeChecks.splice(j, 1);
35+
j--;
36+
continue;
37+
}
38+
39+
if (nextNode.separator != null && nextNode.separatorIndex != null) {
40+
if (!textIndexToLowestSeparatorIndex.has(activeCheck.startIndex) ||
41+
nextNode.separatorIndex < textIndexToLowestSeparatorIndex.get(activeCheck.startIndex)!
42+
)
43+
textIndexToLowestSeparatorIndex.set(activeCheck.startIndex, nextNode.separatorIndex);
44+
45+
if (rangeToAdd == null || nextNode.separatorIndex < rangeToAdd.separatorIndex)
46+
rangeToAdd = {
47+
textStartIndex: activeCheck.startIndex,
48+
separatorIndex: nextNode.separatorIndex,
49+
conflictedBy: activeCheck.conflictedBy.slice()
50+
};
51+
}
52+
53+
activeCheck.currentNode = nextNode;
54+
}
55+
56+
if (rangeToAdd != null) {
57+
if (activeChecks.length > 0) {
58+
for (const activeCheck of activeChecks) {
59+
rangeToAdd.conflictedBy.push(activeCheck.startIndex);
60+
activeCheck.conflictedBy.push(rangeToAdd.textStartIndex);
61+
}
62+
}
63+
64+
finalPartRanges.push(rangeToAdd);
65+
}
2366
}
2467

25-
if (lastSplitIndex < text.length)
26-
res.push(text.slice(lastSplitIndex));
27-
28-
if (otherSeparators.length > 0) {
29-
res = res
30-
.filter((item) => item !== "")
31-
.flatMap((item) => {
32-
if (typeof item === "string")
33-
return splitText(item, otherSeparators);
34-
else
35-
return item;
36-
});
68+
const res: (string | Separator<S[number]>)[] = [];
69+
let lastEndIndex = 0;
70+
71+
for (const range of finalPartRanges) {
72+
const isConflicted = range.conflictedBy.some((textIndex) => {
73+
const conflictedByIndexSeparatorIndex = textIndexToLowestSeparatorIndex.get(textIndex);
74+
75+
if (conflictedByIndexSeparatorIndex == null)
76+
return false;
77+
78+
return conflictedByIndexSeparatorIndex < range.separatorIndex;
79+
});
80+
81+
if (isConflicted)
82+
continue;
83+
84+
if (lastEndIndex < range.textStartIndex)
85+
res.push(text.slice(lastEndIndex, range.textStartIndex));
86+
87+
res.push(Separator._create(separators[range.separatorIndex]));
88+
lastEndIndex = range.textStartIndex + separators[range.separatorIndex].length;
3789
}
3890

91+
if (lastEndIndex < text.length)
92+
res.push(text.slice(lastEndIndex));
93+
3994
if (res.length === 0 && text.length === 0)
4095
res.push("");
4196

@@ -54,3 +109,43 @@ export class Separator<S extends string> {
54109
return new Separator(separator);
55110
}
56111
}
112+
113+
/**
 * Builds a character trie (prefix tree) from the given separators.
 *
 * Each separator is inserted one `separator[j]` element (a UTF-16 code unit) at a time,
 * so separators that share a prefix share the same path of nodes.
 * The node reached after the last character of a separator is tagged with that
 * separator's text and its index in `separators`.
 *
 * @param separators - the separator strings, in priority order as passed by the caller
 * @returns the root node of the trie; the root itself corresponds to the empty prefix
 */
function separatorsToFindTree(separators: string[]): SeparatorFindNode {
114+
const root: SeparatorFindNode = {next: new Map()};
115+
116+
for (let i = 0; i < separators.length; i++) {
117+
const separator = separators[i];
118+
// every separator is inserted starting from the shared root
let node = root;
119+
120+
for (let j = 0; j < separator.length; j++) {
121+
const char = separator[j];
122+
123+
// create the child node lazily the first time this character follows the current prefix
if (!node.next.has(char))
124+
node.next.set(char, {next: new Map()});
125+
126+
node = node.next.get(char)!;
127+
}
128+
129+
// only tag an untagged node: when the same separator string appears more than once,
// the first occurrence (lowest index) keeps ownership of the node.
// NOTE(review): an empty-string separator never enters the inner loop, so it tags the root node itself
if (node.separator == null) {
130+
node.separator = separator;
131+
node.separatorIndex = i;
132+
}
133+
}
134+
135+
return root;
136+
}
137+
138+
/** Position of a separator inside the `separators` array (assigned from the loop index in `separatorsToFindTree`). */
type SeparatorIndex = number;
139+
/** Position inside the text being split (used for check start positions and slice boundaries). */
type TextIndex = number;
140+
/** A single string element as obtained by indexing a string (`text[i]` / `separator[j]`); used as the trie edge key. */
type Char = string;
141+
/**
 * A node of the separator trie built by `separatorsToFindTree`.
 * `separator` and `separatorIndex` are assigned together, and only on a node
 * where some separator ends; on other nodes both stay undefined.
 */
type SeparatorFindNode = {
142+
// the full separator string that ends at this node, if any
separator?: string,
143+
// index of that separator in the original `separators` array
separatorIndex?: SeparatorIndex,
144+
// child nodes, keyed by the next character
next: Map<Char, SeparatorFindNode>
145+
};
146+
147+
/**
 * State of one in-progress match attempt while scanning the text:
 * a cursor into the separator trie plus the text position where the attempt began.
 */
type SeparatorCheck = {
148+
// trie node reached so far; replaced with the child node as each text character is consumed
currentNode: SeparatorFindNode,
149+
// text index at which this match attempt started
readonly startIndex: TextIndex,
150+
// start indices of completed separator ranges recorded while this check was still active;
// `readonly` only prevents reassigning the property - the array itself is still pushed to
readonly conflictedBy: TextIndex[]
151+
};

test/splitText.test.ts

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,65 @@ describe("splitText", () => {
77

88
expect(res).toEqual([
99
"Hello ",
10-
new Separator("<and>"),
10+
Separator._create("<and>"),
1111
" world ",
12-
new Separator("[then]"),
12+
Separator._create("[then]"),
13+
" !"
14+
]);
15+
});
16+
17+
test("splitting text prioritizes first match", () => {
18+
const res1 = splitText("Hello <and> world [then] !", ["<and>", "<and> worl", "[then]"]);
19+
expect(res1).toEqual([
20+
"Hello ",
21+
Separator._create("<and>"),
22+
" world ",
23+
Separator._create("[then]"),
24+
" !"
25+
]);
26+
27+
const res2 = splitText("Hello <and> world [then] !", ["<and> worl", "<and>", "[then]"]);
28+
expect(res2).toEqual([
29+
"Hello ",
30+
Separator._create("<and> worl"),
31+
"d ",
32+
Separator._create("[then]"),
33+
" !"
34+
]);
35+
36+
const res3 = splitText("Hello <and> world [then] !", ["<and>", "llo <and>", "[then]"]);
37+
expect(res3).toEqual([
38+
"Hello ",
39+
Separator._create("<and>"),
40+
" world ",
41+
Separator._create("[then]"),
42+
" !"
43+
]);
44+
45+
const res4 = splitText("Hello <and> world [then] !", ["llo <and>", "<and>", "[then]"]);
46+
expect(res4).toEqual([
47+
"He",
48+
Separator._create("llo <and>"),
49+
" world ",
50+
Separator._create("[then]"),
51+
" !"
52+
]);
53+
54+
const res5 = splitText("Hello <and> world [then] !", ["llo <and> w", "<and>", "[then]"]);
55+
expect(res5).toEqual([
56+
"He",
57+
Separator._create("llo <and> w"),
58+
"orld ",
59+
Separator._create("[then]"),
60+
" !"
61+
]);
62+
63+
const res6 = splitText("Hello <and> world [then] !", ["<and>", "llo <and> w", "[then]"]);
64+
expect(res6).toEqual([
65+
"Hello ",
66+
Separator._create("<and>"),
67+
" world ",
68+
Separator._create("[then]"),
1369
" !"
1470
]);
1571
});

0 commit comments

Comments
 (0)