Skip to content

Commit 65debd2

Browse files
committed
Speed up tokenization for heavy workflows
* Support tokenization concurrently
* Return headwords when tokenizing
* Use an LRU cache for tokenization
* Fix a refactoring bug
* Maintain separation between parse results
* Add an array index to results

<rikaitan.link>ZDgyNjg0ZDliNzQ2ZGE2MGFkYjBlMjhkZWM1ZjRhNDkxNGRhNjhjMQo=</rikaitan.link>
1 parent 8132c57 commit 65debd2

File tree

5 files changed

+123
-38
lines changed

5 files changed

+123
-38
lines changed

ext/js/background/backend.js

Lines changed: 80 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import {OptionsUtil} from '../data/options-util.js';
3434
import {getAllPermissions, hasPermissions, hasRequiredPermissionsForOptions} from '../data/permissions-util.js';
3535
import {DictionaryDatabase} from '../dictionary/dictionary-database.js';
3636
import {Environment} from '../extension/environment.js';
37+
import {CacheMap} from '../general/cache-map.js';
3738
import {ObjectPropertyAccessor} from '../general/object-property-accessor.js';
3839
import {distributeFuriganaInflected, isCodePointJapanese, convertKatakanaToHiragana as jpConvertKatakanaToHiragana} from '../language/ja/japanese.js';
3940
import {getLanguageSummaries, isTextLookupWorthy} from '../language/languages.js';
@@ -208,6 +209,8 @@ export class Backend {
208209

209210
/** @type {RikaitanApi} */
210211
this._rikaitanApi = new RikaitanApi(this._apiMap, this._offscreen);
212+
/** @type {CacheMap<string, {originalTextLength: number, textSegments: import('api').ParseTextSegment[]}>} */
213+
this._textParseCache = new CacheMap(10000, 3600000); // 1 hour idle time, ~32MB per 1000 entries for Japanese
211214
}
212215

213216
/**
@@ -558,33 +561,46 @@ export class Backend {
558561

559562
/** @type {import('api').ApiHandler<'parseText'>} */
560563
async _onApiParseText({text, optionsContext, scanLength, useInternalParser, useMecabParser}) {
561-
const [internalResults, mecabResults] = await Promise.all([
562-
(useInternalParser ? this._textParseScanning(text, scanLength, optionsContext) : null),
563-
(useMecabParser ? this._textParseMecab(text) : null),
564-
]);
565-
566564
/** @type {import('api').ParseTextResultItem[]} */
567565
const results = [];
568566

569-
if (internalResults !== null) {
570-
results.push({
571-
id: 'scan',
572-
source: 'scanning-parser',
573-
dictionary: null,
574-
content: internalResults,
575-
});
576-
}
567+
const [internalResults, mecabResults] = await Promise.all([
568+
useInternalParser ?
569+
(Array.isArray(text) ?
570+
Promise.all(text.map((t) => this._textParseScanning(t, scanLength, optionsContext))) :
571+
Promise.all([this._textParseScanning(text, scanLength, optionsContext)])) :
572+
null,
573+
useMecabParser ?
574+
(Array.isArray(text) ?
575+
Promise.all(text.map((t) => this._textParseMecab(t))) :
576+
Promise.all([this._textParseMecab(text)])) :
577+
null,
578+
]);
577579

578-
if (mecabResults !== null) {
579-
for (const [dictionary, content] of mecabResults) {
580+
if (internalResults !== null) {
581+
for (const [index, internalResult] of internalResults.entries()) {
580582
results.push({
581-
id: `mecab-${dictionary}`,
582-
source: 'mecab',
583-
dictionary,
584-
content,
583+
id: 'scan',
584+
source: 'scanning-parser',
585+
dictionary: null,
586+
index,
587+
content: internalResult,
585588
});
586589
}
587590
}
591+
if (mecabResults !== null) {
592+
for (const [index, mecabResult] of mecabResults.entries()) {
593+
for (const [dictionary, content] of mecabResult) {
594+
results.push({
595+
id: `mecab-${dictionary}`,
596+
source: 'mecab',
597+
dictionary,
598+
index,
599+
content,
600+
});
601+
}
602+
}
603+
}
588604

589605
return results;
590606
}
@@ -1496,6 +1512,8 @@ export class Backend {
14961512

14971513
void this._accessibilityController.update(this._getOptionsFull(false));
14981514

1515+
this._textParseCache.clear();
1516+
14991517
this._sendMessageAllTabsIgnoreResponse({action: 'applicationOptionsUpdated', params: {source}});
15001518
}
15011519

@@ -1677,25 +1695,54 @@ export class Backend {
16771695
let i = 0;
16781696
const ii = text.length;
16791697
while (i < ii) {
1680-
const {dictionaryEntries, originalTextLength} = await this._translator.findTerms(
1681-
mode,
1682-
text.substring(i, i + scanLength),
1683-
findTermsOptions,
1684-
);
16851698
const codePoint = /** @type {number} */ (text.codePointAt(i));
16861699
const character = String.fromCodePoint(codePoint);
1687-
if (
1688-
dictionaryEntries.length > 0 &&
1700+
const substring = text.substring(i, i + scanLength);
1701+
const cacheKey = `${optionsContext.index}:${substring}`;
1702+
let cached = this._textParseCache.get(cacheKey);
1703+
if (typeof cached === 'undefined') {
1704+
const {dictionaryEntries, originalTextLength} = await this._translator.findTerms(
1705+
mode,
1706+
substring,
1707+
findTermsOptions,
1708+
);
1709+
/** @type {import('api').ParseTextSegment[]} */
1710+
const textSegments = [];
1711+
if (dictionaryEntries.length > 0 &&
16891712
originalTextLength > 0 &&
16901713
(originalTextLength !== character.length || isCodePointJapanese(codePoint))
1691-
) {
1692-
previousUngroupedSegment = null;
1693-
const {headwords: [{term, reading}]} = dictionaryEntries[0];
1694-
const source = text.substring(i, i + originalTextLength);
1695-
const textSegments = [];
1696-
for (const {text: text2, reading: reading2} of distributeFuriganaInflected(term, reading, source)) {
1697-
textSegments.push({text: text2, reading: reading2});
1714+
) {
1715+
const {headwords: [{term, reading}]} = dictionaryEntries[0];
1716+
const source = substring.substring(0, originalTextLength);
1717+
for (const {text: text2, reading: reading2} of distributeFuriganaInflected(term, reading, source)) {
1718+
textSegments.push({text: text2, reading: reading2});
1719+
}
1720+
if (textSegments.length > 0) {
1721+
const token = textSegments.map((s) => s.text).join('');
1722+
const trimmedHeadwords = [];
1723+
for (const dictionaryEntry of dictionaryEntries) {
1724+
const validHeadwords = [];
1725+
for (const headword of dictionaryEntry.headwords) {
1726+
const validSources = [];
1727+
for (const src of headword.sources) {
1728+
if (src.originalText !== token) { continue; }
1729+
if (!src.isPrimary) { continue; }
1730+
if (src.matchType !== 'exact') { continue; }
1731+
validSources.push(src);
1732+
}
1733+
if (validSources.length > 0) { validHeadwords.push({term: headword.term, reading: headword.reading, sources: validSources}); }
1734+
}
1735+
if (validHeadwords.length > 0) { trimmedHeadwords.push(validHeadwords); }
1736+
}
1737+
textSegments[0].headwords = trimmedHeadwords;
1738+
}
16981739
}
1740+
cached = {originalTextLength, textSegments};
1741+
if (typeof optionsContext.index !== 'undefined') { this._textParseCache.set(cacheKey, cached); }
1742+
}
1743+
const {originalTextLength, textSegments} = cached;
1744+
if (textSegments.length > 0) {
1745+
previousUngroupedSegment = null;
16991746
results.push(textSegments);
17001747
i += originalTextLength;
17011748
} else {

ext/js/comm/rikaitan-api.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,8 @@ export class RikaitanApi {
240240
/** @type {import('rikaitan-api.js').tokenizeInput} */
241241
// @ts-expect-error - Allow this to error
242242
const {text, scanLength} = parsedBody;
243-
if (typeof text !== 'string') {
244-
throw new Error('Invalid input for tokenize, expected "text" to be a string but got ' + typeof text);
243+
if (typeof text !== 'string' && !Array.isArray(text)) {
244+
throw new Error('Invalid input for tokenize, expected "text" to be a string or a string array but got ' + typeof text);
245245
}
246246
if (typeof scanLength !== 'number') {
247247
throw new Error('Invalid input for tokenize, expected "scanLength" to be a number but got ' + typeof scanLength);

ext/js/general/cache-map.js

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,30 @@ export class CacheMap {
2626
/**
2727
* Creates a new CacheMap.
2828
* @param {number} maxSize The maximum number of entries able to be stored in the cache.
29+
* @param {number} [maxIdleTime=0] The maximum idle time (ms) before the cache is automatically cleared.
2930
*/
30-
constructor(maxSize) {
31+
constructor(maxSize, maxIdleTime = 0) {
3132
if (!(
3233
Number.isFinite(maxSize) &&
3334
maxSize >= 0 &&
3435
Math.floor(maxSize) === maxSize
3536
)) {
3637
throw new Error('Invalid maxCount');
3738
}
39+
if (!(
40+
Number.isFinite(maxIdleTime) &&
41+
maxIdleTime >= 0 &&
42+
Math.floor(maxIdleTime) === maxIdleTime
43+
)) {
44+
throw new Error('Invalid maxIdleTime');
45+
}
3846

3947
/** @type {number} */
4048
this._maxSize = maxSize;
49+
/** @type {number} */
50+
this._maxIdleTime = maxIdleTime;
51+
/** @type {?import('core').Timeout} */
52+
this._idleTimeout = null;
4153
/** @type {Map<K, import('cache-map').Node<K, V>>} */
4254
this._map = new Map();
4355
/** @type {import('cache-map').Node<K, V>} */
@@ -116,6 +128,16 @@ export class CacheMap {
116128
clear() {
117129
this._map.clear();
118130
this._resetEndNodes();
131+
this.clearIdleTimeout();
132+
}
133+
134+
/**
135+
* Clears the idle timeout.
136+
*/
137+
clearIdleTimeout() {
138+
if (this._idleTimeout === null) { return; }
139+
clearTimeout(this._idleTimeout);
140+
this._idleTimeout = null;
119141
}
120142

121143
// Private
@@ -142,6 +164,7 @@ export class CacheMap {
142164
* @param {import('cache-map').Node<K, V>} previous
143165
*/
144166
_addNode(node, previous) {
167+
this._resetIdleTimeout();
145168
const next = previous.next;
146169
node.next = next;
147170
node.previous = previous;
@@ -164,4 +187,13 @@ export class CacheMap {
164187
this._listFirst.next = this._listLast;
165188
this._listLast.previous = this._listFirst;
166189
}
190+
191+
/**
192+
* @returns {void}
193+
*/
194+
_resetIdleTimeout() {
195+
if (this._maxIdleTime <= 0) { return; }
196+
this.clearIdleTimeout();
197+
this._idleTimeout = setTimeout(() => this.clear(), this._maxIdleTime);
198+
}
167199
}

types/ext/api.d.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
/**
 * One parser's output for a single input string passed to `parseText`.
 */
export type ParseTextResultItem = {
    /** Parser result identifier: 'scan' for the scanning parser, or `mecab-<dictionary>` for MeCab. */
    id: string;
    source: 'scanning-parser' | 'mecab';
    /** MeCab dictionary name; null for the scanning parser. */
    dictionary: null | string;
    /** Position of the originating string in the `text` input array; 0 when a single string was passed. */
    index: number;
    content: ParseTextLine[];
};

6162
export type ParseTextSegment = {
6263
text: string;
6364
reading: string;
65+
headwords?: {
66+
term: string;
67+
reading: string;
68+
sources: Dictionary.TermSource[];
69+
}[][];
6470
};
6571

6672
export type ParseTextLine = ParseTextSegment[];
@@ -141,7 +147,7 @@ type ApiSurface = {
141147
};
142148
parseText: {
143149
params: {
144-
text: string;
150+
text: string | string[];
145151
optionsContext: Settings.OptionsContext;
146152
scanLength: number;
147153
useInternalParser: boolean;

types/ext/rikaitan-api.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ export type ankiFieldsInput = {
3232
};
3333

3434
export type tokenizeInput = {
    /** Text to tokenize: a single string, or an array of strings parsed concurrently. */
    text: string | string[];
    /** Maximum number of characters scanned per lookup. */
    scanLength: number;
};
3838

0 commit comments

Comments
 (0)