diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index fc2f81615c1..95d3125c2a9 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -334,7 +334,7 @@ export class ContextTokenization { } /** - * Given the existing tokenization and an incoming input `Transform`, this + * Given this existing tokenization and an incoming input `Transform`, this * method precomputes how both the current, pre-application tokenization will * be altered and how the incoming Transform will be tokenized. * @@ -351,158 +351,7 @@ export class ContextTokenization { transform: Transform, edgeOptions?: EdgeWindowOptions ): TokenizationTransitionEdits { - // Step 4: now that our window's been properly updated, determine what the - // input's effects on the context is. - // - // Context does not slide within this function. - // - // Assumption: this alignment cannot fail; we KNOW there's a solid - // before-and-after relationship here, and we can base it on the results of - // a prior syncToSourceWindow call. - // - // We don't wish to do the full tokenization here - we only want to check - // over the last few tokens that might reasonably shift. We also want to - // batch effects. - - // Do not mutate the original transform; it can cause unexpected assertion - // effects in unit tests. - const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; - const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions); - const { - retokenizationText, - editBoundary, - sliceIndex: edgeSliceIndex - } = edgeWindow; - // Prevent mutation of the original return property. 
- const stackedDeletes = edgeWindow.deleteLengths.slice(); - - const tokenize = determineModelTokenizer(lexicalModel); - const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); - if(postTokenization.length == 0) { - postTokenization.push(''); - } - const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); - - // What does the edge's retokenization look like when we remove the inserted portions? - const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); - const insertBoundaryToken = postTokenization[firstInsertPostIndex]; - - // Note: requires that helpers have not mutated `stackedInserts`. - const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); - - // Do not preserve empty tokens here, even if tokenization normally would produce one. - // It's redundant and replaceable for tokenization batching efforts. - if(uninsertedBoundaryToken != '') { - retokenizedEdge.push(uninsertedBoundaryToken); - } - - // We've found the root token within the root context state to which deletes (and inserts) - // may be applied. - // We've also found the last post-application token to which transform changes contributed. - // How do these indices line up - we need to properly construct and index our transforms, - // but 'merge' and 'split' edits can mess up that indexing. - - const currentTokens = this.tokens; - const preTokenization = currentTokens - .slice(edgeSliceIndex, editBoundary.tokenIndex+1) - .map(t => t.exampleInput); - - // Determine the effects of splits & merges as applied to the original - // cached context state. - const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( - preTokenization, - postTokenization.slice(0, firstInsertPostIndex+1) - ); - - /* - * Final steps: We can now safely index the transforms. 
Let's do it! - * 1. Determine the first index a Transform may align to - * 2. Build the transforms - * - * Notes: - * - text applied to the end of a 'merged' token at the tail: should have - * index 0, not -1. - * - pretokenization index will mismatch by -1: -SUM(merge size - 1) - * - Ex: can + ' + t => can't - * -1 0 0 - * - text applied to the end of a 'split' token at the tail: should also - * have index 0, not 1. - * - posttokenization index will mismatch by +1: SUM(split size - 1) - * - new token after 'split': index 1 - * - Ex: can' + ? => can + ' + ? - * 0 -1 0 1 - * - * The first transform applies at the end of the retokenized zone and its - * associated index. The question: were there deletes that occurred? - */ - - const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; - let shiftDeletes = false; - // first popped entry == 0 - a delete no-op. - if(stackedDeletes[stackedDeletes.length - 1] == 0) { - // the boundary indices found by both methods above differ - if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { - shiftDeletes = true; - } - - // there are no inserts, so we don't affect the boundary token we landed on. - if(stackedDeletes.length > 1 && transform.insert == '') { - shiftDeletes = true; - } - } - - if(shiftDeletes) { - // Do not add a zero-length delete if we're not actually altering the - // corresponding token at all. - stackedDeletes.pop(); - } - - // The first delete always applies to index 0. If the built edge window - // omits a context-final empty-string, adjust the tokenization indices - // accordingly. - const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); - // Mutates stackedInserts, stackedDeletes. 
- const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); - const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); - - // If there's an empty transform in the 0 position and we already know we're - // dropping tokens - and only deleting - we're dropping an - // otherwise-untracked empty token - make sure it's included! - const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); - // Past that, if we have more delete entries than insert entries for our transforms, we - // dropped some tokens outright. - const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); - - // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' - // and not caused by transforms. All transforms always apply in sequence at the end. - const unmappedEdits: EditTuple[] = []; - for(let i = 0; i < editPath.length - transformMap.size; i++) { - const op = editPath[i].op; - switch(op) { - case 'merge': - case 'split': - // already calculated - // can fall through to the `continue;` line. - case 'match': - continue; - default: - // Should only be substitutions here. - // We may wish to add extra analysis in the future when supporting - // prediction from multiple competing tokenizations. 
- unmappedEdits.push(editPath[i] as EditTuple); - } - } - - return { - alignment: { - edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, - merges, - splits, - unmappedEdits, - removedTokenCount - }, - tokenizedTransform: transformMap, - }; + return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions); } /** @@ -763,6 +612,190 @@ interface RetokenizedEdgeWindow extends EdgeWindow { retokenization: string[]; } +export interface ContextTokenLike { + exampleInput: string; + isPartial?: boolean; + sourceRangeKey?: string; +} + +/** + * Given an existing tokenization and an incoming input `Transform`, this + * function precomputes both how the current, pre-application tokenization will + * be altered and how the incoming Transform will be tokenized. + * + * This function is able to operate with a reduced interface, not requiring + * the full ContextToken/ContextState/etc subsystem and its related + * SearchQuotientNode requirements. + * + * Note that this function is designed for use with languages that employ + * classical space-based wordbreaking. Do not use it for languages that need + * dictionary-based wordbreaking support! + * @param tokens + * @param lexicalModel + * @param transform + * @param edgeOptions + * @returns + */ +export function mapWhitespacedTokenization( + tokens: ContextTokenLike[], + lexicalModel: LexicalModel, + transform: Transform, + edgeOptions?: EdgeWindowOptions +): TokenizationTransitionEdits { + // Step 4: now that our window's been properly updated, determine what the + // input's effects on the context are. + // + // Context does not slide within this function. + // + // Assumption: this alignment cannot fail; we KNOW there's a solid + // before-and-after relationship here, and we can base it on the results of + // a prior syncToSourceWindow call. + // + // We don't wish to do the full tokenization here - we only want to check + // over the last few tokens that might reasonably shift.
We also want to + // batch effects. + + // Do not mutate the original transform; it can cause unexpected assertion + // effects in unit tests. + const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0}; + const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions); + const { + retokenizationText, + editBoundary, + sliceIndex: edgeSliceIndex + } = edgeWindow; + // Prevent mutation of the original return property. + const stackedDeletes = edgeWindow.deleteLengths.slice(); + + const tokenize = determineModelTokenizer(lexicalModel); + const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text); + if(postTokenization.length == 0) { + postTokenization.push(''); + } + const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform); + + // What does the edge's retokenization look like when we remove the inserted portions? + const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex); + const insertBoundaryToken = postTokenization[firstInsertPostIndex]; + + // Note: requires that helpers have not mutated `stackedInserts`. + const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0])); + + // Do not preserve empty tokens here, even if tokenization normally would produce one. + // It's redundant and replaceable for tokenization batching efforts. + if(uninsertedBoundaryToken != '') { + retokenizedEdge.push(uninsertedBoundaryToken); + } + + // We've found the root token within the root context state to which deletes (and inserts) + // may be applied. + // We've also found the last post-application token to which transform changes contributed. + // How do these indices line up - we need to properly construct and index our transforms, + // but 'merge' and 'split' edits can mess up that indexing. 
+ + const currentTokens = tokens; + const preTokenization = currentTokens + .slice(edgeSliceIndex, editBoundary.tokenIndex+1) + .map(t => t.exampleInput); + + // Determine the effects of splits & merges as applied to the original + // cached context state. + const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits( + preTokenization, + postTokenization.slice(0, firstInsertPostIndex+1) + ); + + /* + * Final steps: We can now safely index the transforms. Let's do it! + * 1. Determine the first index a Transform may align to + * 2. Build the transforms + * + * Notes: + * - text applied to the end of a 'merged' token at the tail: should have + * index 0, not -1. + * - pretokenization index will mismatch by -1: -SUM(merge size - 1) + * - Ex: can + ' + t => can't + * -1 0 0 + * - text applied to the end of a 'split' token at the tail: should also + * have index 0, not 1. + * - posttokenization index will mismatch by +1: SUM(split size - 1) + * - new token after 'split': index 1 + * - Ex: can' + ? => can + ' + ? + * 0 -1 0 1 + * + * The first transform applies at the end of the retokenized zone and its + * associated index. The question: were there deletes that occurred? + */ + + const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex; + let shiftDeletes = false; + // first popped entry == 0 - a delete no-op. + if(stackedDeletes[stackedDeletes.length - 1] == 0) { + // the boundary indices found by both methods above differ + if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) { + shiftDeletes = true; + } + + // there are no inserts, so we don't affect the boundary token we landed on. + if(stackedDeletes.length > 1 && transform.insert == '') { + shiftDeletes = true; + } + } + + if(shiftDeletes) { + // Do not add a zero-length delete if we're not actually altering the + // corresponding token at all. + stackedDeletes.pop(); + } + + // The first delete always applies to index 0. 
If the built edge window + // omits a context-final empty-string, adjust the tokenization indices + // accordingly. + const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0); + // Mutates stackedInserts, stackedDeletes. + const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length); + const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex); + + // If there's an empty transform in the 0 position and we already know we're + // dropping tokens - and only deleting - we're dropping an + // otherwise-untracked empty token - make sure it's included! + const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0)); + // Past that, if we have more delete entries than insert entries for our transforms, we + // dropped some tokens outright. + const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0); + + // Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split' + // and not caused by transforms. All transforms always apply in sequence at the end. + const unmappedEdits: EditTuple[] = []; + for(let i = 0; i < editPath.length - transformMap.size; i++) { + const op = editPath[i].op; + switch(op) { + case 'merge': + case 'split': + // already calculated + // can fall through to the `continue;` line. + case 'match': + continue; + default: + // Should only be substitutions here. + // We may wish to add extra analysis in the future when supporting + // prediction from multiple competing tokenizations. + unmappedEdits.push(editPath[i] as EditTuple); + } + } + + return { + alignment: { + edgeWindow: {...edgeWindow, retokenization: retokenizedEdge}, + merges, + splits, + unmappedEdits, + removedTokenCount + }, + tokenizedTransform: transformMap, + }; +} + /** * Constructs a window on one side of the represented context that is aligned to * existing tokenization. 
@@ -777,7 +810,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow { * @returns */ export function buildEdgeWindow( - currentTokens: ContextToken[], + currentTokens: ContextTokenLike[], // Requires deleteRight be explicitly set. transform: Transform & { deleteRight: number }, applyAtFront: boolean,