@@ -334,7 +334,7 @@ export class ContextTokenization {
}

/**
* Given the existing tokenization and an incoming input `Transform`, this
* Given this existing tokenization and an incoming input `Transform`, this
* method precomputes how both the current, pre-application tokenization will
* be altered and how the incoming Transform will be tokenized.
*
@@ -351,158 +351,7 @@ export class ContextTokenization {
transform: Transform,
edgeOptions?: EdgeWindowOptions
): TokenizationTransitionEdits {
// Step 4: now that our window's been properly updated, determine what the
// input's effects on the context are.
//
// Context does not slide within this function.
//
// Assumption: this alignment cannot fail; we KNOW there's a solid
// before-and-after relationship here, and we can base it on the results of
// a prior syncToSourceWindow call.
//
// We don't wish to do the full tokenization here - we only want to check
// over the last few tokens that might reasonably shift. We also want to
// batch effects.

// Do not mutate the original transform; doing so can cause unexpected
// assertion failures in unit tests.
const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions);
const {
retokenizationText,
editBoundary,
sliceIndex: edgeSliceIndex
} = edgeWindow;
// Prevent mutation of the original return property.
const stackedDeletes = edgeWindow.deleteLengths.slice();

const tokenize = determineModelTokenizer(lexicalModel);
const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
if(postTokenization.length == 0) {
postTokenization.push('');
}
const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);

// What does the edge's retokenization look like when we remove the inserted portions?
const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
const insertBoundaryToken = postTokenization[firstInsertPostIndex];

// Note: requires that helpers have not mutated `stackedInserts`.
const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));

// Do not preserve an empty token here, even if tokenization normally would
// produce one; it's redundant and replaceable for tokenization-batching purposes.
if(uninsertedBoundaryToken != '') {
retokenizedEdge.push(uninsertedBoundaryToken);
}

// We've found the root token within the root context state to which deletes
// (and inserts) may be applied. We've also found the last post-application
// token to which transform changes contributed. How do these indices line up?
// We need to properly construct and index our transforms, but 'merge' and
// 'split' edits can mess up that indexing.

const currentTokens = this.tokens;
const preTokenization = currentTokens
.slice(edgeSliceIndex, editBoundary.tokenIndex+1)
.map(t => t.exampleInput);

// Determine the effects of splits & merges as applied to the original
// cached context state.
const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
preTokenization,
postTokenization.slice(0, firstInsertPostIndex+1)
);

/*
* Final steps: We can now safely index the transforms. Let's do it!
* 1. Determine the first index a Transform may align to
* 2. Build the transforms
*
* Notes:
* - text applied to the end of a 'merged' token at the tail: should have
* index 0, not -1.
* - pretokenization index will mismatch by -1: -SUM(merge size - 1)
* - Ex: can + ' + t => can't
* -1 0 0
* - text applied to the end of a 'split' token at the tail: should also
* have index 0, not 1.
* - posttokenization index will mismatch by +1: SUM(split size - 1)
* - new token after 'split': index 1
* - Ex: can' + ? => can + ' + ?
* 0 -1 0 1
*
* The first transform applies at the end of the retokenized zone and its
* associated index. The question: were there deletes that occurred?
*/

const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
let shiftDeletes = false;
// The last entry - the one popped first - equals 0: a delete no-op.
if(stackedDeletes[stackedDeletes.length - 1] == 0) {
// the boundary indices found by both methods above differ
if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
shiftDeletes = true;
}

// there are no inserts, so we don't affect the boundary token we landed on.
if(stackedDeletes.length > 1 && transform.insert == '') {
shiftDeletes = true;
}
}

if(shiftDeletes) {
// Do not add a zero-length delete if we're not actually altering the
// corresponding token at all.
stackedDeletes.pop();
}

// The first delete always applies to index 0. If the built edge window
// omits a context-final empty-string, adjust the tokenization indices
// accordingly.
const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
// Note: assembleTransforms mutates stackedInserts and stackedDeletes, so
// baseRemovedTokenCount must be computed beforehand.
const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);

// If there's an empty transform at position 0 and we already know we're
// dropping tokens while only deleting, then we're dropping an
// otherwise-untracked empty token - make sure it's included in the count!
const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
// Past that, if we have more delete entries than insert entries for our transforms, we
// dropped some tokens outright.
const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);

// Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split'
// and not caused by transforms. All transforms always apply in sequence at the end.
const unmappedEdits: EditTuple<EditOperation>[] = [];
for(let i = 0; i < editPath.length - transformMap.size; i++) {
const op = editPath[i].op;
switch(op) {
case 'merge':
case 'split':
// already calculated
// can fall through to the `continue;` line.
case 'match':
continue;
default:
// Should only be substitutions here.
// We may wish to add extra analysis in the future when supporting
// prediction from multiple competing tokenizations.
unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
}
}

return {
alignment: {
edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
merges,
splits,
unmappedEdits,
removedTokenCount
},
tokenizedTransform: transformMap,
};
return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions);
}

/**
@@ -763,6 +612,190 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
retokenization: string[];
}

export interface ContextTokenLike {
exampleInput: string;
isPartial?: boolean;
sourceRangeKey?: string;
}
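
// Illustrative sketch (not part of the original change): the minimal shape
// this interface demands - only `exampleInput` is required; the other fields
// are optional.
//
//   const tokens: ContextTokenLike[] = [
//     { exampleInput: 'can' },
//     { exampleInput: "'" }
//   ];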

/**
* Given an existing tokenization and an incoming input `Transform`, this
* function precomputes how both the current, pre-application tokenization will
* be altered and how the incoming Transform will be tokenized.
*
* This function is able to operate with a reduced interface, not requiring
* the full ContextToken/ContextState/etc subsystem and its related
* SearchQuotientNode requirements.
*
* Note that this function is designed for use with languages that employ
* classical space-based wordbreaking. Do not use it for languages that need
* dictionary-based wordbreaking support!
* @param tokens The current tokenization of the context.
* @param lexicalModel The lexical model whose wordbreaker defines token boundaries.
* @param transform The incoming input transform to apply against the context.
* @param edgeOptions Optional settings controlling the retokenization edge window.
* @returns The precomputed tokenization-alignment data and the tokenized transform.
*/
export function mapWhitespacedTokenization(
tokens: ContextTokenLike[],
lexicalModel: LexicalModel,
transform: Transform,
edgeOptions?: EdgeWindowOptions
): TokenizationTransitionEdits {
// Step 4: now that our window's been properly updated, determine what the
// input's effects on the context are.
//
// Context does not slide within this function.
//
// Assumption: this alignment cannot fail; we KNOW there's a solid
// before-and-after relationship here, and we can base it on the results of
// a prior syncToSourceWindow call.
//
// We don't wish to do the full tokenization here - we only want to check
// over the last few tokens that might reasonably shift. We also want to
// batch effects.

// Do not mutate the original transform; doing so can cause unexpected
// assertion failures in unit tests.
const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions);
const {
retokenizationText,
editBoundary,
sliceIndex: edgeSliceIndex
} = edgeWindow;
// Prevent mutation of the original return property.
const stackedDeletes = edgeWindow.deleteLengths.slice();

const tokenize = determineModelTokenizer(lexicalModel);
const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
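// If the edge region tokenizes to nothing at all, fall back to a single empty
// token so the boundary logic below always has a token to align against.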
if(postTokenization.length == 0) {
postTokenization.push('');
}
const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);

// What does the edge's retokenization look like when we remove the inserted portions?
const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
const insertBoundaryToken = postTokenization[firstInsertPostIndex];

// Note: requires that helpers have not mutated `stackedInserts`.
const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));

// Do not preserve an empty token here, even if tokenization normally would
// produce one; it's redundant and replaceable for tokenization-batching purposes.
if(uninsertedBoundaryToken != '') {
retokenizedEdge.push(uninsertedBoundaryToken);
}

// We've found the root token within the root context state to which deletes
// (and inserts) may be applied. We've also found the last post-application
// token to which transform changes contributed. How do these indices line up?
// We need to properly construct and index our transforms, but 'merge' and
// 'split' edits can mess up that indexing.

const currentTokens = tokens;
const preTokenization = currentTokens
.slice(edgeSliceIndex, editBoundary.tokenIndex+1)
.map(t => t.exampleInput);

// Determine the effects of splits & merges as applied to the original
// cached context state.
const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
preTokenization,
postTokenization.slice(0, firstInsertPostIndex+1)
);

/*
* Final steps: We can now safely index the transforms. Let's do it!
* 1. Determine the first index a Transform may align to
* 2. Build the transforms
*
* Notes:
* - text applied to the end of a 'merged' token at the tail: should have
* index 0, not -1.
* - pretokenization index will mismatch by -1: -SUM(merge size - 1)
* - Ex: can + ' + t => can't
* -1 0 0
* - text applied to the end of a 'split' token at the tail: should also
* have index 0, not 1.
* - posttokenization index will mismatch by +1: SUM(split size - 1)
* - new token after 'split': index 1
* - Ex: can' + ? => can + ' + ?
* 0 -1 0 1
*
* The first transform applies at the end of the retokenized zone and its
* associated index. The question: were there deletes that occurred?
*/

const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
let shiftDeletes = false;
// The last entry - the one popped first - equals 0: a delete no-op.
if(stackedDeletes[stackedDeletes.length - 1] == 0) {
// the boundary indices found by both methods above differ
if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
shiftDeletes = true;
}

// there are no inserts, so we don't affect the boundary token we landed on.
if(stackedDeletes.length > 1 && transform.insert == '') {
shiftDeletes = true;
}
}

if(shiftDeletes) {
// Do not add a zero-length delete if we're not actually altering the
// corresponding token at all.
stackedDeletes.pop();
}

// The first delete always applies to index 0. If the built edge window
// omits a context-final empty-string, adjust the tokenization indices
// accordingly.
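// For example (hypothetical values): with two stacked deletes and no omitted
// empty token, tailIndex = 0 - (2 - 1) + 0 = -1, placing the assembled
// transforms at token indices -1 and 0.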
const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
// Note: assembleTransforms mutates stackedInserts and stackedDeletes, so
// baseRemovedTokenCount must be computed beforehand.
const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);

// If there's an empty transform at position 0 and we already know we're
// dropping tokens while only deleting, then we're dropping an
// otherwise-untracked empty token - make sure it's included in the count!
const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
// Past that, if we have more delete entries than insert entries for our transforms, we
// dropped some tokens outright.
const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);
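// Worked example (hypothetical values): three stacked deletes against a single
// insert entry gives baseRemovedTokenCount = 2; if the input is a pure deletion
// and the position-0 transform is empty, removedTokenCount becomes 3.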

// Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split'
// and not caused by transforms. All transforms always apply in sequence at the end.
const unmappedEdits: EditTuple<EditOperation>[] = [];
for(let i = 0; i < editPath.length - transformMap.size; i++) {
const op = editPath[i].op;
switch(op) {
case 'merge':
case 'split':
// already calculated
// can fall through to the `continue;` line.
case 'match':
continue;
default:
// Should only be substitutions here.
// We may wish to add extra analysis in the future when supporting
// prediction from multiple competing tokenizations.
unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
}
}

return {
alignment: {
edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
merges,
splits,
unmappedEdits,
removedTokenCount
},
tokenizedTransform: transformMap,
};
}
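
// Usage sketch (hypothetical values; assumes a lexical model that relies on
// default whitespace wordbreaking, per the note above):
//
//   const tokens: ContextTokenLike[] = [
//     { exampleInput: 'can' },
//     { exampleInput: "'" }
//   ];
//   const edits = mapWhitespacedTokenization(
//     tokens, model, { insert: "t", deleteLeft: 0 }
//   );
//   // edits.alignment.merges would reflect "can" + "'" + "t" => "can't",
//   // mirroring the merge example documented in the function body;
//   // edits.tokenizedTransform maps token-relative indices to per-token
//   // transforms.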

/**
* Constructs a window on one side of the represented context that is aligned to
* existing tokenization.
@@ -777,7 +810,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
* @returns
*/
export function buildEdgeWindow(
currentTokens: ContextToken[],
currentTokens: ContextTokenLike[],
// Requires deleteRight be explicitly set.
transform: Transform & { deleteRight: number },
applyAtFront: boolean,