@@ -334,7 +334,7 @@ export class ContextTokenization {
}

/**
* Given the existing tokenization and an incoming input `Transform`, this
* Given this existing tokenization and an incoming input `Transform`, this
* method precomputes how both the current, pre-application tokenization will
* be altered and how the incoming Transform will be tokenized.
*
@@ -351,158 +351,7 @@ export class ContextTokenization {
transform: Transform,
edgeOptions?: EdgeWindowOptions
): TokenizationTransitionEdits {
// Step 4: now that our window's been properly updated, determine what the
// input's effects on the context are.
//
// Context does not slide within this function.
//
// Assumption: this alignment cannot fail; we KNOW there's a solid
// before-and-after relationship here, and we can base it on the results of
// a prior syncToSourceWindow call.
//
// We don't wish to do the full tokenization here - we only want to check
// over the last few tokens that might reasonably shift. We also want to
// batch effects.

// Do not mutate the original transform; doing so can cause unexpected
// assertion failures in unit tests.
const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
const edgeWindow = buildEdgeWindow(this.tokens, edgeTransform, false, edgeOptions);
const {
retokenizationText,
editBoundary,
sliceIndex: edgeSliceIndex
} = edgeWindow;
// Prevent mutation of the original return property.
const stackedDeletes = edgeWindow.deleteLengths.slice();

const tokenize = determineModelTokenizer(lexicalModel);
const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
if(postTokenization.length == 0) {
postTokenization.push('');
}
const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);

// What does the edge's retokenization look like when we remove the inserted portions?
const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
const insertBoundaryToken = postTokenization[firstInsertPostIndex];

// Note: requires that helpers have not mutated `stackedInserts`.
const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));

// Do not preserve an empty token here, even if tokenization normally would
// produce one; it's redundant and replaceable for tokenization-batching purposes.
if(uninsertedBoundaryToken != '') {
retokenizedEdge.push(uninsertedBoundaryToken);
}

// We've found the root token within the root context state to which deletes
// (and inserts) may be applied. We've also found the last post-application
// token to which transform changes contributed. How do these indices line up?
// We need to properly construct and index our transforms, but 'merge' and
// 'split' edits can mess up that indexing.

const currentTokens = this.tokens;
const preTokenization = currentTokens
.slice(edgeSliceIndex, editBoundary.tokenIndex+1)
.map(t => t.exampleInput);

// Determine the effects of splits & merges as applied to the original
// cached context state.
const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
preTokenization,
postTokenization.slice(0, firstInsertPostIndex+1)
);

/*
* Final steps: We can now safely index the transforms. Let's do it!
* 1. Determine the first index a Transform may align to
* 2. Build the transforms
*
* Notes:
* - text applied to the end of a 'merged' token at the tail: should have
* index 0, not -1.
* - pretokenization index will mismatch by -1: -SUM(merge size - 1)
* - Ex: can + ' + t => can't
* -1 0 0
* - text applied to the end of a 'split' token at the tail: should also
* have index 0, not 1.
* - posttokenization index will mismatch by +1: SUM(split size - 1)
* - new token after 'split': index 1
* - Ex: can' + ? => can + ' + ?
* 0 -1 0 1
*
* The first transform applies at the end of the retokenized zone and its
* associated index. The question: were there deletes that occurred?
*/

const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
let shiftDeletes = false;
// The last entry - the one popped first - equals 0: a delete no-op.
if(stackedDeletes[stackedDeletes.length - 1] == 0) {
// the boundary indices found by both methods above differ
if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
shiftDeletes = true;
}

// there are no inserts, so we don't affect the boundary token we landed on.
if(stackedDeletes.length > 1 && transform.insert == '') {
shiftDeletes = true;
}
}

if(shiftDeletes) {
// Do not add a zero-length delete if we're not actually altering the
// corresponding token at all.
stackedDeletes.pop();
}

// The first delete always applies to index 0. If the built edge window
// omits a context-final empty-string, adjust the tokenization indices
// accordingly.
const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
// Note: assembleTransforms mutates stackedInserts and stackedDeletes, so
// baseRemovedTokenCount must be computed beforehand.
const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);

// If there's an empty transform at position 0 and we already know we're
// dropping tokens while only deleting, then we're dropping an
// otherwise-untracked empty token - make sure it's included in the count!
const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
// Past that, if we have more delete entries than insert entries for our transforms, we
// dropped some tokens outright.
const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);

// Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split'
// and not caused by transforms. All transforms always apply in sequence at the end.
const unmappedEdits: EditTuple<EditOperation>[] = [];
for(let i = 0; i < editPath.length - transformMap.size; i++) {
const op = editPath[i].op;
switch(op) {
case 'merge':
case 'split':
// already calculated
// can fall through to the `continue;` line.
case 'match':
continue;
default:
// Should only be substitutions here.
// We may wish to add extra analysis in the future when supporting
// prediction from multiple competing tokenizations.
unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
}
}

return {
alignment: {
edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
merges,
splits,
unmappedEdits,
removedTokenCount
},
tokenizedTransform: transformMap,
};
return mapWhitespacedTokenization(this.tokens, lexicalModel, transform, edgeOptions);
}

/**
@@ -763,6 +612,190 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
retokenization: string[];
}

export interface ContextTokenLike {
exampleInput: string;
isPartial?: boolean;
sourceRangeKey?: string;
}
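
// Illustrative sketch (not part of the original change): the minimal shape
// this interface demands - only `exampleInput` is required; the other fields
// are optional.
//
//   const tokens: ContextTokenLike[] = [
//     { exampleInput: 'can' },
//     { exampleInput: "'" }
//   ];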

/**
* Given an existing tokenization and an incoming input `Transform`, this
* function precomputes how both the current, pre-application tokenization will
* be altered and how the incoming Transform will be tokenized.
*
* This function is able to operate with a reduced interface, not requiring
* the full ContextToken/ContextState/etc subsystem and its related
* SearchQuotientNode requirements.
*
* Note that this function is designed for use with languages that employ
* classical space-based wordbreaking. Do not use it for languages that need
* dictionary-based wordbreaking support!
* @param tokens The current tokenization of the context.
* @param lexicalModel The lexical model whose wordbreaker defines token boundaries.
* @param transform The incoming input transform to apply against the context.
* @param edgeOptions Optional settings controlling the retokenization edge window.
* @returns The precomputed tokenization-alignment data and the tokenized transform.
*/
export function mapWhitespacedTokenization(
tokens: ContextTokenLike[],
lexicalModel: LexicalModel,
transform: Transform,
edgeOptions?: EdgeWindowOptions
): TokenizationTransitionEdits {
// Step 4: now that our window's been properly updated, determine what the
// input's effects on the context are.
//
// Context does not slide within this function.
//
// Assumption: this alignment cannot fail; we KNOW there's a solid
// before-and-after relationship here, and we can base it on the results of
// a prior syncToSourceWindow call.
//
// We don't wish to do the full tokenization here - we only want to check
// over the last few tokens that might reasonably shift. We also want to
// batch effects.

// Do not mutate the original transform; doing so can cause unexpected
// assertion failures in unit tests.
const edgeTransform = {...transform, deleteRight: transform.deleteRight || 0};
const edgeWindow = buildEdgeWindow(tokens, edgeTransform, false, edgeOptions);
const {
retokenizationText,
editBoundary,
sliceIndex: edgeSliceIndex
} = edgeWindow;
// Prevent mutation of the original return property.
const stackedDeletes = edgeWindow.deleteLengths.slice();

const tokenize = determineModelTokenizer(lexicalModel);
const postTokenization = tokenize({left: retokenizationText + transform.insert, startOfBuffer: true, endOfBuffer: true}).left.map(t => t.text);
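// If the edge region tokenizes to nothing at all, fall back to a single empty
// token so the boundary logic below always has a token to align against.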
if(postTokenization.length == 0) {
postTokenization.push('');
}
const { stackedInserts, firstInsertPostIndex } = traceInsertEdits(postTokenization, transform);

// What does the edge's retokenization look like when we remove the inserted portions?
const retokenizedEdge = postTokenization.slice(0, firstInsertPostIndex);
const insertBoundaryToken = postTokenization[firstInsertPostIndex];

// Note: requires that helpers have not mutated `stackedInserts`.
const uninsertedBoundaryToken = KMWString.substring(insertBoundaryToken, 0, KMWString.lastIndexOf(insertBoundaryToken, stackedInserts[0]));

// Do not preserve an empty token here, even if tokenization normally would
// produce one; it's redundant and replaceable for tokenization-batching purposes.
if(uninsertedBoundaryToken != '') {
retokenizedEdge.push(uninsertedBoundaryToken);
}

// We've found the root token within the root context state to which deletes
// (and inserts) may be applied. We've also found the last post-application
// token to which transform changes contributed. How do these indices line up?
// We need to properly construct and index our transforms, but 'merge' and
// 'split' edits can mess up that indexing.

const currentTokens = tokens;
const preTokenization = currentTokens
.slice(edgeSliceIndex, editBoundary.tokenIndex+1)
.map(t => t.exampleInput);

// Determine the effects of splits & merges as applied to the original
// cached context state.
const { mergeOffset, splitOffset, editPath, merges, splits } = analyzePathMergesAndSplits(
preTokenization,
postTokenization.slice(0, firstInsertPostIndex+1)
);

/*
* Final steps: We can now safely index the transforms. Let's do it!
* 1. Determine the first index a Transform may align to
* 2. Build the transforms
*
* Notes:
* - text applied to the end of a 'merged' token at the tail: should have
* index 0, not -1.
* - pretokenization index will mismatch by -1: -SUM(merge size - 1)
* - Ex: can + ' + t => can't
* -1 0 0
* - text applied to the end of a 'split' token at the tail: should also
* have index 0, not 1.
* - posttokenization index will mismatch by +1: SUM(split size - 1)
* - new token after 'split': index 1
* - Ex: can' + ? => can + ' + ?
* 0 -1 0 1
*
* The first transform applies at the end of the retokenized zone and its
* associated index. The question: were there deletes that occurred?
*/

const lastEditedPreTokenIndex = editBoundary.tokenIndex - edgeSliceIndex;
let shiftDeletes = false;
// The last entry - the one popped first - equals 0: a delete no-op.
if(stackedDeletes[stackedDeletes.length - 1] == 0) {
// the boundary indices found by both methods above differ
if(lastEditedPreTokenIndex + mergeOffset != firstInsertPostIndex + splitOffset) {
shiftDeletes = true;
}

// there are no inserts, so we don't affect the boundary token we landed on.
if(stackedDeletes.length > 1 && transform.insert == '') {
shiftDeletes = true;
}
}

if(shiftDeletes) {
// Do not add a zero-length delete if we're not actually altering the
// corresponding token at all.
stackedDeletes.pop();
}

// The first delete always applies to index 0. If the built edge window
// omits a context-final empty-string, adjust the tokenization indices
// accordingly.
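// For example (hypothetical values): with two stacked deletes and no omitted
// empty token, tailIndex = 0 - (2 - 1) + 0 = -1, placing the assembled
// transforms at token indices -1 and 0.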
const tailIndex = 0 - (stackedDeletes.length - 1) + (editBoundary.omitsEmptyToken ? -1 : 0);
const baseRemovedTokenCount = Math.max(0, stackedDeletes.length - stackedInserts.length);
// Note: assembleTransforms mutates stackedInserts and stackedDeletes, so
// baseRemovedTokenCount must be computed beforehand.
const transformMap = assembleTransforms(stackedInserts, stackedDeletes, tailIndex);

// If there's an empty transform at position 0 and we already know we're
// dropping tokens while only deleting, then we're dropping an
// otherwise-untracked empty token - make sure it's included in the count!
const droppedFinalTransform = baseRemovedTokenCount > 0 && transform.insert == '' && TransformUtils.isEmpty(transformMap.get(0));
// Past that, if we have more delete entries than insert entries for our transforms, we
// dropped some tokens outright.
const removedTokenCount = baseRemovedTokenCount + (droppedFinalTransform ? 1 : 0);
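// Worked example (hypothetical values): three stacked deletes against a single
// insert entry gives baseRemovedTokenCount = 2; if the input is a pure deletion
// and the position-0 transform is empty, removedTokenCount becomes 3.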

// Final step: check for any unexpected boundary shifts not mappable to 'merge' / 'split'
// and not caused by transforms. All transforms always apply in sequence at the end.
const unmappedEdits: EditTuple<EditOperation>[] = [];
for(let i = 0; i < editPath.length - transformMap.size; i++) {
const op = editPath[i].op;
switch(op) {
case 'merge':
case 'split':
// already calculated
// can fall through to the `continue;` line.
case 'match':
continue;
default:
// Should only be substitutions here.
// We may wish to add extra analysis in the future when supporting
// prediction from multiple competing tokenizations.
unmappedEdits.push(editPath[i] as EditTuple<EditOperation>);
}
}

return {
alignment: {
edgeWindow: {...edgeWindow, retokenization: retokenizedEdge},
merges,
splits,
unmappedEdits,
removedTokenCount
},
tokenizedTransform: transformMap,
};
}
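
// Usage sketch (hypothetical values; assumes a lexical model that relies on
// default whitespace wordbreaking, per the note above):
//
//   const tokens: ContextTokenLike[] = [
//     { exampleInput: 'can' },
//     { exampleInput: "'" }
//   ];
//   const edits = mapWhitespacedTokenization(
//     tokens, model, { insert: "t", deleteLeft: 0 }
//   );
//   // edits.alignment.merges would reflect "can" + "'" + "t" => "can't",
//   // mirroring the merge example documented in the function body;
//   // edits.tokenizedTransform maps token-relative indices to per-token
//   // transforms.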

/**
* Constructs a window on one side of the represented context that is aligned to
* existing tokenization.
@@ -777,7 +810,7 @@ interface RetokenizedEdgeWindow extends EdgeWindow {
* @returns
*/
export function buildEdgeWindow(
currentTokens: ContextToken[],
currentTokens: ContextTokenLike[],
// Requires deleteRight be explicitly set.
transform: Transform & { deleteRight: number },
applyAtFront: boolean,