Skip to content

Commit 321c182

Browse files
Zaczero and rchiodo authored
Avoid cloneStr for small strings and intern tokenizer identifiers (#11267)
Co-authored-by: Rich Chiodo <rchiodo@users.noreply.github.com>
1 parent 74ec0f5 commit 321c182

File tree

2 files changed

+25
-3
lines changed

2 files changed

+25
-3
lines changed

packages/pyright-internal/src/common/core.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ export function cloneStr(str: string): string {
190190
// to ensure we get a copy of the string to prevent the original string from being retained in memory.
191191
// For example, the import resolution cache in importResolver might hold onto the full original file content
192192
// because seemingly innocent the import name (e.g., `foo` in `import foo`) is in the cache.
193+
194+
// V8 uses a SlicedString representation for substrings only above a small length threshold (currently 13),
195+
// so short strings can be returned as-is without retaining the original text in memory.
196+
// https://github.com/v8/v8/blob/02558d5a88c8f06ff064e3b6b332f342e1ab6143/src/objects/string.h#L1054
197+
if (str.length < 13) {
198+
return str;
199+
}
200+
193201
return Buffer.from(str, 'utf8').toString('utf8');
194202
}
195203

packages/pyright-internal/src/parser/tokenizer.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,11 @@ export class Tokenizer {
255255
// Assume Jupyter notebook tokenization rules?
256256
private _useNotebookMode = false;
257257

258+
// Intern identifier strings within a single tokenization pass. This reduces
259+
// per-identifier allocations while still ensuring we don't retain substrings
260+
// that reference the original source text.
261+
private readonly _identifierInternedStrings = new Map<string, string>();
262+
258263
tokenize(
259264
text: string,
260265
start?: number,
@@ -284,6 +289,7 @@ export class Tokenizer {
284289
this._lineRanges = [];
285290
this._indentAmounts = [];
286291
this._useNotebookMode = useNotebookMode;
292+
this._identifierInternedStrings.clear();
287293

288294
const end = start + length;
289295

@@ -905,20 +911,28 @@ export class Tokenizer {
905911

906912
if (this._cs.position > start) {
907913
const value = this._cs.getText().slice(start, this._cs.position);
908-
if (_keywords.has(value)) {
914+
const keywordType = _keywords.get(value);
915+
if (keywordType !== undefined) {
909916
this._tokens.push(
910-
KeywordToken.create(start, this._cs.position - start, _keywords.get(value)!, this._getComments())
917+
KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments())
911918
);
912919
} else {
920+
const internedValue = this._identifierInternedStrings.get(value) ?? this._internIdentifierString(value);
913921
this._tokens.push(
914-
IdentifierToken.create(start, this._cs.position - start, cloneStr(value), this._getComments())
922+
IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments())
915923
);
916924
}
917925
return true;
918926
}
919927
return false;
920928
}
921929

930+
private _internIdentifierString(value: string) {
    // Copy the identifier text first so the interned entry never keeps a
    // SlicedString reference to the original source buffer alive, then
    // remember it for reuse across the rest of this tokenization pass.
    const interned = cloneStr(value);
    this._identifierInternedStrings.set(interned, interned);
    return interned;
}
935+
922936
private _isPossibleNumber(): boolean {
923937
if (isDecimal(this._cs.currentChar)) {
924938
return true;

0 commit comments

Comments (0)