Skip to content

Commit 321c182

Browse files
Zaczero and rchiodo authored
Avoid cloneStr for small strings and intern tokenizer identifiers (#11267)
Co-authored-by: Rich Chiodo <rchiodo@users.noreply.github.com>
1 parent 74ec0f5 commit 321c182

File tree

2 files changed

+25
-3
lines changed

2 files changed

+25
-3
lines changed

packages/pyright-internal/src/common/core.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ export function cloneStr(str: string): string {
190190
// to ensure we get a copy of the string to prevent the original string from being retained in memory.
191191
// For example, the import resolution cache in importResolver might hold onto the full original file content
192192
// because seemingly innocent the import name (e.g., `foo` in `import foo`) is in the cache.
193+
194+
// V8 uses a SlicedString representation for substrings only above a small length threshold (currently 13),
195+
// so short strings can be returned as-is without retaining the original text in memory.
196+
// https://github.com/v8/v8/blob/02558d5a88c8f06ff064e3b6b332f342e1ab6143/src/objects/string.h#L1054
197+
if (str.length < 13) {
198+
return str;
199+
}
200+
193201
return Buffer.from(str, 'utf8').toString('utf8');
194202
}
195203

packages/pyright-internal/src/parser/tokenizer.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,11 @@ export class Tokenizer {
255255
// Assume Jupyter notebook tokenization rules?
256256
private _useNotebookMode = false;
257257

258+
// Intern identifier strings within a single tokenization pass. This reduces
259+
// per-identifier allocations while still ensuring we don't retain substrings
260+
// that reference the original source text.
261+
private readonly _identifierInternedStrings = new Map<string, string>();
262+
258263
tokenize(
259264
text: string,
260265
start?: number,
@@ -284,6 +289,7 @@ export class Tokenizer {
284289
this._lineRanges = [];
285290
this._indentAmounts = [];
286291
this._useNotebookMode = useNotebookMode;
292+
this._identifierInternedStrings.clear();
287293

288294
const end = start + length;
289295

@@ -905,20 +911,28 @@ export class Tokenizer {
905911

906912
if (this._cs.position > start) {
907913
const value = this._cs.getText().slice(start, this._cs.position);
908-
if (_keywords.has(value)) {
914+
const keywordType = _keywords.get(value);
915+
if (keywordType !== undefined) {
909916
this._tokens.push(
910-
KeywordToken.create(start, this._cs.position - start, _keywords.get(value)!, this._getComments())
917+
KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments())
911918
);
912919
} else {
920+
const internedValue = this._identifierInternedStrings.get(value) ?? this._internIdentifierString(value);
913921
this._tokens.push(
914-
IdentifierToken.create(start, this._cs.position - start, cloneStr(value), this._getComments())
922+
IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments())
915923
);
916924
}
917925
return true;
918926
}
919927
return false;
920928
}
921929

930+
private _internIdentifierString(value: string) {
    // Copy the identifier text first so the interned entry never keeps a
    // SlicedString reference to the original source buffer alive, then
    // remember it for reuse across the rest of this tokenization pass.
    const interned = cloneStr(value);
    this._identifierInternedStrings.set(interned, interned);
    return interned;
}
935+
922936
private _isPossibleNumber(): boolean {
923937
if (isDecimal(this._cs.currentChar)) {
924938
return true;

0 commit comments

Comments (0)