Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 86 additions & 58 deletions src/services/tree-sitter/languageParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ async function loadLanguage(langName: string, sourceDirectory?: string) {

let isParserInitialized = false

// Global cache for parser instances to avoid recreating them
const parserInstanceCache: Map<string, ParserT> = new Map()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thread safety consideration: If this code runs in a multi-threaded environment (workers, etc.), these global Map objects could face race conditions. While JavaScript is single-threaded in most contexts, VS Code extensions can use workers.

Is this a concern for the current architecture?

// Global cache for loaded languages to avoid reloading WASM files
const languageCache: Map<string, LanguageT> = new Map()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Memory management concern: These caches will grow indefinitely without any eviction policy. For large codebases with many file types, this could lead to memory issues over time.

Consider:

  1. Adding a maximum cache size with LRU eviction
  2. Providing a way to clear caches when needed
  3. Adding cache statistics for monitoring


/*
Using node bindings for tree-sitter is problematic in vscode extensions
because of incompatibility with electron. Going the .wasm route has the
Expand Down Expand Up @@ -96,134 +101,157 @@ export async function loadRequiredLanguageParsers(filesToParse: string[], source
let query: QueryT
let parserKey = ext // Default to using extension as key

// Determine the language name for caching
let languageName: string
let queryString: string

switch (ext) {
case "js":
case "jsx":
case "json":
language = await loadLanguage("javascript", sourceDirectory)
query = new Query(language, javascriptQuery)
languageName = "javascript"
queryString = javascriptQuery
break
case "ts":
language = await loadLanguage("typescript", sourceDirectory)
query = new Query(language, typescriptQuery)
languageName = "typescript"
queryString = typescriptQuery
break
case "tsx":
language = await loadLanguage("tsx", sourceDirectory)
query = new Query(language, tsxQuery)
languageName = "tsx"
queryString = tsxQuery
break
case "py":
language = await loadLanguage("python", sourceDirectory)
query = new Query(language, pythonQuery)
languageName = "python"
queryString = pythonQuery
break
case "rs":
language = await loadLanguage("rust", sourceDirectory)
query = new Query(language, rustQuery)
languageName = "rust"
queryString = rustQuery
break
case "go":
language = await loadLanguage("go", sourceDirectory)
query = new Query(language, goQuery)
languageName = "go"
queryString = goQuery
break
case "cpp":
case "hpp":
language = await loadLanguage("cpp", sourceDirectory)
query = new Query(language, cppQuery)
languageName = "cpp"
queryString = cppQuery
break
case "c":
case "h":
language = await loadLanguage("c", sourceDirectory)
query = new Query(language, cQuery)
languageName = "c"
queryString = cQuery
break
case "cs":
language = await loadLanguage("c_sharp", sourceDirectory)
query = new Query(language, csharpQuery)
languageName = "c_sharp"
queryString = csharpQuery
break
case "rb":
language = await loadLanguage("ruby", sourceDirectory)
query = new Query(language, rubyQuery)
languageName = "ruby"
queryString = rubyQuery
break
case "java":
language = await loadLanguage("java", sourceDirectory)
query = new Query(language, javaQuery)
languageName = "java"
queryString = javaQuery
break
case "php":
language = await loadLanguage("php", sourceDirectory)
query = new Query(language, phpQuery)
languageName = "php"
queryString = phpQuery
break
case "swift":
language = await loadLanguage("swift", sourceDirectory)
query = new Query(language, swiftQuery)
languageName = "swift"
queryString = swiftQuery
break
case "kt":
case "kts":
language = await loadLanguage("kotlin", sourceDirectory)
query = new Query(language, kotlinQuery)
languageName = "kotlin"
queryString = kotlinQuery
break
case "css":
language = await loadLanguage("css", sourceDirectory)
query = new Query(language, cssQuery)
languageName = "css"
queryString = cssQuery
break
case "html":
language = await loadLanguage("html", sourceDirectory)
query = new Query(language, htmlQuery)
languageName = "html"
queryString = htmlQuery
break
case "ml":
case "mli":
language = await loadLanguage("ocaml", sourceDirectory)
query = new Query(language, ocamlQuery)
languageName = "ocaml"
queryString = ocamlQuery
break
case "scala":
language = await loadLanguage("scala", sourceDirectory)
query = new Query(language, luaQuery) // Temporarily use Lua query until Scala is implemented
languageName = "scala"
queryString = luaQuery // Temporarily use Lua query until Scala is implemented
break
case "sol":
language = await loadLanguage("solidity", sourceDirectory)
query = new Query(language, solidityQuery)
languageName = "solidity"
queryString = solidityQuery
break
case "toml":
language = await loadLanguage("toml", sourceDirectory)
query = new Query(language, tomlQuery)
languageName = "toml"
queryString = tomlQuery
break
case "vue":
language = await loadLanguage("vue", sourceDirectory)
query = new Query(language, vueQuery)
languageName = "vue"
queryString = vueQuery
break
case "lua":
language = await loadLanguage("lua", sourceDirectory)
query = new Query(language, luaQuery)
languageName = "lua"
queryString = luaQuery
break
case "rdl":
language = await loadLanguage("systemrdl", sourceDirectory)
query = new Query(language, systemrdlQuery)
languageName = "systemrdl"
queryString = systemrdlQuery
break
case "tla":
language = await loadLanguage("tlaplus", sourceDirectory)
query = new Query(language, tlaPlusQuery)
languageName = "tlaplus"
queryString = tlaPlusQuery
break
case "zig":
language = await loadLanguage("zig", sourceDirectory)
query = new Query(language, zigQuery)
languageName = "zig"
queryString = zigQuery
break
case "ejs":
case "erb":
parserKey = "embedded_template" // Use same key for both extensions.
language = await loadLanguage("embedded_template", sourceDirectory)
query = new Query(language, embeddedTemplateQuery)
languageName = "embedded_template"
queryString = embeddedTemplateQuery
break
case "el":
language = await loadLanguage("elisp", sourceDirectory)
query = new Query(language, elispQuery)
languageName = "elisp"
queryString = elispQuery
break
case "ex":
case "exs":
language = await loadLanguage("elixir", sourceDirectory)
query = new Query(language, elixirQuery)
languageName = "elixir"
queryString = elixirQuery
break
default:
throw new Error(`Unsupported language: ${ext}`)
}

const parser = new Parser()
parser.setLanguage(language)
// Load language from cache or load it fresh
if (languageCache.has(languageName)) {
language = languageCache.get(languageName)!
} else {
language = await loadLanguage(languageName, sourceDirectory)
languageCache.set(languageName, language)
}

// Create query
query = new Query(language, queryString)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance concern: While you're caching parser instances and languages, Query objects are still being created fresh on every call. Since Query construction can be expensive (it parses and compiles the query string), should we also cache these?


// Reuse parser instance from cache or create new one
let parser: ParserT
if (parserInstanceCache.has(languageName)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Critical issue: The caching logic has a fundamental flaw. Multiple file extensions map to the same language (e.g., .js, .jsx, .json all use "javascript"), but you're caching by languageName while returning parsers keyed by extension.

This means when processing ["test.js", "test.jsx"], the second iteration will reuse the cached parser but still store it under a different key. More importantly, if different extensions need different parser configurations, this approach won't work.

Consider either:

  1. Cache by extension instead of language name
  2. Or ensure the parserKey logic aligns with the caching key

parser = parserInstanceCache.get(languageName)!
} else {
parser = new Parser()
parser.setLanguage(language)
parserInstanceCache.set(languageName, parser)
}

parsers[parserKey] = { parser, query }
}

Expand Down
Loading