Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/aws-lsp-codewhisperer-runtimes/scripts/package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ done
cd ./build/private/bundle
for config in "${configs[@]}"; do
cd ${config}
cp $START_DIR/../../server/aws-lsp-codewhisperer/src/shared/fileProcessingWorker.js ./fileProcessingWorker.js
zip -r ../../../../$ARCHIVES_DIR/${config}/win-x64/servers.zip .
zip -r ../../../../$ARCHIVES_DIR/${config}/linux-x64/servers.zip .
zip -r ../../../../$ARCHIVES_DIR/${config}/mac-x64/servers.zip .
Expand Down
1 change: 1 addition & 0 deletions app/aws-lsp-codewhisperer-runtimes/webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ const webworkerConfig = {
tls: false,
http2: false,
buffer: require.resolve('buffer/'),
worker_threads: false,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confused about this...wouldn't this turn off worker threads?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No — the worker thread file is copied into the servers folder by package.sh; we are not putting it in the browser bundle via webpack. This will help resolve a CI failure in the webpack build.

},
extensions: ['.ts', '.tsx', '.js', '.jsx'],
},
Expand Down
83 changes: 83 additions & 0 deletions server/aws-lsp-codewhisperer/src/shared/fileProcessingWorker.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
const { parentPort } = require('worker_threads')
const fs = require('fs')

// Files accepted for indexing so far, deduplicated across all batches.
const uniqueFiles = new Set()
// Count of files skipped because their size meets or exceeds maxFileSize.
let filesExceedingMaxSize = 0
// Per-file size cap in bytes; assigned when the 'init' message arrives.
let maxFileSize
// Remaining total index budget in bytes; decremented as files are accepted.
let remainingIndexSize

/**
 * Returns the lowercase extension of a file path, without the dot.
 * Paths with no dot, a trailing dot, or dotfiles such as ".gitignore"
 * (a single leading dot and nothing else) yield the empty string.
 */
function getFileExtensionName(filepath) {
    if (!filepath) {
        return ''
    }
    const lastDot = filepath.lastIndexOf('.')
    // No dot at all, or a dot as the final character => no extension.
    if (lastDot === -1 || lastDot === filepath.length - 1) {
        return ''
    }
    // A lone leading dot marks a dotfile, not an extension.
    if (lastDot === 0) {
        return ''
    }
    return filepath.slice(lastDot + 1).toLowerCase()
}

// Message protocol:
//   'init'         -> configure size limits, reply 'ready'
//   'processBatch' -> filter/size-check a batch of paths, reply 'batchComplete'
//                     (or a final 'result' with reachedLimit=true if the index
//                     budget is exhausted mid-batch)
//   'complete'     -> reply final 'result' with reachedLimit=false
// Any unexpected type or thrown error is reported as an 'error' message.
parentPort.on('message', message => {
    const { type, data } = message

    try {
        if (type === 'init') {
            // Convert the MB limits supplied by the controller into bytes.
            const { maxFileSizeMB, maxIndexSizeMB } = data
            const MB_TO_BYTES = 1024 * 1024
            maxFileSize = maxFileSizeMB * MB_TO_BYTES
            remainingIndexSize = maxIndexSizeMB * MB_TO_BYTES
            parentPort.postMessage({ type: 'ready' })
        } else if (type === 'processBatch') {
            const { files, fileExtensions } = data
            // Build a Set once per batch so each file's extension check is O(1)
            // instead of an O(m) Array.prototype.includes scan (batches can hold
            // 10K files, so the array scan was an accidental O(n*m) hot path).
            const allowedExtensions = new Set(fileExtensions)

            for (const file of files) {
                const fileExtName = '.' + getFileExtensionName(file)
                if (!uniqueFiles.has(file) && allowedExtensions.has(fileExtName)) {
                    try {
                        // NOTE(review): statSync blocks this worker's event loop;
                        // fs.promises.stat could overlap I/O — TODO evaluate.
                        const fileSize = fs.statSync(file).size
                        if (fileSize < maxFileSize) {
                            if (remainingIndexSize > fileSize) {
                                uniqueFiles.add(file)
                                remainingIndexSize -= fileSize
                            } else {
                                // Index budget exhausted: short-circuit with the
                                // final result so the controller can stop early.
                                parentPort.postMessage({
                                    type: 'result',
                                    data: {
                                        files: [...uniqueFiles],
                                        filesExceedingMaxSize,
                                        reachedLimit: true,
                                    },
                                })
                                return
                            }
                        } else {
                            filesExceedingMaxSize++
                        }
                    } catch (error) {
                        // Skip files that can't be accessed
                    }
                }
            }

            parentPort.postMessage({ type: 'batchComplete' })
        } else if (type === 'complete') {
            parentPort.postMessage({
                type: 'result',
                data: {
                    files: [...uniqueFiles],
                    filesExceedingMaxSize,
                    reachedLimit: false,
                },
            })
        } else {
            parentPort.postMessage({
                type: 'error',
                error: `Unknown message type: ${type}`,
            })
        }
    } catch (error) {
        parentPort.postMessage({
            type: 'error',
            error: error.message,
        })
    }
})
Original file line number Diff line number Diff line change
Expand Up @@ -404,10 +404,119 @@ export class LocalProjectContextController {
}

this.log.info(`Processing ${workspaceFolders.length} workspace folders...`)
const startTime = Date.now()

maxFileSizeMB = Math.min(maxFileSizeMB ?? Infinity, this.DEFAULT_MAX_FILE_SIZE_MB)
maxIndexSizeMB = Math.min(maxIndexSizeMB ?? Infinity, this.DEFAULT_MAX_INDEX_SIZE_MB)

try {
const { Worker } = await import('worker_threads')
const workerPath = path.join(__dirname, 'fileProcessingWorker.js')

if (!fs.existsSync(workerPath)) {
throw new Error(`Worker file not found: ${workerPath}`)
}

this.log.info(`Processing ${workspaceFolders.length} workspace folders in worker thread`)
const worker = new Worker(workerPath)

return await new Promise<string[]>((resolve, reject) => {
const timeout = setTimeout(() => {
void worker.terminate()
reject(new Error('Worker timeout after 5 minutes'))
}, 300_000)

let batchesInProgress = 0

worker.on('message', msg => {
if (msg.type === 'ready') {
// Worker initialized, start sending batches
sendBatches().catch(reject)
} else if (msg.type === 'batchComplete') {
batchesInProgress--
} else if (msg.type === 'result') {
clearTimeout(timeout)
void worker.terminate()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try and catch for terminate failures/errors

const { files, filesExceedingMaxSize, reachedLimit } = msg.data
const duration = Date.now() - startTime
if (reachedLimit) {
this.log.info(
`Reaching max file collection size limit ${maxIndexSizeMB} MB. ${files.length} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB (took ${duration}ms)`
)
} else {
this.log.info(
`ProcessWorkspaceFolders complete. ${files.length} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB (took ${duration}ms using worker thread)`
)
}
resolve(files)
} else if (msg.type === 'error') {
clearTimeout(timeout)
void worker.terminate()
reject(new Error(msg.error))
}
})

worker.on('error', err => {
clearTimeout(timeout)
void worker.terminate()
reject(err)
})

async function sendBatches() {
const BATCH_SIZE = 10000

for (const folder of workspaceFolders!) {
const folderPath = path.resolve(URI.parse(folder.uri).fsPath)
const filesInFolder = await listFilesWithGitignore(folderPath)

for (let i = 0; i < filesInFolder.length; i += BATCH_SIZE) {
const batch = filesInFolder.slice(i, i + BATCH_SIZE)
batchesInProgress++
worker.postMessage({
type: 'processBatch',
data: { files: batch, fileExtensions },
})

// Wait if too many batches in progress
while (batchesInProgress > 5) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to constants:

private readonly BATCH_SIZE = 10000;
private readonly MAX_CONCURRENT_BATCHES = 5;
private readonly WORKER_TIMEOUT_MS = 300_000;

await sleep(10)
}
}
}

// Wait for all batches to complete
while (batchesInProgress > 0) {
await sleep(10)
}

worker.postMessage({ type: 'complete' })
}

worker.postMessage({
type: 'init',
data: { maxFileSizeMB, maxIndexSizeMB },
})
})
} catch (error) {
this.log.warn(`Worker thread failed, falling back to main thread: ${error}`)
const result = await this.processWorkspaceFoldersFallback(
workspaceFolders,
maxFileSizeMB,
maxIndexSizeMB,
fileExtensions
)
const duration = Date.now() - startTime
this.log.info(`Processing completed in ${duration}ms (fallback)`)
return result
}
}

private async processWorkspaceFoldersFallback(
workspaceFolders: WorkspaceFolder[],
maxFileSizeMB: number,
maxIndexSizeMB: number,
fileExtensions?: string[]
): Promise<string[]> {
const sizeConstraints: SizeConstraints = {
maxFileSize: maxFileSizeMB * this.MB_TO_BYTES,
remainingIndexSize: maxIndexSizeMB * this.MB_TO_BYTES,
Expand All @@ -429,7 +538,7 @@ export class LocalProjectContextController {
sizeConstraints.remainingIndexSize = sizeConstraints.remainingIndexSize - fileSize
} else {
this.log.info(
`Reaching max file collection size limit ${this.maxIndexSizeMB} MB. ${uniqueFilesToIndex.size} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB `
`Reaching max file collection size limit ${maxIndexSizeMB} MB. ${uniqueFilesToIndex.size} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB `
)
return [...uniqueFilesToIndex]
}
Expand All @@ -446,7 +555,7 @@ export class LocalProjectContextController {
}

this.log.info(
`ProcessWorkspaceFolders complete. ${uniqueFilesToIndex.size} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB`
`ProcessWorkspaceFolders complete. ${uniqueFilesToIndex.size} files found. ${filesExceedingMaxSize} files exceeded ${maxFileSizeMB} MB (fallback)`
)
return [...uniqueFilesToIndex]
}
Expand Down
Loading