diff --git a/packages/core/src/amazonq/util/files.ts b/packages/core/src/amazonq/util/files.ts index b7c25c7f887..802cc29ea3e 100644 --- a/packages/core/src/amazonq/util/files.ts +++ b/packages/core/src/amazonq/util/files.ts @@ -88,7 +88,7 @@ export async function prepareRepoData( } const files = await collectFiles(repoRootPaths, workspaceFolders, { - maxSizeBytes: maxRepoSizeBytes, + maxTotalSizeBytes: maxRepoSizeBytes, excludeByGitIgnore: true, excludePatterns: excludePatterns, filterFn: filterFn, diff --git a/packages/core/src/codewhisperer/util/zipUtil.ts b/packages/core/src/codewhisperer/util/zipUtil.ts index 47ed91909bf..00ee0ae053d 100644 --- a/packages/core/src/codewhisperer/util/zipUtil.ts +++ b/packages/core/src/codewhisperer/util/zipUtil.ts @@ -420,7 +420,7 @@ export class ZipUtil { ) : vscode.workspace.workspaceFolders) as CurrentWsFolders, { - maxSizeBytes: this.getProjectScanPayloadSizeLimitInBytes(), + maxTotalSizeBytes: this.getProjectScanPayloadSizeLimitInBytes(), excludePatterns: useCase === FeatureUseCase.TEST_GENERATION ? [...CodeWhispererConstants.testGenExcludePatterns, ...defaultExcludePatterns] diff --git a/packages/core/src/shared/utilities/workspaceUtils.ts b/packages/core/src/shared/utilities/workspaceUtils.ts index 224c6645445..12cce75b3ff 100644 --- a/packages/core/src/shared/utilities/workspaceUtils.ts +++ b/packages/core/src/shared/utilities/workspaceUtils.ts @@ -293,14 +293,13 @@ export const defaultExcludePatterns = [ ] export function getExcludePattern(useDefaults: boolean = true) { - const globAlwaysExcludedDirs = getGlobalExcludePatterns() - const allPatterns = [...globAlwaysExcludedDirs] + const patterns = [...getGlobalExcludePatterns()] if (useDefaults) { - allPatterns.push(...defaultExcludePatterns) + patterns.push(...defaultExcludePatterns) } - return excludePatternsAsString(allPatterns) + return excludePatternsAsString(patterns) } function getGlobalExcludePatterns() { @@ -335,10 +334,19 @@ export type CollectFilesResultItem = { relativeFilePath: string fileUri: vscode.Uri fileContent: string + fileSizeBytes: number zipFilePath: string } export type CollectFilesFilter = (relativePath: string) => boolean // returns true if file should be filtered out - +interface CollectFilesOptions { + maxTotalSizeBytes?: number // 200 MB default + maxFileSizeBytes?: number // 10 MB default + includeContent?: boolean // default true + failOnLimit?: boolean // default true + excludeByGitIgnore?: boolean // default true + excludePatterns?: string[] // default defaultExcludePatterns + filterFn?: CollectFilesFilter +} /** * search files in sourcePaths and collect them using filtering options * @param sourcePaths the paths where collection starts @@ -349,48 +357,40 @@ export type CollectFilesFilter = (relativePath: string) => boolean // returns tr export async function collectFiles( sourcePaths: string[], workspaceFolders: CurrentWsFolders, - options?: { - maxSizeBytes?: number // 200 MB default - excludeByGitIgnore?: boolean // default true - excludePatterns?: string[] // default defaultExcludePatterns - filterFn?: CollectFilesFilter - } -): Promise { - const storage: Awaited = [] - + options?: (CollectFilesOptions & { includeContent: true }) | Omit +): Promise +export async function collectFiles( + sourcePaths: string[], + workspaceFolders: CurrentWsFolders, + options?: CollectFilesOptions & { includeContent: false } +): Promise[]> +export async function collectFiles( + sourcePaths: string[], + workspaceFolders: CurrentWsFolders, + options?: CollectFilesOptions +) { const workspaceFoldersMapping = getWorkspaceFoldersByPrefixes(workspaceFolders) const workspaceToPrefix = new Map( workspaceFoldersMapping === undefined ? [[workspaceFolders[0], '']] : Object.entries(workspaceFoldersMapping).map((value) => [value[1], value[0]]) ) - const prefixWithFolderPrefix = (folder: vscode.WorkspaceFolder, path: string) => { - const prefix = workspaceToPrefix.get(folder) - /** - * collects all files that are marked as source - * @param sourcePaths the paths where collection starts - * @param workspaceFolders the current workspace folders opened - * @param respectGitIgnore whether to respect gitignore file - * @returns all matched files - */ - if (prefix === undefined) { - throw new ToolkitError(`Failed to find prefix for workspace folder ${folder.name}`) - } - return prefix === '' ? path : `${prefix}/${path}` - } - - let totalSizeBytes = 0 + const includeContent = options?.includeContent ?? true + const maxFileSizeBytes = options?.maxFileSizeBytes ?? 1024 * 1024 * 10 const excludeByGitIgnore = options?.excludeByGitIgnore ?? true + const failOnLimit = options?.failOnLimit ?? true const inputExcludePatterns = options?.excludePatterns ?? defaultExcludePatterns - const maxSizeBytes = options?.maxSizeBytes ?? maxRepoSizeBytes + const maxSizeBytes = options?.maxTotalSizeBytes ?? maxRepoSizeBytes const excludePatterns = [...getGlobalExcludePatterns()] if (inputExcludePatterns.length) { excludePatterns.push(...inputExcludePatterns) } - const excludePatternFilter = excludePatternsAsString(excludePatterns) + let totalSizeBytes = 0 + const storage = [] + const excludePatternFilter = excludePatternsAsString(excludePatterns) for (const rootPath of sourcePaths) { const allFiles = await vscode.workspace.findFiles( new vscode.RelativePattern(rootPath, '**'), @@ -410,31 +410,56 @@ export async function collectFiles( } const fileStat = await fs.stat(file) - if (totalSizeBytes + fileStat.size > maxSizeBytes) { + if (failOnLimit && totalSizeBytes + fileStat.size > maxSizeBytes) { throw new ToolkitError( 'The project you have selected for source code is too large to use as context. Please select a different folder to use', { code: 'ContentLengthError' } ) } - const fileContent = await readFile(file) - - if (fileContent === undefined) { + if (fileStat.size > maxFileSizeBytes) { continue } - // Now that we've read the file, increase our usage - totalSizeBytes += fileStat.size - storage.push({ + const result = { workspaceFolder: relativePath.workspaceFolder, relativeFilePath: relativePath.relativePath, fileUri: file, - fileContent: fileContent, + fileSizeBytes: fileStat.size, zipFilePath: prefixWithFolderPrefix(relativePath.workspaceFolder, relativePath.relativePath), - }) + } + if (includeContent) { + const content = await readFile(file) + if (content === undefined) { + continue + } + totalSizeBytes += fileStat.size + storage.push({ + ...result, + fileContent: content, + }) + } else { + totalSizeBytes += fileStat.size + storage.push(result) + } } } return storage + + function prefixWithFolderPrefix(folder: vscode.WorkspaceFolder, path: string) { + const prefix = workspaceToPrefix.get(folder) + /** + * collects all files that are marked as source + * @param sourcePaths the paths where collection starts + * @param workspaceFolders the current workspace folders opened + * @param respectGitIgnore whether to respect gitignore file + * @returns all matched files + */ + if (prefix === undefined) { + throw new ToolkitError(`Failed to find prefix for workspace folder ${folder.name}`) + } + return prefix === '' ? path : `${prefix}/${path}` + } } const readFile = async (file: vscode.Uri) => { @@ -576,7 +601,7 @@ export function getWorkspaceFoldersByPrefixes( * 2. Must not be auto generated code * 3. Must not be within gitignore * 4. Ranked by priority. - * 5. Select files within maxSize limit. + * 5. Select files within maxFileSize limit. * This function do not read the actual file content or compress them into a zip. * TODO: Move this to LSP * @param sourcePaths the paths where collection starts @@ -590,65 +615,20 @@ export async function collectFilesForIndex( respectGitIgnore: boolean = true, maxSize = 250 * 1024 * 1024 // 250 MB, // make this configurable, so we can test it -): Promise< - { - workspaceFolder: vscode.WorkspaceFolder - relativeFilePath: string - fileUri: vscode.Uri - fileSizeBytes: number - }[] -> { - const storage: Awaited> = [] - - const isLanguageSupported = (filename: string) => { - const k = - /\.(js|ts|java|py|rb|cpp|tsx|jsx|cc|c|cs|vb|pl|r|m|hs|mts|mjs|h|clj|dart|groovy|lua|rb|jl|ipynb|html|json|css|md|php|swift|rs|scala|yaml|tf|sql|sh|go|yml|kt|smithy|config|kts|gradle|cfg|xml|vue)$/i - return k.test(filename) || filename.endsWith('Config') - } - - const isBuildOrBin = (filePath: string) => { - const k = /[/\\](bin|build|node_modules|env|\.idea|\.venv|venv)[/\\]/i - return k.test(filePath) - } - - let totalSizeBytes = 0 - for (const rootPath of sourcePaths) { - const allFiles = await vscode.workspace.findFiles( - new vscode.RelativePattern(rootPath, '**'), - getExcludePattern() - ) - const files = respectGitIgnore ? await filterOutGitignoredFiles(rootPath, allFiles) : allFiles - - for (const file of files) { - if (!isLanguageSupported(file.fsPath)) { - continue - } - if (isBuildOrBin(file.fsPath)) { - continue - } - const relativePath = getWorkspaceRelativePath(file.fsPath, { workspaceFolders }) - if (!relativePath) { - continue - } - - const fileStat = await fs.stat(file) - // ignore single file over 10 MB - if (fileStat.size > 10 * 1024 * 1024) { - continue - } - storage.push({ - workspaceFolder: relativePath.workspaceFolder, - relativeFilePath: relativePath.relativePath, - fileUri: file, - fileSizeBytes: fileStat.size, - }) - } - } +) { + const storage = await collectFiles(sourcePaths, workspaceFolders, { + maxFileSizeBytes: 10 * 1024 * 1024, + includeContent: false, + failOnLimit: false, + excludeByGitIgnore: respectGitIgnore, + filterFn: (rp) => !isLanguageSupported(rp) || isBuildOrBin(rp), + }) // prioritize upper level files storage.sort((a, b) => a.fileUri.fsPath.length - b.fileUri.fsPath.length) const maxSizeBytes = Math.min(maxSize, os.freemem() / 2) + let totalSizeBytes = 0 let i = 0 for (i = 0; i < storage.length; i += 1) { totalSizeBytes += storage[i].fileSizeBytes @@ -658,6 +638,17 @@ export async function collectFilesForIndex( } // pick top 100k files below size limit return storage.slice(0, Math.min(100000, i)) + + function isLanguageSupported(filename: string) { + const k = + /\.(js|ts|java|py|rb|cpp|tsx|jsx|cc|c|cs|vb|pl|r|m|hs|mts|mjs|h|clj|dart|groovy|lua|rb|jl|ipynb|html|json|css|md|php|swift|rs|scala|yaml|tf|sql|sh|go|yml|kt|smithy|config|kts|gradle|cfg|xml|vue)$/i + return k.test(filename) || filename.endsWith('Config') + } + + function isBuildOrBin(filePath: string) { + const k = /[/\\](bin|build|node_modules|env|\.idea|\.venv|venv)[/\\]/i + return k.test(filePath) + } } /** diff --git a/packages/core/src/testInteg/shared/utilities/workspaceUtils.test.ts b/packages/core/src/testInteg/shared/utilities/workspaceUtils.test.ts index 18761491458..55c551b87d0 100644 --- a/packages/core/src/testInteg/shared/utilities/workspaceUtils.test.ts +++ b/packages/core/src/testInteg/shared/utilities/workspaceUtils.test.ts @@ -9,6 +9,7 @@ import * as vscode from 'vscode' import { collectFiles, collectFilesForIndex, + CollectFilesResultItem, findParentProjectFile, findStringInDirectory, getWorkspaceFoldersByPrefixes, @@ -19,7 +20,7 @@ import globals from '../../../shared/extensionGlobals' import { CodelensRootRegistry } from '../../../shared/fs/codelensRootRegistry' import { createTestWorkspace, createTestWorkspaceFolder, toFile } from '../../../test/testUtil' import sinon from 'sinon' -import { fs } from '../../../shared' +import { fs, ToolkitError } from '../../../shared' describe('workspaceUtils', () => { let sandbox: sinon.SinonSandbox @@ -256,11 +257,7 @@ describe('workspaceUtils', () => { await writeFile(['src', 'folder3', 'negate_test1'], fileContent) await writeFile(['src', 'folder3', 'negate_test6'], fileContent) - const result = (await collectFiles([workspaceFolder.uri.fsPath], [workspaceFolder])) - // for some reason, uri created inline differ in subfields, so skipping them from assertion - .map(({ fileUri, zipFilePath, ...r }) => ({ ...r })) - - result.sort((l, r) => l.relativeFilePath.localeCompare(r.relativeFilePath)) + const result = processIndexResults(await collectFiles([workspaceFolder.uri.fsPath], [workspaceFolder])) // non-posix filePath check here is important. assert.deepStrictEqual( @@ -269,41 +266,49 @@ describe('workspaceUtils', () => { workspaceFolder, relativeFilePath: '.gitignore', fileContent: gitignoreContent, + fileSizeBytes: 162, }, { workspaceFolder, relativeFilePath: 'file1', fileContent: 'test content', + fileSizeBytes: 12, }, { workspaceFolder, relativeFilePath: 'file3', fileContent: 'test content', + fileSizeBytes: 12, }, { workspaceFolder, relativeFilePath: 'range_file9', fileContent: 'test content', + fileSizeBytes: 12, }, { workspaceFolder, relativeFilePath: path.join('src', '.gitignore'), fileContent: gitignore2, + fileSizeBytes: 8, }, { workspaceFolder, relativeFilePath: path.join('src', 'folder2', 'a.js'), fileContent: fileContent, + fileSizeBytes: 12, }, { workspaceFolder, relativeFilePath: path.join('src', 'folder3', '.gitignore'), fileContent: gitignore3, + fileSizeBytes: 42, }, { workspaceFolder, relativeFilePath: path.join('src', 'folder3', 'negate_test1'), fileContent: fileContent, + fileSizeBytes: 12, }, ] satisfies typeof result, result @@ -336,6 +341,20 @@ describe('workspaceUtils', () => { assert.deepStrictEqual(1, result.length) assert.deepStrictEqual('non-license.md', result[0].relativeFilePath) }) + + it('throws when total size limit is exceeded (by default)', async function () { + const workspace = await createTestWorkspaceFolder() + sandbox.stub(vscode.workspace, 'workspaceFolders').value([workspace]) + + const fileContent = 'this is some text' + await toFile(fileContent, path.join(workspace.uri.fsPath, 'file1')) + await toFile(fileContent, path.join(workspace.uri.fsPath, 'file2')) + + await assert.rejects( + () => collectFiles([workspace.uri.fsPath], [workspace], { maxTotalSizeBytes: 15 }), + (e) => e instanceof ToolkitError && e.code === 'ContentLengthError' + ) + }) }) describe('getWorkspaceFoldersByPrefixes', function () { @@ -440,19 +459,20 @@ describe('workspaceUtils', () => { }) describe('collectFilesForIndex', function () { - it('returns all files in the workspace not excluded by gitignore and is a supported programming language', async function () { - // these variables are a manual selection of settings for the test in order to test the collectFiles function - const fileAmount = 3 - const fileNamePrefix = 'file' - const fileContent = 'test content' + let workspaceFolder: vscode.WorkspaceFolder - const workspaceFolder = await createTestWorkspace(fileAmount, { fileNamePrefix, fileContent }) - - const writeFile = (pathParts: string[], fileContent: string) => { - return toFile(fileContent, path.join(workspaceFolder.uri.fsPath, ...pathParts)) - } + const writeFile = (pathParts: string[], fileContent: string) => { + return toFile(fileContent, path.join(workspaceFolder.uri.fsPath, ...pathParts)) + } + beforeEach(async function () { + workspaceFolder = await createTestWorkspaceFolder() sandbox.stub(vscode.workspace, 'workspaceFolders').value([workspaceFolder]) + }) + + it('returns all files in the workspace not excluded by gitignore and is a supported programming language', async function () { + const fileContent = 'test content' + const gitignoreContent = `file2 # different formats of prefixes /build @@ -486,11 +506,9 @@ describe('workspaceUtils', () => { await writeFile(['src', 'folder3', 'negate_test1'], fileContent) await writeFile(['src', 'folder3', 'negate_test6'], fileContent) - const result = (await collectFilesForIndex([workspaceFolder.uri.fsPath], [workspaceFolder], true)) - // for some reason, uri created inline differ in subfields, so skipping them from assertion - .map(({ fileUri, ...r }) => ({ ...r })) - - result.sort((l, r) => l.relativeFilePath.localeCompare(r.relativeFilePath)) + const result = processIndexResults( + await collectFilesForIndex([workspaceFolder.uri.fsPath], [workspaceFolder], true) + ) // non-posix filePath check here is important. assert.deepStrictEqual( @@ -509,6 +527,68 @@ describe('workspaceUtils', () => { result ) }) + + it('does not include build related files', async function () { + const fileContent = 'this is a file' + + await writeFile(['bin', `ignored1`], fileContent) + await writeFile(['bin', `ignored2`], fileContent) + + await writeFile([`a.js`], fileContent) + await writeFile([`b.java`], fileContent) + + const result = processIndexResults( + await collectFilesForIndex([workspaceFolder.uri.fsPath], [workspaceFolder], true) + ) + + // non-posix filePath check here is important. + assert.deepStrictEqual( + [ + { + workspaceFolder, + relativeFilePath: 'a.js', + fileSizeBytes: 14, + }, + { + workspaceFolder, + relativeFilePath: 'b.java', + fileSizeBytes: 14, + }, + ] satisfies typeof result, + result + ) + }) + + it('returns top level files when max size is reached', async function () { + const fileContent = 'this is a file' + + await writeFile(['path', 'to', 'file', 'bot.js'], fileContent) + await writeFile(['path', 'to', 'file', `bot.java`], fileContent) + + await writeFile([`top.js`], fileContent) + await writeFile([`top.java`], fileContent) + + const result = processIndexResults( + await collectFilesForIndex([workspaceFolder.uri.fsPath], [workspaceFolder], true, 30) + ) + + // non-posix filePath check here is important. + assert.deepStrictEqual( + [ + { + workspaceFolder, + relativeFilePath: 'top.java', + fileSizeBytes: 14, + }, + { + workspaceFolder, + relativeFilePath: 'top.js', + fileSizeBytes: 14, + }, + ] satisfies typeof result, + result + ) + }) }) describe('findStringInDirectory', function () { @@ -522,3 +602,10 @@ describe('workspaceUtils', () => { }) }) }) + +// for some reason, uri created inline differ in subfields, so skipping them from assertion +function processIndexResults(results: Omit[] | CollectFilesResultItem[]) { + return results + .map(({ zipFilePath, fileUri, ...r }) => ({ ...r })) + .sort((l, r) => l.relativeFilePath.localeCompare(r.relativeFilePath)) +}