Skip to content

Commit 5ff53ba

Browse files
authored
refactor(workspaceutil): unify file collecting utilities (aws#6804)
## Problem `collectFiles` and `collectFilesForIndex` serve different purposes, but implement the same core functionality. This core functionality should not be duplicated in two places. ## Solution - Increase testing coverage for both methods. - Refactor `collectFiles` to be general enough to handle the `collectFilesForIndex` use case. This includes the ability to avoid reading the files when we are indexing. - `collectFilesForIndex` now directly calls `collectFiles` then does its additional logic with the result. ## Verification - Tested `@workspace` in chat with some different prompts. Seems to behave the same as before. - Tested `/review` on a project. --- - Treat all work as PUBLIC. Private `feature/x` branches will not be squash-merged at release time. - Your code changes must meet the guidelines in [CONTRIBUTING.md](https://github.com/aws/aws-toolkit-vscode/blob/master/CONTRIBUTING.md#guidelines). - License: I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent 070c36d commit 5ff53ba

File tree

4 files changed

+197
-119
lines changed

4 files changed

+197
-119
lines changed

packages/core/src/amazonq/util/files.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ export async function prepareRepoData(
8787
}
8888

8989
const files = await collectFiles(repoRootPaths, workspaceFolders, {
90-
maxSizeBytes: maxRepoSizeBytes,
90+
maxTotalSizeBytes: maxRepoSizeBytes,
9191
excludeByGitIgnore: true,
9292
excludePatterns: excludePatterns,
9393
filterFn: filterFn,

packages/core/src/codewhisperer/util/zipUtil.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ export class ZipUtil {
420420
)
421421
: vscode.workspace.workspaceFolders) as CurrentWsFolders,
422422
{
423-
maxSizeBytes: this.getProjectScanPayloadSizeLimitInBytes(),
423+
maxTotalSizeBytes: this.getProjectScanPayloadSizeLimitInBytes(),
424424
excludePatterns:
425425
useCase === FeatureUseCase.TEST_GENERATION
426426
? [...CodeWhispererConstants.testGenExcludePatterns, ...defaultExcludePatterns]

packages/core/src/shared/utilities/workspaceUtils.ts

Lines changed: 87 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -293,14 +293,13 @@ export const defaultExcludePatterns = [
293293
]
294294

295295
export function getExcludePattern(useDefaults: boolean = true) {
296-
const globAlwaysExcludedDirs = getGlobalExcludePatterns()
297-
const allPatterns = [...globAlwaysExcludedDirs]
296+
const patterns = [...getGlobalExcludePatterns()]
298297

299298
if (useDefaults) {
300-
allPatterns.push(...defaultExcludePatterns)
299+
patterns.push(...defaultExcludePatterns)
301300
}
302301

303-
return excludePatternsAsString(allPatterns)
302+
return excludePatternsAsString(patterns)
304303
}
305304

306305
function getGlobalExcludePatterns() {
@@ -335,10 +334,19 @@ export type CollectFilesResultItem = {
335334
relativeFilePath: string
336335
fileUri: vscode.Uri
337336
fileContent: string
337+
fileSizeBytes: number
338338
zipFilePath: string
339339
}
340340
export type CollectFilesFilter = (relativePath: string) => boolean // returns true if file should be filtered out
341-
341+
interface CollectFilesOptions {
342+
maxTotalSizeBytes?: number // 200 MB default
343+
maxFileSizeBytes?: number // 10 MB default
344+
includeContent?: boolean // default true
345+
failOnLimit?: boolean // default true
346+
excludeByGitIgnore?: boolean // default true
347+
excludePatterns?: string[] // default defaultExcludePatterns
348+
filterFn?: CollectFilesFilter
349+
}
342350
/**
343351
* search files in sourcePaths and collect them using filtering options
344352
* @param sourcePaths the paths where collection starts
@@ -349,48 +357,40 @@ export type CollectFilesFilter = (relativePath: string) => boolean // returns tr
349357
export async function collectFiles(
350358
sourcePaths: string[],
351359
workspaceFolders: CurrentWsFolders,
352-
options?: {
353-
maxSizeBytes?: number // 200 MB default
354-
excludeByGitIgnore?: boolean // default true
355-
excludePatterns?: string[] // default defaultExcludePatterns
356-
filterFn?: CollectFilesFilter
357-
}
358-
): Promise<CollectFilesResultItem[]> {
359-
const storage: Awaited<CollectFilesResultItem[]> = []
360-
360+
options?: (CollectFilesOptions & { includeContent: true }) | Omit<CollectFilesOptions, 'includeContent'>
361+
): Promise<CollectFilesResultItem[]>
362+
export async function collectFiles(
363+
sourcePaths: string[],
364+
workspaceFolders: CurrentWsFolders,
365+
options?: CollectFilesOptions & { includeContent: false }
366+
): Promise<Omit<CollectFilesResultItem, 'fileContent'>[]>
367+
export async function collectFiles(
368+
sourcePaths: string[],
369+
workspaceFolders: CurrentWsFolders,
370+
options?: CollectFilesOptions
371+
) {
361372
const workspaceFoldersMapping = getWorkspaceFoldersByPrefixes(workspaceFolders)
362373
const workspaceToPrefix = new Map<vscode.WorkspaceFolder, string>(
363374
workspaceFoldersMapping === undefined
364375
? [[workspaceFolders[0], '']]
365376
: Object.entries(workspaceFoldersMapping).map((value) => [value[1], value[0]])
366377
)
367-
const prefixWithFolderPrefix = (folder: vscode.WorkspaceFolder, path: string) => {
368-
const prefix = workspaceToPrefix.get(folder)
369-
/**
370-
* collects all files that are marked as source
371-
* @param sourcePaths the paths where collection starts
372-
* @param workspaceFolders the current workspace folders opened
373-
* @param respectGitIgnore whether to respect gitignore file
374-
* @returns all matched files
375-
*/
376-
if (prefix === undefined) {
377-
throw new ToolkitError(`Failed to find prefix for workspace folder ${folder.name}`)
378-
}
379-
return prefix === '' ? path : `${prefix}/${path}`
380-
}
381-
382-
let totalSizeBytes = 0
383378

379+
const includeContent = options?.includeContent ?? true
380+
const maxFileSizeBytes = options?.maxFileSizeBytes ?? 1024 * 1024 * 10
384381
const excludeByGitIgnore = options?.excludeByGitIgnore ?? true
382+
const failOnLimit = options?.failOnLimit ?? true
385383
const inputExcludePatterns = options?.excludePatterns ?? defaultExcludePatterns
386-
const maxSizeBytes = options?.maxSizeBytes ?? maxRepoSizeBytes
384+
const maxSizeBytes = options?.maxTotalSizeBytes ?? maxRepoSizeBytes
387385

388386
const excludePatterns = [...getGlobalExcludePatterns()]
389387
if (inputExcludePatterns.length) {
390388
excludePatterns.push(...inputExcludePatterns)
391389
}
392-
const excludePatternFilter = excludePatternsAsString(excludePatterns)
393390

391+
let totalSizeBytes = 0
392+
const storage = []
393+
const excludePatternFilter = excludePatternsAsString(excludePatterns)
394394
for (const rootPath of sourcePaths) {
395395
const allFiles = await vscode.workspace.findFiles(
396396
new vscode.RelativePattern(rootPath, '**'),
@@ -410,31 +410,56 @@ export async function collectFiles(
410410
}
411411

412412
const fileStat = await fs.stat(file)
413-
if (totalSizeBytes + fileStat.size > maxSizeBytes) {
413+
if (failOnLimit && totalSizeBytes + fileStat.size > maxSizeBytes) {
414414
throw new ToolkitError(
415415
'The project you have selected for source code is too large to use as context. Please select a different folder to use',
416416
{ code: 'ContentLengthError' }
417417
)
418418
}
419419

420-
const fileContent = await readFile(file)
421-
422-
if (fileContent === undefined) {
420+
if (fileStat.size > maxFileSizeBytes) {
423421
continue
424422
}
425423

426-
// Now that we've read the file, increase our usage
427-
totalSizeBytes += fileStat.size
428-
storage.push({
424+
const result = {
429425
workspaceFolder: relativePath.workspaceFolder,
430426
relativeFilePath: relativePath.relativePath,
431427
fileUri: file,
432-
fileContent: fileContent,
428+
fileSizeBytes: fileStat.size,
433429
zipFilePath: prefixWithFolderPrefix(relativePath.workspaceFolder, relativePath.relativePath),
434-
})
430+
}
431+
if (includeContent) {
432+
const content = await readFile(file)
433+
if (content === undefined) {
434+
continue
435+
}
436+
totalSizeBytes += fileStat.size
437+
storage.push({
438+
...result,
439+
fileContent: content,
440+
})
441+
} else {
442+
totalSizeBytes += fileStat.size
443+
storage.push(result)
444+
}
435445
}
436446
}
437447
return storage
448+
449+
function prefixWithFolderPrefix(folder: vscode.WorkspaceFolder, path: string) {
450+
const prefix = workspaceToPrefix.get(folder)
451+
/**
452+
* collects all files that are marked as source
453+
* @param sourcePaths the paths where collection starts
454+
* @param workspaceFolders the current workspace folders opened
455+
* @param respectGitIgnore whether to respect gitignore file
456+
* @returns all matched files
457+
*/
458+
if (prefix === undefined) {
459+
throw new ToolkitError(`Failed to find prefix for workspace folder ${folder.name}`)
460+
}
461+
return prefix === '' ? path : `${prefix}/${path}`
462+
}
438463
}
439464

440465
const readFile = async (file: vscode.Uri) => {
@@ -576,7 +601,7 @@ export function getWorkspaceFoldersByPrefixes(
576601
* 2. Must not be auto generated code
577602
* 3. Must not be within gitignore
578603
* 4. Ranked by priority.
579-
* 5. Select files within maxSize limit.
604+
* 5. Select files within maxFileSize limit.
580605
* This function does not read the actual file content or compress it into a zip.
581606
* TODO: Move this to LSP
582607
* @param sourcePaths the paths where collection starts
@@ -590,65 +615,20 @@ export async function collectFilesForIndex(
590615
respectGitIgnore: boolean = true,
591616
maxSize = 250 * 1024 * 1024 // 250 MB,
592617
// make this configurable, so we can test it
593-
): Promise<
594-
{
595-
workspaceFolder: vscode.WorkspaceFolder
596-
relativeFilePath: string
597-
fileUri: vscode.Uri
598-
fileSizeBytes: number
599-
}[]
600-
> {
601-
const storage: Awaited<ReturnType<typeof collectFilesForIndex>> = []
602-
603-
const isLanguageSupported = (filename: string) => {
604-
const k =
605-
/\.(js|ts|java|py|rb|cpp|tsx|jsx|cc|c|cs|vb|pl|r|m|hs|mts|mjs|h|clj|dart|groovy|lua|rb|jl|ipynb|html|json|css|md|php|swift|rs|scala|yaml|tf|sql|sh|go|yml|kt|smithy|config|kts|gradle|cfg|xml|vue)$/i
606-
return k.test(filename) || filename.endsWith('Config')
607-
}
608-
609-
const isBuildOrBin = (filePath: string) => {
610-
const k = /[/\\](bin|build|node_modules|env|\.idea|\.venv|venv)[/\\]/i
611-
return k.test(filePath)
612-
}
613-
614-
let totalSizeBytes = 0
615-
for (const rootPath of sourcePaths) {
616-
const allFiles = await vscode.workspace.findFiles(
617-
new vscode.RelativePattern(rootPath, '**'),
618-
getExcludePattern()
619-
)
620-
const files = respectGitIgnore ? await filterOutGitignoredFiles(rootPath, allFiles) : allFiles
621-
622-
for (const file of files) {
623-
if (!isLanguageSupported(file.fsPath)) {
624-
continue
625-
}
626-
if (isBuildOrBin(file.fsPath)) {
627-
continue
628-
}
629-
const relativePath = getWorkspaceRelativePath(file.fsPath, { workspaceFolders })
630-
if (!relativePath) {
631-
continue
632-
}
633-
634-
const fileStat = await fs.stat(file)
635-
// ignore single file over 10 MB
636-
if (fileStat.size > 10 * 1024 * 1024) {
637-
continue
638-
}
639-
storage.push({
640-
workspaceFolder: relativePath.workspaceFolder,
641-
relativeFilePath: relativePath.relativePath,
642-
fileUri: file,
643-
fileSizeBytes: fileStat.size,
644-
})
645-
}
646-
}
618+
) {
619+
const storage = await collectFiles(sourcePaths, workspaceFolders, {
620+
maxFileSizeBytes: 10 * 1024 * 1024,
621+
includeContent: false,
622+
failOnLimit: false,
623+
excludeByGitIgnore: respectGitIgnore,
624+
filterFn: (rp) => !isLanguageSupported(rp) || isBuildOrBin(rp),
625+
})
647626
// prioritize upper level files
648627
storage.sort((a, b) => a.fileUri.fsPath.length - b.fileUri.fsPath.length)
649628

650629
const maxSizeBytes = Math.min(maxSize, os.freemem() / 2)
651630

631+
let totalSizeBytes = 0
652632
let i = 0
653633
for (i = 0; i < storage.length; i += 1) {
654634
totalSizeBytes += storage[i].fileSizeBytes
@@ -658,6 +638,17 @@ export async function collectFilesForIndex(
658638
}
659639
// pick top 100k files below size limit
660640
return storage.slice(0, Math.min(100000, i))
641+
642+
function isLanguageSupported(filename: string) {
643+
const k =
644+
/\.(js|ts|java|py|rb|cpp|tsx|jsx|cc|c|cs|vb|pl|r|m|hs|mts|mjs|h|clj|dart|groovy|lua|rb|jl|ipynb|html|json|css|md|php|swift|rs|scala|yaml|tf|sql|sh|go|yml|kt|smithy|config|kts|gradle|cfg|xml|vue)$/i
645+
return k.test(filename) || filename.endsWith('Config')
646+
}
647+
648+
function isBuildOrBin(filePath: string) {
649+
const k = /[/\\](bin|build|node_modules|env|\.idea|\.venv|venv)[/\\]/i
650+
return k.test(filePath)
651+
}
661652
}
662653

663654
/**

0 commit comments

Comments
 (0)