Skip to content

Commit 88a5acd

Browse files
authored
fix(codewhisperer): refactor BM25 algorithm impl, prioritize opening files when generating supplemental context (#3583)
* fix cross-file context not prioritizing files opened in editors * call trimEnd() on every code chunk of cross-file context * fix BM25 algorithm counting logic producing NaN
1 parent bb1ce25 commit 88a5acd

File tree

11 files changed

+632
-253
lines changed

11 files changed

+632
-253
lines changed

src/codewhisperer/models/constants.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,3 +267,13 @@ export enum UserGroup {
267267
export const isClassifierEnabledKey = 'CODEWHISPERER_CLASSIFIER_TRIGGER_ENABLED'
268268

269269
export const supplemetalContextFetchingTimeoutMsg = 'codewhisperer supplemental context fetching timeout'
270+
271+
export const crossFileContextConfig = {
272+
numberOfChunkToFetch: 60,
273+
topK: 3,
274+
numberOfLinesEachChunk: 10,
275+
}
276+
277+
export const utgConfig = {
278+
maxSegmentSize: 10200,
279+
}

src/codewhisperer/service/recommendationHandler.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ export class RecommendationHandler {
5353
public isGenerateRecommendationInProgress: boolean
5454
private _onDidReceiveRecommendation: vscode.EventEmitter<void> = new vscode.EventEmitter<void>()
5555
public readonly onDidReceiveRecommendation: vscode.Event<void> = this._onDidReceiveRecommendation.event
56-
private supplementalContextMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined
56+
private supplementalContextMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined
5757

5858
constructor() {
5959
this.requestId = ''
@@ -149,7 +149,7 @@ export class RecommendationHandler {
149149
let nextToken = ''
150150
let errorCode = ''
151151
let req: codewhispererClient.ListRecommendationsRequest | codewhispererClient.GenerateRecommendationsRequest
152-
let supplementalContextMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined
152+
let supplementalContextMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined
153153
let shouldRecordServiceInvocation = false
154154

155155
if (pagination) {

src/codewhisperer/util/editorContext.ts

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import { supplementalContextTimeoutInMs } from '../models/constants'
1919
import { CodeWhispererUserGroupSettings } from './userGroupUtil'
2020
import { isTestFile } from './supplementalContext/codeParsingUtil'
2121
import { DependencyGraphFactory } from './dependencyGraph/dependencyGraphFactory'
22+
import { selectFrom } from '../../shared/utilities/tsUtils'
2223

2324
let tabSize: number = getTabSizeSetting()
2425

@@ -80,7 +81,7 @@ export async function buildListRecommendationRequest(
8081
allowCodeWithReference: boolean | undefined = undefined
8182
): Promise<{
8283
request: codewhispererClient.ListRecommendationsRequest
83-
supplementalMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined
84+
supplementalMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined
8485
}> {
8586
const fileContext = extractContextForCodeWhisperer(editor)
8687

@@ -98,23 +99,30 @@ export async function buildListRecommendationRequest(
9899
? await fetchSupplementalContext(editor, tokenSource.token)
99100
: undefined
100101

101-
const suppelmetalMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined = supplementalContexts
102-
? {
103-
isUtg: supplementalContexts.isUtg,
104-
isProcessTimeout: supplementalContexts.isProcessTimeout,
105-
contentsLength: supplementalContexts.contentsLength,
106-
latency: supplementalContexts.latency,
107-
}
108-
: undefined
102+
const suppelmetalMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined =
103+
supplementalContexts
104+
? {
105+
isUtg: supplementalContexts.isUtg,
106+
isProcessTimeout: supplementalContexts.isProcessTimeout,
107+
contentsLength: supplementalContexts.contentsLength,
108+
latency: supplementalContexts.latency,
109+
}
110+
: undefined
109111

110112
logSupplementalContext(supplementalContexts)
111113

114+
const supplementalContext: codewhispererClient.SupplementalContext[] = supplementalContexts
115+
? supplementalContexts.supplementalContextItems.map(v => {
116+
return selectFrom(v, 'content', 'filePath')
117+
})
118+
: []
119+
112120
if (allowCodeWithReference === undefined) {
113121
return {
114122
request: {
115123
fileContext: fileContext,
116124
nextToken: nextToken,
117-
supplementalContexts: supplementalContexts ? supplementalContexts.contents : [],
125+
supplementalContexts: supplementalContext,
118126
},
119127
supplementalMetadata: suppelmetalMetadata,
120128
}
@@ -127,15 +135,15 @@ export async function buildListRecommendationRequest(
127135
referenceTrackerConfiguration: {
128136
recommendationsWithReferences: allowCodeWithReference ? 'ALLOW' : 'BLOCK',
129137
},
130-
supplementalContexts: supplementalContexts ? supplementalContexts.contents : [],
138+
supplementalContexts: supplementalContext,
131139
},
132140
supplementalMetadata: suppelmetalMetadata,
133141
}
134142
}
135143

136144
export async function buildGenerateRecommendationRequest(editor: vscode.TextEditor): Promise<{
137145
request: codewhispererClient.GenerateRecommendationsRequest
138-
supplementalMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined
146+
supplementalMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined
139147
}> {
140148
const fileContext = extractContextForCodeWhisperer(editor)
141149

@@ -144,7 +152,7 @@ export async function buildGenerateRecommendationRequest(editor: vscode.TextEdit
144152
tokenSource.cancel()
145153
}, supplementalContextTimeoutInMs)
146154
const supplementalContexts = await fetchSupplementalContext(editor, tokenSource.token)
147-
let supplemetalMetadata: Omit<CodeWhispererSupplementalContext, 'contents'> | undefined
155+
let supplemetalMetadata: Omit<CodeWhispererSupplementalContext, 'supplementalContextItems'> | undefined
148156

149157
if (supplementalContexts) {
150158
supplemetalMetadata = {
@@ -161,7 +169,7 @@ export async function buildGenerateRecommendationRequest(editor: vscode.TextEdit
161169
request: {
162170
fileContext: fileContext,
163171
maxResults: CodeWhispererConstants.maxRecommendations,
164-
supplementalContexts: supplementalContexts?.contents ?? [],
172+
supplementalContexts: supplementalContexts?.supplementalContextItems ?? [],
165173
},
166174
supplementalMetadata: supplemetalMetadata,
167175
}
@@ -228,10 +236,12 @@ function logSupplementalContext(supplementalContext: CodeWhispererSupplementalCo
228236
latency: ${supplementalContext.latency},
229237
`)
230238

231-
supplementalContext.contents.forEach((context, index) => {
239+
supplementalContext.supplementalContextItems.forEach((context, index) => {
232240
getLogger().verbose(`
233241
-----------------------------------------------
234-
Chunk ${index}:${context.content}
242+
Path: ${context.filePath}
243+
Score: ${context.score}
244+
Chunk: ${index}:${context.content}
235245
-----------------------------------------------
236246
`)
237247
})

src/codewhisperer/util/supplementalContext/crossFileContextUtil.ts

Lines changed: 62 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,28 @@
44
*/
55

66
import * as vscode from 'vscode'
7-
import * as codewhispererClient from '../../client/codewhisperer'
87
import * as fs from 'fs-extra'
98
import { DependencyGraph } from '../dependencyGraph/dependencyGraph'
10-
import { BMDocument, performBM25Scoring } from './rankBm25'
11-
import { getRelevantFilesFromEditor, isRelevant } from './editorFilesUtil'
9+
import { BM25Document, BM25Okapi } from './rankBm25'
10+
import { isRelevant } from './editorFilesUtil'
1211
import { ToolkitError } from '../../../shared/errors'
13-
import { supplemetalContextFetchingTimeoutMsg } from '../../models/constants'
12+
import { crossFileContextConfig, supplemetalContextFetchingTimeoutMsg } from '../../models/constants'
1413
import { CancellationError } from '../../../shared/utilities/timeoutUtils'
14+
import { CodeWhispererSupplementalContextItem } from './supplementalContextUtil'
1515

1616
const crossFileLanguageConfigs = ['java']
1717
interface Chunk {
1818
fileName: string
1919
content: string
2020
nextContent: string
21+
score?: number
2122
}
22-
const chunkSize = 10
23-
const chunkCount = 60
24-
const topK = 3
2523

2624
export async function fetchSupplementalContextForSrc(
2725
editor: vscode.TextEditor,
2826
dependencyGraph: DependencyGraph,
2927
cancellationToken: vscode.CancellationToken
30-
) {
28+
): Promise<CodeWhispererSupplementalContextItem[] | undefined> {
3129
if (crossFileLanguageConfigs.includes(editor.document.languageId) === false) {
3230
return undefined
3331
}
@@ -38,59 +36,66 @@ export async function fetchSupplementalContextForSrc(
3836
// Step 2: Split files to chunks with upper bound on chunkCount
3937
// We restrict the total number of chunks to improve on latency.
4038
// Chunk linking is required as we want to pass the next chunk value for matched chunk.
41-
const chunkList: Chunk[] = []
39+
let chunkList: Chunk[] = []
4240
for (const relevantFile of relevantCrossFilePaths) {
4341
throwIfCancelled(cancellationToken)
44-
45-
const chunks: Chunk[] = splitFileToChunks(relevantFile, chunkSize)
42+
const chunks: Chunk[] = splitFileToChunks(relevantFile, crossFileContextConfig.numberOfLinesEachChunk)
4643
const linkedChunks = linkChunks(chunks)
4744
chunkList.push(...linkedChunks)
48-
if (chunkList.length >= chunkCount) {
45+
if (chunkList.length >= crossFileContextConfig.numberOfChunkToFetch) {
4946
break
5047
}
5148
}
5249

50+
// it's required since chunkList.push(...) is likely giving us a list of size > 60
51+
chunkList = chunkList.slice(0, crossFileContextConfig.numberOfChunkToFetch)
52+
5353
// Step 3: Generate Input chunk (10 lines left of cursor position)
5454
// and Find Best K chunks w.r.t input chunk using BM25
55-
const inputChunk: Chunk = getInputChunk(editor, chunkSize)
56-
const bestChunks: Chunk[] = findBestKChunkMatches(inputChunk, chunkList, topK)
55+
const inputChunk: Chunk = getInputChunk(editor, crossFileContextConfig.numberOfLinesEachChunk)
56+
const bestChunks: Chunk[] = findBestKChunkMatches(inputChunk, chunkList, crossFileContextConfig.topK)
5757
throwIfCancelled(cancellationToken)
5858

5959
// Step 4: Transform best chunks to supplemental contexts
60-
const supplementalContexts: codewhispererClient.SupplementalContext[] = []
60+
const supplementalContexts: CodeWhispererSupplementalContextItem[] = []
6161
for (const chunk of bestChunks) {
6262
throwIfCancelled(cancellationToken)
6363

64-
const context = {
64+
supplementalContexts.push({
6565
filePath: chunk.fileName,
6666
content: chunk.nextContent,
67-
} as codewhispererClient.SupplementalContext
68-
supplementalContexts.push(context)
67+
score: chunk.score,
68+
})
6969
}
7070

7171
return supplementalContexts
7272
}
7373

74-
/**
 * Scores every candidate chunk against the input chunk with BM25 and returns the best matches.
 *
 * @param chunkInput chunk whose `content` is used as the BM25 query
 * @param chunkReferences candidate chunks; their `content` values form the BM25 corpus
 * @param k number of top matches requested from `BM25Okapi.topN`
 * @returns copies of the matched chunks, in the order `topN` yields them, each carrying the
 *          BM25 `score` of its document
 */
function findBestKChunkMatches(chunkInput: Chunk, chunkReferences: Chunk[], k: number): Chunk[] {
    const chunkContentList = chunkReferences.map(chunk => chunk.content)

    // Honor the caller-supplied `k` instead of re-reading the config, so the parameter is not dead.
    // NOTE(review): topN is presumably ordered by descending score - confirm in rankBm25.
    const topMatches: BM25Document[] = new BM25Okapi(chunkContentList).topN(chunkInput.content, k)

    return topMatches.map(doc => {
        // `doc.index` is the position in the corpus, which mirrors `chunkReferences`; use it to
        // recover the metadata (file name, linked next chunk) that BM25 knows nothing about.
        const chunkReference = chunkReferences[doc.index]
        return {
            content: chunkReference.content,
            fileName: chunkReference.fileName,
            nextContent: chunkReference.nextContent,
            score: doc.score,
        }
    })
}
8792

8893
/* This extracts up to `chunkSize` lines preceding the cursor line from the trigger file.
8994
* This will be the input query for BM25 matching against the list of cross-file chunks
9095
*/
9196
function getInputChunk(editor: vscode.TextEditor, chunkSize: number) {
9297
const cursorPosition = editor.selection.active
93-
const startLine = Math.max(cursorPosition.line - 10, 0)
98+
const startLine = Math.max(cursorPosition.line - chunkSize, 0)
9499
const endLine = Math.max(cursorPosition.line - 1, 0)
95100
const inputChunkContent = editor.document.getText(
96101
new vscode.Range(startLine, 0, endLine, editor.document.lineAt(endLine).text.length)
@@ -109,7 +114,7 @@ function linkChunks(chunks: Chunk[]) {
109114

110115
// This additional chunk is needed to create a next pointer to chunk 0.
111116
const firstChunk = chunks[0]
112-
const firstChunkSubContent = firstChunk.content.split('\n').slice(0, 3).join('\n')
117+
const firstChunkSubContent = firstChunk.content.split('\n').slice(0, 3).join('\n').trimEnd()
113118
const newFirstChunk = {
114119
fileName: firstChunk.fileName,
115120
content: firstChunkSubContent,
@@ -132,12 +137,12 @@ function linkChunks(chunks: Chunk[]) {
132137
function splitFileToChunks(filePath: string, chunkSize: number): Chunk[] {
133138
const chunks: Chunk[] = []
134139

135-
const fileContent = fs.readFileSync(filePath, 'utf-8')
140+
const fileContent = fs.readFileSync(filePath, 'utf-8').trimEnd()
136141
const lines = fileContent.split('\n')
137142

138143
for (let i = 0; i < lines.length; i += chunkSize) {
139144
const chunkContent = lines.slice(i, Math.min(i + chunkSize, lines.length)).join('\n')
140-
const chunk = { fileName: filePath, content: chunkContent, nextContent: '' }
145+
const chunk = { fileName: filePath, content: chunkContent.trimEnd(), nextContent: '' }
141146
chunks.push(chunk)
142147
}
143148
return chunks
@@ -148,7 +153,10 @@ function splitFileToChunks(filePath: string, chunkSize: number): Chunk[] {
148153
* by referencing open files, imported files and same package files.
149154
*/
150155
async function getRelevantCrossFiles(editor: vscode.TextEditor, dependencyGraph: DependencyGraph): Promise<string[]> {
151-
const srcDependencies = await dependencyGraph.getSourceDependencies(editor.document.uri, editor.document.getText())
156+
const openedFilesInEditor = new Set(getOpenFilesInWindow())
157+
158+
let srcDependencies = await dependencyGraph.getSourceDependencies(editor.document.uri, editor.document.getText())
159+
srcDependencies = moveToFront(srcDependencies, openedFilesInEditor)
152160

153161
const samePackageFiles = await dependencyGraph.getSamePackageFiles(
154162
editor.document.uri,
@@ -158,21 +166,31 @@ async function getRelevantCrossFiles(editor: vscode.TextEditor, dependencyGraph:
158166
return isRelevant(editor.document.fileName, file, editor.document.languageId)
159167
})
160168

161-
const relevantOpenFiles: vscode.Uri[] = await getRelevantFilesFromEditor(
162-
editor.document.fileName,
163-
editor.document.languageId
164-
)
169+
const mergedCrossFileList = [...new Set([...srcDependencies, ...samePackageRelevantFiles])]
165170

166-
// We refer to only those open files which are in srcDependencies
167-
const filteredRelevantOpenFiles = relevantOpenFiles
168-
.filter(file => srcDependencies.includes(file.fsPath))
169-
.map(file => file.fsPath)
171+
return mergedCrossFileList
172+
}
170173

171-
const mergedCrossFileList = [
172-
...new Set([...filteredRelevantOpenFiles, ...srcDependencies, ...samePackageRelevantFiles]),
173-
]
174+
// Returns a sorted copy of `arr` in which the elements contained in `picked` come before
// all other elements; `arr` itself is not modified.
function moveToFront<T>(arr: T[], picked: Set<T>) {
    // picked elements get rank 0, everything else rank 1
    const rankOf = (item: T): number => (picked.has(item) ? 0 : 1)

    const reordered = Array.from(arr)
    reordered.sort((left, right) => rankOf(left) - rankOf(right))
    return reordered
}
174178

175-
return mergedCrossFileList
179+
/**
 * Collects the file-system paths of the resources shown in the tabs of every tab group
 * of the window.
 *
 * Tabs whose input exposes no `uri` with a string `fsPath` are skipped individually, so one
 * such tab no longer aborts the scan of the tabs that follow it.
 *
 * @returns the collected paths in tab-group / tab order; an empty array when the tab API
 *          cannot be used
 */
function getOpenFilesInWindow(): string[] {
    const filesOpenedInEditor: string[] = []

    try {
        for (const tabGroup of vscode.window.tabGroups.all) {
            for (const tab of tabGroup.tabs) {
                // `fsPath` rather than `path`: the result is matched against dependency paths,
                // and the two forms differ on Windows.
                // NOTE(review): assumes the dependency graph returns fsPath-style strings - confirm.
                const fsPath = (tab.input as { uri?: { fsPath?: unknown } } | undefined)?.uri?.fsPath
                if (typeof fsPath === 'string') {
                    filesOpenedInEditor.push(fsPath)
                }
            }
        }
    } catch (e) {
        // Older versions of VSC do not have the tab API
    }

    return filesOpenedInEditor
}
177195

178196
function throwIfCancelled(token: vscode.CancellationToken): void | never {

0 commit comments

Comments
 (0)