Skip to content

Commit 77fd3c7

Browse files
authored
Start improving externalIngest (#2728)
* Start improving externalIngest. Starting to hook up better basic db support. Still not very optimized yet. * Fix tests for CI
1 parent 30ad4c7 commit 77fd3c7

File tree

5 files changed

+596
-164
lines changed

5 files changed

+596
-164
lines changed

src/platform/workspaceChunkSearch/node/codeSearch/codeSearchChunkSearch.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import { EmbeddingsChunkSearch } from '../embeddingsChunkSearch';
4444
import { TfIdfWithSemanticChunkSearch } from '../tfidfWithSemanticChunkSearch';
4545
import { IWorkspaceFileIndex } from '../workspaceFileIndex';
4646
import { AdoCodeSearchRepo, BuildIndexTriggerReason, CodeSearchRepo, CodeSearchRepoStatus, GithubCodeSearchRepo, TriggerIndexingError, TriggerRemoteIndexingError } from './codeSearchRepo';
47+
import { ExternalIngestClient } from './externalIngestClient';
4748
import { ExternalIngestIndex } from './externalIngestIndex';
4849
import { CodeSearchRepoTracker, RepoInfo, TrackedRepoStatus } from './repoTracker';
4950
import { CodeSearchDiff, CodeSearchWorkspaceDiffTracker } from './workspaceDiff';
@@ -162,7 +163,10 @@ export class CodeSearchChunkSearch extends Disposable implements IWorkspaceChunk
162163
this._tfIdfChunkSearch = tfIdfChunkSearch;
163164

164165
this._repoTracker = this._register(instantiationService.createInstance(CodeSearchRepoTracker));
165-
this._externalIngestIndex = new Lazy(() => this._register(instantiationService.createInstance(ExternalIngestIndex)));
166+
this._externalIngestIndex = new Lazy(() => {
167+
const client = instantiationService.createInstance(ExternalIngestClient);
168+
return this._register(instantiationService.createInstance(ExternalIngestIndex, client));
169+
});
166170

167171
this._register(this._repoTracker.onDidAddOrUpdateRepo(info => {
168172
if (info.status === TrackedRepoStatus.Resolved && info.resolvedRemoteInfo) {

src/platform/workspaceChunkSearch/node/codeSearch/externalIngestClient.ts

Lines changed: 82 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Licensed under the MIT License. See License.txt in the project root for license information.
44
*--------------------------------------------------------------------------------------------*/
55

6-
import { DocumentContents, GeoFilter, IngestFilter, canIngestDocument, createCodedSymbols, setupPanicHooks } from '@github/blackbird-external-ingest-utils';
6+
import { canIngestDocument, canIngestPathAndSize, createCodedSymbols, DocumentContents, GeoFilter, IngestFilter, setupPanicHooks } from '@github/blackbird-external-ingest-utils';
77
import crypto from 'crypto';
88
import fs from 'fs';
99
import { posix } from 'node:path';
@@ -12,26 +12,71 @@ import { coalesce } from '../../../../util/vs/base/common/arrays';
1212
import { timeout } from '../../../../util/vs/base/common/async';
1313
import { URI } from '../../../../util/vs/base/common/uri';
1414
import { Range } from '../../../../util/vs/editor/common/core/range';
15+
import { IAuthenticationService } from '../../../authentication/common/authentication';
1516
import { FileChunkAndScore } from '../../../chunking/common/chunk';
1617
import { EmbeddingType } from '../../../embeddings/common/embeddingsComputer';
1718
import { ILogService } from '../../../log/common/logService';
1819
import { CodeSearchResult } from '../../../remoteCodeSearch/common/remoteCodeSearch';
1920
import { ApiClient } from './externalIngestApi';
2021

22+
export interface ExternalIngestFile {
23+
readonly uri: URI;
24+
readonly docSha: Uint8Array;
25+
26+
read(): Promise<Uint8Array>;
27+
}
28+
29+
/**
30+
* Interface for the external ingest client that handles indexing and searching files.
31+
*/
32+
export interface IExternalIngestClient {
33+
doInitialIndex(filesetName: string, root: URI, allFiles: AsyncIterable<ExternalIngestFile>, token: CancellationToken): Promise<void>;
34+
35+
listFilesets(token: CancellationToken): Promise<string[]>;
36+
deleteFileset(filesetName: string, token: CancellationToken): Promise<void>;
37+
38+
searchFilesets(filesetName: string, rootUri: URI, prompt: string, limit: number, token: CancellationToken): Promise<CodeSearchResult>;
39+
40+
/**
41+
* Quickly checks if a file can be ingested based on its path and size.
42+
*/
43+
canIngestPathAndSize(filePath: string, size: number): boolean;
44+
45+
/**
46+
* Checks if a file can be ingested based on its path and file contents.
47+
*/
48+
canIngestDocument(filePath: string, data: Uint8Array): boolean;
49+
}
2150

2251
// Create a shared API client with throttling (target quota usage of 80)
2352
// You can change this to `null` to ignore the throttle
2453

25-
export class ExternalIngestClient {
54+
export class ExternalIngestClient implements IExternalIngestClient {
2655
private static apiClient = new ApiClient(80);
2756

2857
private static readonly PROMISE_POOL_SIZE = 32;
2958
private static baseUrl = 'https://api.github.com';
3059

60+
private readonly _ingestFilter = new IngestFilter();
61+
3162
constructor(
63+
@IAuthenticationService private readonly _authenticationService: IAuthenticationService,
3264
@ILogService private readonly logService: ILogService,
3365
) { }
3466

67+
public async getAuthToken(): Promise<string | undefined> {
68+
return (await this._authenticationService.getGitHubSession('permissive', { silent: true }))?.accessToken
69+
?? (await this._authenticationService.getGitHubSession('any', { silent: true }))?.accessToken;
70+
}
71+
72+
public canIngestPathAndSize(filePath: string, size: number): boolean {
73+
return canIngestPathAndSize(this._ingestFilter, filePath, size);
74+
}
75+
76+
public canIngestDocument(filePath: string, data: Uint8Array): boolean {
77+
return canIngestDocument(this._ingestFilter, filePath, new DocumentContents(data));
78+
}
79+
3580
private getHeaders(authToken: string): Record<string, string> {
3681
const headers: Record<string, string> = {
3782
'Content-Type': 'application/json',
@@ -51,56 +96,36 @@ export class ExternalIngestClient {
5196
return ExternalIngestClient.apiClient.makeRequest(url, this.getHeaders(authToken), 'POST', body, token);
5297
}
5398

54-
async doInitialIndex(authToken: string, filesetName: string, root: URI, allFiles: AsyncIterable<{ readonly uri: URI; readonly docSha: Uint8Array }>, token: CancellationToken): Promise<void> {
99+
async doInitialIndex(filesetName: string, root: URI, allFiles: AsyncIterable<ExternalIngestFile>, token: CancellationToken): Promise<void> {
100+
const authToken = await this.getAuthToken();
101+
if (!authToken) {
102+
this.logService.warn('ExternalIngestClient::doInitialIndex(): No auth token available');
103+
return;
104+
}
105+
55106
setupPanicHooks();
56107

57108
// Initial setup
58-
const ingestFilter = new IngestFilter();
59109
const mappings = new Map<string, { full: string; relative: string }>();
60110
const geoFilter = new GeoFilter();
61111

62112

63113
this.logService.info(`ExternalIngestClient::doInitialIndex(). Creating ingest for fileset: ${filesetName}`);
64-
const allDocShas: Uint8Array[] = [];
65114

66115
this.logService.trace(`ExternalIngestClient::doInitialIndex(). Checking for ingestable files...`);
67116
const ingestableCheckStart = performance.now();
68-
// Figure out which documents are uploadable and insert them into the geoFilter
69-
// and DocSha to path map and DocSha array.
70-
const checking = new Set<Promise<void>>();
117+
118+
const allDocShas: Uint8Array[] = [];
71119
for await (const file of allFiles) {
72120
const relativePath = posix.relative(root.path, file.uri.path);
73121
const full = file.uri.fsPath;
74122

75-
const p = (async () => {
76-
this.logService.debug(`ExternalIngestClient::doInitialIndex(). Checking if file can be ingested: ${relativePath}`);
77-
const fileBytes = await fs.promises.readFile(full);
78-
const content = new DocumentContents(fileBytes);
79-
if (canIngestDocument(ingestFilter, relativePath, content)) { // Can we do this lazily?
80-
try {
81-
const docSha = file.docSha; //getDocSha(relativePath, content);
82-
geoFilter.push(docSha);
83-
allDocShas.push(docSha);
84-
// Clients of the external ingest process are required to store a mapping of docSha to
85-
// document path. In this example ingestion code it is handled in memory but you might want
86-
// to persist somewhere. Note that our example converts the Uin8Arrays to base64 strings
87-
// since Uint8Array doesn't work as a Map key because equality is checked by reference.
88-
const docShaBase64 = Buffer.from(docSha).toString('base64');
89-
mappings.set(docShaBase64, { full, relative: relativePath });
90-
} catch (err) {
91-
throw new Error('Exception during ingest file', err);
92-
}
93-
}
94-
})();
95-
p.finally(() => {
96-
checking.delete(p);
97-
});
98-
checking.add(p);
99-
if (checking.size >= ExternalIngestClient.PROMISE_POOL_SIZE) {
100-
await Promise.race(checking);
101-
}
123+
geoFilter.push(file.docSha);
124+
allDocShas.push(file.docSha);
125+
126+
const docShaBase64 = Buffer.from(file.docSha).toString('base64');
127+
mappings.set(docShaBase64, { full, relative: relativePath });
102128
}
103-
await Promise.all(checking);
104129

105130
this.logService.debug(`ExternalIngestClient::doInitialIndex(). Found ${mappings.size} ingestable files in ${Math.round(performance.now() - ingestableCheckStart)}ms`,);
106131

@@ -280,7 +305,13 @@ export class ExternalIngestClient {
280305
this.logService.debug(`requestId: '${requestId}', body: ${body}`);
281306
}
282307

283-
async listFilesets(authToken: string, token: CancellationToken): Promise<string[]> {
308+
async listFilesets(token: CancellationToken): Promise<string[]> {
309+
const authToken = await this.getAuthToken();
310+
if (!authToken) {
311+
this.logService.warn('ExternalIngestClient::listFilesets(): No auth token available');
312+
return [];
313+
}
314+
284315
const resp = await ExternalIngestClient.apiClient.makeRequest(
285316
`${ExternalIngestClient.baseUrl}/external/code/ingest`,
286317
this.getHeaders(authToken),
@@ -293,7 +324,13 @@ export class ExternalIngestClient {
293324
return coalesce((body.filesets ?? []).map(x => x.name));
294325
}
295326

296-
async deleteFileset(authToken: string, filesetName: string, token: CancellationToken): Promise<void> {
327+
async deleteFileset(filesetName: string, token: CancellationToken): Promise<void> {
328+
const authToken = await this.getAuthToken();
329+
if (!authToken) {
330+
this.logService.warn('ExternalIngestClient::deleteFileset(): No auth token available');
331+
return;
332+
}
333+
297334
return this.deleteFilesetByName(authToken, filesetName, token);
298335
}
299336

@@ -313,7 +350,13 @@ export class ExternalIngestClient {
313350
this.logService.info(`ExternalIngestClient::deleteFilesetByName(): Deleted: ${fileSetName}`);
314351
}
315352

316-
async searchFilesets(authToken: string, filesetName: string, rootUri: URI, prompt: string, limit: number, token: CancellationToken): Promise<CodeSearchResult> {
353+
async searchFilesets(filesetName: string, rootUri: URI, prompt: string, limit: number, token: CancellationToken): Promise<CodeSearchResult> {
354+
const authToken = await this.getAuthToken();
355+
if (!authToken) {
356+
this.logService.warn('ExternalIngestClient::searchFilesets(): No auth token available');
357+
return { outOfSync: false, chunks: [] };
358+
}
359+
317360
this.logService.debug(`ExternalIngestClient::searchFilesets(): Searching fileset '${filesetName}' for prompt: '${prompt}'`);
318361
const embeddingType = EmbeddingType.metis_1024_I16_Binary;
319362
const resp = await this.post(authToken, '/external/embeddings/code/search', {

0 commit comments

Comments
 (0)