33 * Licensed under the MIT License. See License.txt in the project root for license information.
44 *--------------------------------------------------------------------------------------------*/
55
6- import { DocumentContents , GeoFilter , IngestFilter , canIngestDocument , createCodedSymbols , setupPanicHooks } from '@github/blackbird-external-ingest-utils' ;
6+ import { canIngestDocument , canIngestPathAndSize , createCodedSymbols , DocumentContents , GeoFilter , IngestFilter , setupPanicHooks } from '@github/blackbird-external-ingest-utils' ;
77import crypto from 'crypto' ;
88import fs from 'fs' ;
99import { posix } from 'node:path' ;
@@ -12,26 +12,71 @@ import { coalesce } from '../../../../util/vs/base/common/arrays';
1212import { timeout } from '../../../../util/vs/base/common/async' ;
1313import { URI } from '../../../../util/vs/base/common/uri' ;
1414import { Range } from '../../../../util/vs/editor/common/core/range' ;
15+ import { IAuthenticationService } from '../../../authentication/common/authentication' ;
1516import { FileChunkAndScore } from '../../../chunking/common/chunk' ;
1617import { EmbeddingType } from '../../../embeddings/common/embeddingsComputer' ;
1718import { ILogService } from '../../../log/common/logService' ;
1819import { CodeSearchResult } from '../../../remoteCodeSearch/common/remoteCodeSearch' ;
1920import { ApiClient } from './externalIngestApi' ;
2021
/**
 * Describes one file that is handed to the external ingest client.
 */
export interface ExternalIngestFile {
	/** Location of the file. */
	readonly uri: URI;

	/** Document SHA identifying the file, as raw bytes. */
	readonly docSha: Uint8Array;

	/**
	 * Asynchronously produces a byte array for this file.
	 * NOTE(review): presumably the file's contents — confirm against the implementers.
	 */
	read(): Promise<Uint8Array>;
}
28+
/**
 * Contract for the external ingest client, which indexes files into named
 * filesets and searches them.
 */
export interface IExternalIngestClient {
	/**
	 * Runs the initial index of `allFiles` into the fileset called `filesetName`.
	 *
	 * @param filesetName Name of the fileset to ingest into.
	 * @param root Root location; the implementation in this file derives each file's relative path from it.
	 * @param allFiles Stream of the files to index.
	 * @param token Cancellation token for the operation.
	 */
	doInitialIndex(filesetName: string, root: URI, allFiles: AsyncIterable<ExternalIngestFile>, token: CancellationToken): Promise<void>;

	/**
	 * Lists filesets.
	 *
	 * @param token Cancellation token for the operation.
	 * @returns The fileset names.
	 */
	listFilesets(token: CancellationToken): Promise<string[]>;

	/**
	 * Deletes the fileset called `filesetName`.
	 *
	 * @param filesetName Name of the fileset to delete.
	 * @param token Cancellation token for the operation.
	 */
	deleteFileset(filesetName: string, token: CancellationToken): Promise<void>;

	/**
	 * Searches the fileset called `filesetName` for `prompt`.
	 *
	 * @param filesetName Name of the fileset to search.
	 * @param rootUri Root location associated with the fileset.
	 * @param prompt Query text.
	 * @param limit Limit passed along with the search. NOTE(review): presumably the maximum number of results — confirm.
	 * @param token Cancellation token for the operation.
	 */
	searchFilesets(filesetName: string, rootUri: URI, prompt: string, limit: number, token: CancellationToken): Promise<CodeSearchResult>;

	/**
	 * Fast pre-check: decides from the path and size alone whether a file can be ingested.
	 */
	canIngestPathAndSize(filePath: string, size: number): boolean;

	/**
	 * Full check: decides from the path and the file's bytes whether a file can be ingested.
	 */
	canIngestDocument(filePath: string, data: Uint8Array): boolean;
}
2150
2251// Create a shared API client with throttling (target quota usage of 80)
2352// You can change this to `null` to ignore the throttle
2453
25- export class ExternalIngestClient {
54+ export class ExternalIngestClient implements IExternalIngestClient {
2655 private static apiClient = new ApiClient ( 80 ) ;
2756
2857 private static readonly PROMISE_POOL_SIZE = 32 ;
2958 private static baseUrl = 'https://api.github.com' ;
3059
60+ private readonly _ingestFilter = new IngestFilter ( ) ;
61+
/**
 * @param _authenticationService Source of the GitHub sessions from which access tokens are read.
 * @param logService Sink for this client's log messages.
 */
constructor(
	@IAuthenticationService private readonly _authenticationService: IAuthenticationService,
	@ILogService private readonly logService: ILogService,
) { }
3466
/**
 * Looks up a GitHub access token without prompting the user (`silent: true`).
 *
 * The 'permissive' session is tried first; the 'any' session is consulted only
 * when the first lookup yields no session or a nullish access token.
 *
 * @returns The access token, or `undefined` when neither lookup provides one.
 */
public async getAuthToken(): Promise<string | undefined> {
	const permissiveSession = await this._authenticationService.getGitHubSession('permissive', { silent: true });
	const permissiveToken = permissiveSession?.accessToken;
	if (permissiveToken !== undefined && permissiveToken !== null) {
		return permissiveToken;
	}

	const fallbackSession = await this._authenticationService.getGitHubSession('any', { silent: true });
	return fallbackSession?.accessToken;
}
71+
/**
 * Asks the ingest utilities whether a file with this path and size can be
 * ingested, using this client's ingest filter.
 *
 * @param filePath Path of the candidate file.
 * @param size Size of the candidate file.
 * @returns The verdict of the imported `canIngestPathAndSize` helper.
 */
public canIngestPathAndSize(filePath: string, size: number): boolean {
	const filter = this._ingestFilter;
	const verdict = canIngestPathAndSize(filter, filePath, size);
	return verdict;
}
75+
/**
 * Asks the ingest utilities whether a file with this path and these bytes can
 * be ingested, using this client's ingest filter.
 *
 * @param filePath Path of the candidate file.
 * @param data Bytes of the candidate file; wrapped in a `DocumentContents` for the check.
 * @returns The verdict of the imported `canIngestDocument` helper.
 */
public canIngestDocument(filePath: string, data: Uint8Array): boolean {
	const contents = new DocumentContents(data);
	return canIngestDocument(this._ingestFilter, filePath, contents);
}
79+
3580 private getHeaders ( authToken : string ) : Record < string , string > {
3681 const headers : Record < string , string > = {
3782 'Content-Type' : 'application/json' ,
@@ -51,56 +96,36 @@ export class ExternalIngestClient {
5196 return ExternalIngestClient . apiClient . makeRequest ( url , this . getHeaders ( authToken ) , 'POST' , body , token ) ;
5297 }
5398
54- async doInitialIndex ( authToken : string , filesetName : string , root : URI , allFiles : AsyncIterable < { readonly uri : URI ; readonly docSha : Uint8Array } > , token : CancellationToken ) : Promise < void > {
99+ async doInitialIndex ( filesetName : string , root : URI , allFiles : AsyncIterable < ExternalIngestFile > , token : CancellationToken ) : Promise < void > {
100+ const authToken = await this . getAuthToken ( ) ;
101+ if ( ! authToken ) {
102+ this . logService . warn ( 'ExternalIngestClient::doInitialIndex(): No auth token available' ) ;
103+ return ;
104+ }
105+
55106 setupPanicHooks ( ) ;
56107
57108 // Initial setup
58- const ingestFilter = new IngestFilter ( ) ;
59109 const mappings = new Map < string , { full : string ; relative : string } > ( ) ;
60110 const geoFilter = new GeoFilter ( ) ;
61111
62112
63113 this . logService . info ( `ExternalIngestClient::doInitialIndex(). Creating ingest for fileset: ${ filesetName } ` ) ;
64- const allDocShas : Uint8Array [ ] = [ ] ;
65114
66115 this . logService . trace ( `ExternalIngestClient::doInitialIndex(). Checking for ingestable files...` ) ;
67116 const ingestableCheckStart = performance . now ( ) ;
68- // Figure out which documents are uploadable and insert them into the geoFilter
69- // and DocSha to path map and DocSha array.
70- const checking = new Set < Promise < void > > ( ) ;
117+
118+ const allDocShas : Uint8Array [ ] = [ ] ;
71119 for await ( const file of allFiles ) {
72120 const relativePath = posix . relative ( root . path , file . uri . path ) ;
73121 const full = file . uri . fsPath ;
74122
75- const p = ( async ( ) => {
76- this . logService . debug ( `ExternalIngestClient::doInitialIndex(). Checking if file can be ingested: ${ relativePath } ` ) ;
77- const fileBytes = await fs . promises . readFile ( full ) ;
78- const content = new DocumentContents ( fileBytes ) ;
79- if ( canIngestDocument ( ingestFilter , relativePath , content ) ) { // Can we do this lazily?
80- try {
81- const docSha = file . docSha ; //getDocSha(relativePath, content);
82- geoFilter . push ( docSha ) ;
83- allDocShas . push ( docSha ) ;
84- // Clients of the external ingest process are required to store a mapping of docSha to
85- // document path. In this example ingestion code it is handled in memory but you might want
86- // to persist somewhere. Note that our example converts the Uin8Arrays to base64 strings
87- // since Uint8Array doesn't work as a Map key because equality is checked by reference.
88- const docShaBase64 = Buffer . from ( docSha ) . toString ( 'base64' ) ;
89- mappings . set ( docShaBase64 , { full, relative : relativePath } ) ;
90- } catch ( err ) {
91- throw new Error ( 'Exception during ingest file' , err ) ;
92- }
93- }
94- } ) ( ) ;
95- p . finally ( ( ) => {
96- checking . delete ( p ) ;
97- } ) ;
98- checking . add ( p ) ;
99- if ( checking . size >= ExternalIngestClient . PROMISE_POOL_SIZE ) {
100- await Promise . race ( checking ) ;
101- }
123+ geoFilter . push ( file . docSha ) ;
124+ allDocShas . push ( file . docSha ) ;
125+
126+ const docShaBase64 = Buffer . from ( file . docSha ) . toString ( 'base64' ) ;
127+ mappings . set ( docShaBase64 , { full, relative : relativePath } ) ;
102128 }
103- await Promise . all ( checking ) ;
104129
105130 this . logService . debug ( `ExternalIngestClient::doInitialIndex(). Found ${ mappings . size } ingestable files in ${ Math . round ( performance . now ( ) - ingestableCheckStart ) } ms` , ) ;
106131
@@ -280,7 +305,13 @@ export class ExternalIngestClient {
280305 this . logService . debug ( `requestId: '${ requestId } ', body: ${ body } ` ) ;
281306 }
282307
283- async listFilesets ( authToken : string , token : CancellationToken ) : Promise < string [ ] > {
308+ async listFilesets ( token : CancellationToken ) : Promise < string [ ] > {
309+ const authToken = await this . getAuthToken ( ) ;
310+ if ( ! authToken ) {
311+ this . logService . warn ( 'ExternalIngestClient::listFilesets(): No auth token available' ) ;
312+ return [ ] ;
313+ }
314+
284315 const resp = await ExternalIngestClient . apiClient . makeRequest (
285316 `${ ExternalIngestClient . baseUrl } /external/code/ingest` ,
286317 this . getHeaders ( authToken ) ,
@@ -293,7 +324,13 @@ export class ExternalIngestClient {
293324 return coalesce ( ( body . filesets ?? [ ] ) . map ( x => x . name ) ) ;
294325 }
295326
296- async deleteFileset ( authToken : string , filesetName : string , token : CancellationToken ) : Promise < void > {
/**
 * Deletes the fileset called `filesetName` by delegating to `deleteFilesetByName`.
 *
 * When no auth token can be obtained, a warning is logged and nothing is deleted.
 *
 * @param filesetName Name of the fileset to delete.
 * @param token Cancellation token forwarded to the delete call.
 */
async deleteFileset(filesetName: string, token: CancellationToken): Promise<void> {
	const authToken = await this.getAuthToken();
	if (authToken) {
		return this.deleteFilesetByName(authToken, filesetName, token);
	} else {
		// Without credentials the call cannot be made; warn and resolve normally.
		this.logService.warn('ExternalIngestClient::deleteFileset(): No auth token available');
		return;
	}
}
299336
@@ -313,7 +350,13 @@ export class ExternalIngestClient {
313350 this . logService . info ( `ExternalIngestClient::deleteFilesetByName(): Deleted: ${ fileSetName } ` ) ;
314351 }
315352
316- async searchFilesets ( authToken : string , filesetName : string , rootUri : URI , prompt : string , limit : number , token : CancellationToken ) : Promise < CodeSearchResult > {
353+ async searchFilesets ( filesetName : string , rootUri : URI , prompt : string , limit : number , token : CancellationToken ) : Promise < CodeSearchResult > {
354+ const authToken = await this . getAuthToken ( ) ;
355+ if ( ! authToken ) {
356+ this . logService . warn ( 'ExternalIngestClient::searchFilesets(): No auth token available' ) ;
357+ return { outOfSync : false , chunks : [ ] } ;
358+ }
359+
317360 this . logService . debug ( `ExternalIngestClient::searchFilesets(): Searching fileset '${ filesetName } ' for prompt: '${ prompt } '` ) ;
318361 const embeddingType = EmbeddingType . metis_1024_I16_Binary ;
319362 const resp = await this . post ( authToken , '/external/embeddings/code/search' , {
0 commit comments