1
1
// Copyright (c) Microsoft. All rights reserved.
2
2
3
+ using Azure . AI . OpenAI ;
4
+ using Google . Protobuf . WellKnownTypes ;
5
+ using Microsoft . Extensions . Options ;
6
+
3
7
namespace EmbedFunctions . Services ;
4
8
5
- internal sealed partial class AzureSearchEmbedService (
9
+ public sealed partial class AzureSearchEmbedService (
10
+ OpenAIClient openAIClient ,
11
+ string embeddingModelName ,
6
12
SearchClient indexSectionClient ,
13
+ string searchIndexName ,
7
14
SearchIndexClient searchIndexClient ,
8
15
DocumentAnalysisClient documentAnalysisClient ,
9
16
BlobContainerClient corpusContainerClient ,
10
- ILogger < AzureSearchEmbedService > logger ) : IEmbedService
17
+ ILogger < AzureSearchEmbedService > ? logger ) : IEmbedService
11
18
{
12
19
// Matches any single character OUTSIDE the set [0-9a-zA-Z_-].
// NOTE(review): presumably used to strip characters that are not valid in
// Azure AI Search document keys — confirm at the call sites (not visible here).
[ GeneratedRegex ( "[^0-9a-zA-Z_-]" ) ]
13
20
private static partial Regex MatchInSetRegex ( ) ;
@@ -16,9 +23,6 @@ public async Task<bool> EmbedBlobAsync(Stream blobStream, string blobName)
16
23
{
17
24
try
18
25
{
19
- var searchIndexName = Environment . GetEnvironmentVariable (
20
- "AZURE_SEARCH_INDEX" ) ?? "gptkbindex" ;
21
-
22
26
await EnsureSearchIndexAsync ( searchIndexName ) ;
23
27
24
28
var pageMap = await GetDocumentTextAsync ( blobStream , blobName ) ;
@@ -41,67 +45,94 @@ public async Task<bool> EmbedBlobAsync(Stream blobStream, string blobName)
41
45
}
42
46
catch ( Exception exception )
43
47
{
44
- logger . LogError (
48
+ logger ? . LogError (
45
49
exception , "Failed to embed blob '{BlobName}'" , blobName ) ;
46
50
47
51
return false ;
48
52
}
49
53
}
50
54
51
/// <summary>
/// Creates the Azure AI Search index named <paramref name="searchIndexName"/>
/// with the fields this service writes when indexing sections
/// (id, content, category, sourcepage, sourcefile, embedding),
/// an HNSW vector-search algorithm/profile pair for the "embedding" field,
/// and a "default" semantic configuration that prioritizes the "content" field.
/// </summary>
/// <param name="searchIndexName">Name of the index to create.</param>
/// <param name="embeddingDimensions">
/// Dimensionality of the "embedding" vector field. Defaults to 1536
/// (the output size of OpenAI's text-embedding-ada-002 model); pass a
/// different value if a different embedding model is configured.
/// </param>
public async Task CreateSearchIndexAsync(string searchIndexName, int embeddingDimensions = 1536)
{
    // These names are internal to the index definition; they only need to be
    // consistent between the algorithm configuration and the profile that
    // references it, and between the profile and the vector field below.
    const string vectorSearchConfigName = "my-vector-config";
    const string vectorSearchProfile = "my-vector-profile";

    var index = new SearchIndex(searchIndexName)
    {
        VectorSearch = new()
        {
            Algorithms =
            {
                new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName)
            },
            Profiles =
            {
                new VectorSearchProfile(vectorSearchProfile, vectorSearchConfigName)
            }
        },
        Fields =
        {
            new SimpleField("id", SearchFieldDataType.String) { IsKey = true },
            new SearchableField("content") { AnalyzerName = LexicalAnalyzerName.EnMicrosoft },
            new SimpleField("category", SearchFieldDataType.String) { IsFacetable = true },
            new SimpleField("sourcepage", SearchFieldDataType.String) { IsFacetable = true },
            new SimpleField("sourcefile", SearchFieldDataType.String) { IsFacetable = true },
            new SearchField("embedding", SearchFieldDataType.Collection(SearchFieldDataType.Single))
            {
                VectorSearchDimensions = embeddingDimensions,
                IsSearchable = true,
                VectorSearchProfile = vectorSearchProfile,
            }
        },
        SemanticSettings = new SemanticSettings
        {
            Configurations =
            {
                new SemanticConfiguration("default", new PrioritizedFields
                {
                    ContentFields =
                    {
                        new SemanticField
                        {
                            FieldName = "content"
                        }
                    }
                })
            }
        }
    };

    logger?.LogInformation(
        "Creating '{searchIndexName}' search index", searchIndexName);

    await searchIndexClient.CreateIndexAsync(index);
}
97
109
110
/// <summary>
/// Creates the search index only if no index with the given name already
/// exists. An existing index is left untouched; a warning is logged instead.
/// </summary>
/// <param name="searchIndexName">Name of the index to check for / create.</param>
public async Task EnsureSearchIndexAsync(string searchIndexName)
{
    // Enumerate the service's index names page by page, returning as soon
    // as a name matching the requested index is found.
    await foreach (var page in searchIndexClient.GetIndexNamesAsync().AsPages())
    {
        var alreadyExists = page.Values.Any(existing => existing == searchIndexName);
        if (alreadyExists)
        {
            logger?.LogWarning(
                "Search index '{SearchIndexName}' already exists", searchIndexName);
            return;
        }
    }

    await CreateSearchIndexAsync(searchIndexName);
}
125
+
98
126
private async Task < IReadOnlyList < PageDetail > > GetDocumentTextAsync ( Stream blobStream , string blobName )
99
127
{
100
- logger . LogInformation (
128
+ logger ? . LogInformation (
101
129
"Extracting text from '{Blob}' using Azure Form Recognizer" , blobName ) ;
102
130
131
+ using var ms = new MemoryStream ( ) ;
132
+ blobStream . CopyTo ( ms ) ;
133
+ ms . Position = 0 ;
103
134
AnalyzeDocumentOperation operation = documentAnalysisClient . AnalyzeDocument (
104
- WaitUntil . Started , "prebuilt-layout" , blobStream ) ;
135
+ WaitUntil . Started , "prebuilt-layout" , ms ) ;
105
136
106
137
var offset = 0 ;
107
138
List < PageDetail > pageMap = [ ] ;
@@ -208,7 +239,7 @@ private async Task UploadCorpusAsync(string corpusBlobName, string text)
208
239
return ;
209
240
}
210
241
211
- logger . LogInformation ( "Uploading corpus '{CorpusBlobName}'" , corpusBlobName ) ;
242
+ logger ? . LogInformation ( "Uploading corpus '{CorpusBlobName}'" , corpusBlobName ) ;
212
243
213
244
await using var stream = new MemoryStream ( Encoding . UTF8 . GetBytes ( text ) ) ;
214
245
await blobClient . UploadAsync ( stream , new BlobHttpHeaders
@@ -231,7 +262,7 @@ private IEnumerable<Section> CreateSections(
231
262
var start = 0 ;
232
263
var end = length ;
233
264
234
- logger . LogInformation ( "Splitting '{BlobName}' into sections" , blobName ) ;
265
+ logger ? . LogInformation ( "Splitting '{BlobName}' into sections" , blobName ) ;
235
266
236
267
while ( start + SectionOverlap < length )
237
268
{
@@ -300,9 +331,9 @@ private IEnumerable<Section> CreateSections(
300
331
// If the section ends with an unclosed table, we need to start the next section with the table.
301
332
// If table starts inside SentenceSearchLimit, we ignore it, as that will cause an infinite loop for tables longer than MaxSectionLength
302
333
// If last table starts inside SectionOverlap, keep overlapping
303
- if ( logger . IsEnabled ( LogLevel . Warning ) )
334
+ if ( logger ? . IsEnabled ( LogLevel . Warning ) is true )
304
335
{
305
- logger . LogWarning ( """
336
+ logger ? . LogWarning ( """
306
337
Section ends with unclosed table, starting next section with the
307
338
table at page {Offset} offset {Start} table start {LastTableStart}
308
339
""" ,
@@ -349,10 +380,10 @@ private static string BlobNameFromFilePage(string blobName, int page = 0) => Pat
349
380
350
381
private async Task IndexSectionsAsync ( string searchIndexName , IEnumerable < Section > sections , string blobName )
351
382
{
352
- var infoLoggingEnabled = logger . IsEnabled ( LogLevel . Information ) ;
353
- if ( infoLoggingEnabled )
383
+ var infoLoggingEnabled = logger ? . IsEnabled ( LogLevel . Information ) ;
384
+ if ( infoLoggingEnabled is true )
354
385
{
355
- logger . LogInformation ( """
386
+ logger ? . LogInformation ( """
356
387
Indexing sections from '{BlobName}' into search index '{SearchIndexName}'
357
388
""" ,
358
389
blobName ,
@@ -363,6 +394,8 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}'
363
394
var batch = new IndexDocumentsBatch < SearchDocument > ( ) ;
364
395
foreach ( var section in sections )
365
396
{
397
+ var embeddings = await openAIClient . GetEmbeddingsAsync ( embeddingModelName , new Azure . AI . OpenAI . EmbeddingsOptions ( section . Content . Replace ( '\r ' , ' ' ) ) ) ;
398
+ var embedding = embeddings . Value . Data . FirstOrDefault ( ) ? . Embedding . ToArray ( ) ?? [ ] ;
366
399
batch . Actions . Add ( new IndexDocumentsAction < SearchDocument > (
367
400
IndexActionType . MergeOrUpload ,
368
401
new SearchDocument
@@ -371,7 +404,8 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}'
371
404
[ "content" ] = section . Content ,
372
405
[ "category" ] = section . Category ,
373
406
[ "sourcepage" ] = section . SourcePage ,
374
- [ "sourcefile" ] = section . SourceFile
407
+ [ "sourcefile" ] = section . SourceFile ,
408
+ [ "embedding" ] = embedding ,
375
409
} ) ) ;
376
410
377
411
iteration ++ ;
@@ -380,9 +414,9 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}'
380
414
// Every one thousand documents, batch create.
381
415
IndexDocumentsResult result = await indexSectionClient . IndexDocumentsAsync ( batch ) ;
382
416
int succeeded = result . Results . Count ( r => r . Succeeded ) ;
383
- if ( infoLoggingEnabled )
417
+ if ( infoLoggingEnabled is true )
384
418
{
385
- logger . LogInformation ( """
419
+ logger ? . LogInformation ( """
386
420
Indexed {Count} sections, {Succeeded} succeeded
387
421
""" ,
388
422
batch . Actions . Count ,
@@ -399,9 +433,9 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}'
399
433
var index = new SearchIndex ( $ "index-{ batch . Actions . Count } ") ;
400
434
IndexDocumentsResult result = await indexSectionClient . IndexDocumentsAsync ( batch ) ;
401
435
int succeeded = result . Results . Count ( r => r . Succeeded ) ;
402
- if ( logger . IsEnabled ( LogLevel . Information ) )
436
+ if ( logger ? . IsEnabled ( LogLevel . Information ) is true )
403
437
{
404
- logger . LogInformation ( """
438
+ logger ? . LogInformation ( """
405
439
Indexed {Count} sections, {Succeeded} succeeded
406
440
""" ,
407
441
batch . Actions . Count ,
0 commit comments