77using Elastic . Documentation . Diagnostics ;
88using Elastic . Documentation . Search ;
99using Elastic . Ingest . Elasticsearch ;
10+ using Elastic . Ingest . Elasticsearch . Indices ;
1011using Elastic . Markdown . Helpers ;
1112using Elastic . Markdown . IO ;
1213using Elastic . Transport ;
1314using Elastic . Transport . Products . Elasticsearch ;
1415using Markdig . Syntax ;
1516using Microsoft . Extensions . Logging ;
17+ using NetEscapades . EnumGenerators ;
1618
1719namespace Elastic . Markdown . Exporters . Elasticsearch ;
1820
/// <summary>
/// Selects how exported documents are synced from the lexical index to the semantic index.
/// </summary>
[EnumExtensions]
public enum IngestStrategy
{
    /// <summary>
    /// Documents are written to the lexical channel only during export; the semantic index
    /// is brought up to date afterwards by reindexing from the lexical index.
    /// </summary>
    Reindex,

    /// <summary>
    /// Every exported document is written to both the lexical and the semantic channel.
    /// </summary>
    Multiplex
}
23+
1924public class ElasticsearchMarkdownExporter : IMarkdownExporter , IDisposable
2025{
2126 private readonly IDiagnosticsCollector _collector ;
@@ -27,6 +32,7 @@ public class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposable
2732
2833 private readonly DateTimeOffset _batchIndexDate = DateTimeOffset . UtcNow ;
2934 private readonly DistributedTransport _transport ;
35+ private IngestStrategy _indexStrategy ;
3036
3137 public ElasticsearchMarkdownExporter (
3238 ILoggerFactory logFactory ,
@@ -38,6 +44,7 @@ string indexNamespace
3844 _collector = collector ;
3945 _logger = logFactory . CreateLogger < ElasticsearchMarkdownExporter > ( ) ;
4046 _endpoint = endpoints . Elasticsearch ;
47+ _indexStrategy = IngestStrategy . Reindex ;
4148
4249 var es = endpoints . Elasticsearch ;
4350
@@ -66,14 +73,41 @@ string indexNamespace
6673
6774 _transport = new DistributedTransport ( configuration ) ;
6875
69- _lexicalChannel = new ElasticsearchLexicalExporter ( logFactory , collector , es , indexNamespace , _transport , _batchIndexDate ) ;
76+ _lexicalChannel = new ElasticsearchLexicalExporter ( logFactory , collector , es , indexNamespace , _transport ) ;
7077 _semanticChannel = new ElasticsearchSemanticExporter ( logFactory , collector , es , indexNamespace , _transport ) ;
71-
7278 }
7379
/// <inheritdoc />
/// <remarks>
/// Bootstraps the lexical channel and then probes the semantic "latest" alias.
/// When the probe does not succeed, the semantic index is bootstrapped, created and aliased
/// here, and the exporter switches to <see cref="IngestStrategy.Multiplex"/> unless
/// <c>ForceReindex</c> is set on the endpoint.
/// </remarks>
public async ValueTask StartAsync(Cancel ctx = default)
{
    _ = await _lexicalChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx);

    var targetIndex = _semanticChannel.Channel.IndexName;
    var latestAlias = string.Format(_semanticChannel.Channel.Options.IndexFormat, "latest");

    // A HEAD request on the alias tells us whether a semantic index is already in place
    var aliasProbe = await _transport.HeadAsync(latestAlias, ctx);
    var semanticIndexMissing = !aliasProbe.ApiCallDetails.HasSuccessfulStatusCode;

    if (semanticIndexMissing)
    {
        _logger.LogInformation("No semantic index exists yet, creating index {Index} for semantic search", targetIndex);
        _ = await _semanticChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx);

        var createResponse = await _transport.PutAsync<StringResponse>(targetIndex, PostData.String("{}"), ctx);
        var created = createResponse.ApiCallDetails.HasSuccessfulStatusCode;
        if (!created)
            throw new Exception($"Failed to create index {targetIndex}: {createResponse}");

        _ = await _semanticChannel.Channel.ApplyAliasesAsync(ctx);

        // The strategy is only changed when the caller did not explicitly force a reindex
        var forceReindex = _endpoint.ForceReindex;
        if (!forceReindex)
        {
            _indexStrategy = IngestStrategy.Multiplex;
            _logger.LogInformation("Index strategy set to multiplex because {SemanticIndex} does not exist, pass --force-reindex to always use reindex", targetIndex);
        }
    }

    var strategyName = _indexStrategy.ToStringFast(true);
    _logger.LogInformation("Using {IndexStrategy} to sync lexical index to semantic index", strategyName);
}
104+
/// <summary>
/// Runs a <c>_count</c> request against the lexical channel's active search alias.
/// </summary>
/// <param name="body">JSON request body that is posted as-is to the <c>_count</c> endpoint.</param>
/// <param name="ctx">Token forwarded to the transport call.</param>
/// <returns>The value of the <c>count</c> property read from the response body.</returns>
public async ValueTask<long> CountAsync(string body, Cancel ctx = default)
{
    var lexicalSearchAlias = _lexicalChannel.Channel.Options.ActiveSearchAlias;
    var countResponse = await _transport.PostAsync<DynamicResponse>($"/{lexicalSearchAlias}/_count", PostData.String(body), ctx);

    // Surface a failed request instead of silently folding it into the returned number.
    // This only logs: the callers visible in this file use the counts for reporting,
    // so a failed count is deliberately not turned into an exception here.
    if (!countResponse.ApiCallDetails.HasSuccessfulStatusCode)
        _logger.LogWarning("Count request against {LexicalSearchAlias} did not succeed: {Response}", lexicalSearchAlias, countResponse);

    return countResponse.Body.Get<long>("count");
}
77111
78112 /// <inheritdoc />
79113 public async ValueTask StopAsync ( Cancel ctx = default )
@@ -82,18 +116,38 @@ public async ValueTask StopAsync(Cancel ctx = default)
82116 var lexicalWriteAlias = string . Format ( _lexicalChannel . Channel . Options . IndexFormat , "latest" ) ;
83117
84118 var semanticIndex = _semanticChannel . Channel . IndexName ;
85- var semanticIndexHead = await _transport . HeadAsync ( semanticWriteAlias , ctx ) ;
86119
87- if ( _endpoint . NoSemantic )
120+ var stopped = await _lexicalChannel . StopAsync ( ctx ) ;
121+ if ( ! stopped )
122+ throw new Exception ( $ "Failed to stop { _lexicalChannel . GetType ( ) . Name } ") ;
123+
124+ var updated = await CountAsync ( $$ """ { "query": { "range": { "last_updated": { "gte": "{{ _batchIndexDate : o}} " } } } }""" , ctx ) ;
125+ var total = await CountAsync ( $$ """ { "query": { "range": { "batch_index_date": { "gte": "{{ _batchIndexDate : o}} " } } } }""" , ctx ) ;
126+ var deleted = await CountAsync ( $$ """ { "query": { "range": { "batch_index_date": { "lt": "{{ _batchIndexDate : o}} " } } } }""" , ctx ) ;
127+
128+ // TODO emit these as metrics
129+ _logger . LogInformation ( "Exported {Total}, Updated {Updated}, Deleted, {Deleted} documents to {LexicalIndex}" , total , updated , deleted , lexicalWriteAlias ) ;
130+ _logger . LogInformation ( "Syncing to semantic index using {IndexStrategy} strategy" , _indexStrategy . ToStringFast ( true ) ) ;
131+
132+ if ( _indexStrategy == IngestStrategy . Multiplex )
88133 {
89- _logger . LogInformation ( "--no-semantic was specified so exiting early before syncing to {Index}" , semanticIndex ) ;
134+ if ( ! _endpoint . NoSemantic )
135+ _ = await _semanticChannel . StopAsync ( ctx ) ;
136+ else
137+ _logger . LogInformation ( "--no-semantic was specified when doing multiplex writes, not rolling over {SemanticIndex}" , semanticIndex ) ;
138+
139+ // cleanup lexical index of old data
140+ await DoDeleteByQuery ( lexicalWriteAlias , ctx ) ;
90141 return ;
91142 }
92143
93- var stopped = await _lexicalChannel . StopAsync ( ctx ) ;
94- if ( ! stopped )
95- throw new Exception ( $ "Failed to stop { _lexicalChannel . GetType ( ) . Name } ") ;
144+ if ( _endpoint . NoSemantic )
145+ {
146+ _logger . LogInformation ( "--no-semantic was specified so exiting early before reindexing to {Index}" , semanticIndex ) ;
147+ return ;
148+ }
96149
150+ var semanticIndexHead = await _transport . HeadAsync ( semanticWriteAlias , ctx ) ;
97151 if ( ! semanticIndexHead . ApiCallDetails . HasSuccessfulStatusCode )
98152 {
99153 _logger . LogInformation ( "No semantic index exists yet, creating index {Index} for semantic search" , semanticIndex ) ;
@@ -148,6 +202,8 @@ public async ValueTask StopAsync(Cancel ctx = default)
148202 await DoReindex ( request , lexicalWriteAlias , semanticWriteAlias , "deletions" , ctx ) ;
149203
150204 await DoDeleteByQuery ( lexicalWriteAlias , ctx ) ;
205+
206+ _logger . LogInformation ( "Finish sync to semantic index using {IndexStrategy} strategy" , _indexStrategy . ToStringFast ( true ) ) ;
151207 }
152208
153209 private async ValueTask DoDeleteByQuery ( string lexicalWriteAlias , Cancel ctx )
@@ -275,6 +331,18 @@ public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext,
275331 } ) . Reverse ( ) . ToArray ( ) ,
276332 Headings = headings
277333 } ;
334+
335+ var semanticHash = _semanticChannel . Channel . Options . ChannelHash ;
336+ var lexicalHash = _lexicalChannel . Channel . Options . ChannelHash ;
337+ var hash = HashedBulkUpdate . CreateHash ( semanticHash , lexicalHash ,
338+ doc . Url , doc . Body ?? string . Empty , string . Join ( "," , doc . Headings . OrderBy ( h => h ) )
339+ ) ;
340+ doc . Hash = hash ;
341+ doc . LastUpdated = _batchIndexDate ;
342+ doc . BatchIndexDate = _batchIndexDate ;
343+
344+ if ( _indexStrategy == IngestStrategy . Multiplex )
345+ return await _lexicalChannel . TryWrite ( doc , ctx ) && await _semanticChannel . TryWrite ( doc , ctx ) ;
278346 return await _lexicalChannel . TryWrite ( doc , ctx ) ;
279347 }
280348
0 commit comments