Skip to content

Commit 8731657

Browse files
authored
Add back semantic Elasticsearch indexing through semantic exporter (#1697)
1 parent df3621c commit 8731657

File tree

4 files changed

+104
-40
lines changed

4 files changed

+104
-40
lines changed

src/Elastic.Documentation/Exporter.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@ namespace Elastic.Documentation;
88

99
public enum Exporter
1010
{
11-
Html = 0,
12-
LLMText = 1,
13-
Elasticsearch = 2,
14-
Configuration = 3,
15-
DocumentationState = 4,
16-
LinkMetadata = 5,
17-
Redirects = 6
11+
Html,
12+
LLMText,
13+
Elasticsearch,
14+
SemanticElasticsearch,
15+
Configuration,
16+
DocumentationState,
17+
LinkMetadata,
18+
Redirects,
1819
}
1920
public static class ExportOptions
2021
{

src/Elastic.Documentation/Search/DocumentationDocument.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,7 @@ public record DocumentationDocument
3030

3131
[JsonPropertyName("body")]
3232
public string? Body { get; set; }
33+
34+
[JsonPropertyName("abstract")]
35+
public string? Abstract { get; set; }
3336
}

src/tooling/Elastic.Documentation.Tooling/Arguments/ExportOption.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ public static bool TryParse(ReadOnlySpan<char> s, out IReadOnlySet<Exporter> res
2727
"llmtext" => LLMText,
2828
"es" => Elasticsearch,
2929
"elasticsearch" => Elasticsearch,
30+
"semantic" => SemanticElasticsearch,
3031
"html" => Html,
3132
"config" => Exporter.Configuration,
3233
"links" => LinkMetadata,
@@ -62,15 +63,15 @@ public static IReadOnlyCollection<IMarkdownExporter> CreateMarkdownExporters(
6263
IDocumentationConfigurationContext context
6364
)
6465
{
65-
var esExporter = new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints);
66-
6766
var markdownExporters = new List<IMarkdownExporter>(3);
6867
if (exportOptions.Contains(LLMText))
6968
markdownExporters.Add(new LlmMarkdownExporter());
7069
if (exportOptions.Contains(Exporter.Configuration))
7170
markdownExporters.Add(new ConfigurationExporter(logFactory, context.ConfigurationFileProvider, context));
7271
if (exportOptions.Contains(Elasticsearch))
73-
markdownExporters.Add(esExporter);
72+
markdownExporters.Add(new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints));
73+
if (exportOptions.Contains(SemanticElasticsearch))
74+
markdownExporters.Add(new ElasticsearchMarkdownSemanticExporter(logFactory, context.Collector, context.Endpoints));
7475
return markdownExporters;
7576
}
7677
}

src/tooling/Elastic.Documentation.Tooling/Exporters/ElasticsearchMarkdownExporter.cs

Lines changed: 89 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information
44

55
using System.IO.Abstractions;
6+
using Elastic.Channels;
67
using Elastic.Documentation.Configuration;
78
using Elastic.Documentation.Diagnostics;
89
using Elastic.Documentation.Search;
@@ -18,10 +19,82 @@
1819
namespace Elastic.Documentation.Tooling.Exporters;
1920

2021
/// <summary>
/// Markdown exporter that indexes <see cref="DocumentationDocument"/> instances through a
/// <c>CatalogIndexChannel</c>. The index mapping is produced by <c>CreateMapping(null)</c>,
/// i.e. without an inference id.
/// </summary>
public class ElasticsearchMarkdownExporter(ILoggerFactory logFactory, IDiagnosticsCollector collector, DocumentationEndpoints endpoints)
    : ElasticsearchMarkdownExporterBase<CatalogIndexChannelOptions<DocumentationDocument>, CatalogIndexChannel<DocumentationDocument>>
        (logFactory, collector, endpoints)
{
    /// <summary>
    /// Creates the channel options for the catalog index: mapping, index name format and search alias.
    /// </summary>
    /// <param name="transport">Transport handed to the options constructor.</param>
    /// <returns>Options targeting the <c>documentation</c> alias.</returns>
    protected override CatalogIndexChannelOptions<DocumentationDocument> NewOptions(DistributedTransport transport) => new(transport)
    {
        GetMapping = () => CreateMapping(null),
        // Keep the '-' between the prefix and the timestamp: this is the format the exporter
        // used before it was split into a base class, and it mirrors the
        // "semantic-documentation-{0:...}" format of the semantic exporter.
        IndexFormat = "documentation-{0:yyyy.MM.dd.HHmmss}",
        ActiveSearchAlias = "documentation"
    };

    /// <summary>
    /// Creates the catalog index channel from the supplied options.
    /// </summary>
    /// <param name="options">Fully configured channel options.</param>
    /// <returns>A new channel instance wrapping <paramref name="options"/>.</returns>
    protected override CatalogIndexChannel<DocumentationDocument> NewChannel(CatalogIndexChannelOptions<DocumentationDocument> options) => new(options);
}
36+
/// <summary>
/// Markdown exporter that indexes <see cref="DocumentationDocument"/> instances through a
/// <c>SemanticIndexChannel</c>. The index mapping is produced by <c>CreateMapping</c> using the
/// inference id passed to the <c>GetMapping</c> callback.
/// </summary>
public class ElasticsearchMarkdownSemanticExporter(ILoggerFactory logFactory, IDiagnosticsCollector collector, DocumentationEndpoints endpoints)
    : ElasticsearchMarkdownExporterBase<SemanticIndexChannelOptions<DocumentationDocument>, SemanticIndexChannel<DocumentationDocument>>
        (logFactory, collector, endpoints)
{
    /// <summary>
    /// Creates the channel options for the semantic index: mapping, index name format, search alias,
    /// indexing thread count and the inference creation timeout.
    /// </summary>
    /// <param name="transport">Transport handed to the options constructor.</param>
    /// <returns>Options targeting the <c>semantic-documentation</c> alias.</returns>
    protected override SemanticIndexChannelOptions<DocumentationDocument> NewOptions(DistributedTransport transport)
    {
        var semanticOptions = new SemanticIndexChannelOptions<DocumentationDocument>(transport)
        {
            // Only the inference id is used; the second callback argument is ignored.
            GetMapping = (id, _) => CreateMapping(id),
            IndexFormat = "semantic-documentation-{0:yyyy.MM.dd.HHmmss}",
            ActiveSearchAlias = "semantic-documentation",
            IndexNumThreads = IndexNumThreads,
            InferenceCreateTimeout = TimeSpan.FromMinutes(4),
        };
        return semanticOptions;
    }

    /// <summary>
    /// Creates the semantic index channel from the supplied options.
    /// </summary>
    /// <param name="options">Fully configured channel options.</param>
    /// <returns>A new channel instance wrapping <paramref name="options"/>.</returns>
    protected override SemanticIndexChannel<DocumentationDocument> NewChannel(SemanticIndexChannelOptions<DocumentationDocument> options)
    {
        return new SemanticIndexChannel<DocumentationDocument>(options);
    }
}
53+
54+
public abstract class ElasticsearchMarkdownExporterBase<TChannelOptions, TChannel>(
55+
ILoggerFactory logFactory,
56+
IDiagnosticsCollector collector,
57+
DocumentationEndpoints endpoints)
2158
: IMarkdownExporter, IDisposable
59+
where TChannelOptions : CatalogIndexChannelOptionsBase<DocumentationDocument>
60+
where TChannel : CatalogIndexChannel<DocumentationDocument, TChannelOptions>
2261
{
23-
private CatalogIndexChannel<DocumentationDocument>? _channel;
24-
private readonly ILogger<ElasticsearchMarkdownExporter> _logger = logFactory.CreateLogger<ElasticsearchMarkdownExporter>();
62+
private TChannel? _channel;
63+
private readonly ILogger<IMarkdownExporter> _logger = logFactory.CreateLogger<IMarkdownExporter>();
64+
65+
protected abstract TChannelOptions NewOptions(DistributedTransport transport);
66+
protected abstract TChannel NewChannel(TChannelOptions options);
67+
68+
protected int IndexNumThreads => 8;
69+
70+
/// <summary>
/// Builds the JSON index mapping with <c>title</c> and <c>body</c> as <c>text</c> fields plus an
/// <c>abstract</c> field.
/// </summary>
/// <param name="inferenceId">
/// Inference id for the <c>abstract</c> field. When it is <c>null</c>, empty or whitespace the
/// fragment from <c>AbstractMapping</c> is used; otherwise the one from <c>AbstractInferenceMapping</c>.
/// </param>
/// <returns>The mapping as a JSON string.</returns>
protected static string CreateMapping(string? inferenceId)
{
    // Pick the "abstract" fragment up front so the template below stays readable.
    var abstractField = string.IsNullOrWhiteSpace(inferenceId)
        ? AbstractMapping()
        : AbstractInferenceMapping(inferenceId);

    // language=json
    return $$"""
        {
        "properties": {
        "title": { "type": "text" },
        "body": { "type": "text" }
        {{abstractField}}
        }
        }
        """;
}
81+
82+
/// <summary>
/// Builds the mapping fragment that declares the <c>abstract</c> field as a <c>text</c> field.
/// </summary>
/// <returns>
/// A JSON fragment that starts with a comma, so it can be appended directly after a preceding
/// member of a <c>properties</c> object.
/// </returns>
private static string AbstractMapping() =>
    // language=json
    // No comma after the last member: a trailing comma is not valid JSON.
    """
    , "abstract": {
    "type": "text"
    }
    """;
89+
90+
/// <summary>
/// Builds the mapping fragment that declares the <c>abstract</c> field as a <c>semantic_text</c>
/// field bound to the given inference id.
/// </summary>
/// <param name="inferenceId">Value written verbatim into the fragment's <c>inference_id</c> member.</param>
/// <returns>
/// A JSON fragment that starts with a comma, so it can be appended directly after a preceding
/// member of a <c>properties</c> object.
/// </returns>
private static string AbstractInferenceMapping(string inferenceId)
{
    // language=json
    var fragment = $$"""
        , "abstract": {
        "type": "semantic_text",
        "inference_id": "{{inferenceId}}"
        }
        """;
    return fragment;
}
2598

2699
public async ValueTask StartAsync(Cancel ctx = default)
27100
{
@@ -41,36 +114,18 @@ public async ValueTask StartAsync(Cancel ctx = default)
41114
var transport = new DistributedTransport(configuration);
42115
//The max num threads per allocated node, from testing its best to limit our max concurrency
43116
//producing to this number as well
44-
var indexNumThreads = 8;
45-
var options = new CatalogIndexChannelOptions<DocumentationDocument>(transport)
117+
var options = NewOptions(transport);
118+
options.BufferOptions = new BufferOptions
46119
{
47-
BufferOptions =
48-
{
49-
OutboundBufferMaxSize = 100,
50-
ExportMaxConcurrency = indexNumThreads,
51-
ExportMaxRetries = 3
52-
},
53-
SerializerContext = SourceGenerationContext.Default,
54-
// IndexNumThreads = indexNumThreads,
55-
IndexFormat = "documentation-{0:yyyy.MM.dd.HHmmss}",
56-
ActiveSearchAlias = "documentation",
57-
ExportBufferCallback = () => _logger.LogInformation("Exported buffer to Elasticsearch"),
58-
ExportExceptionCallback = e => _logger.LogError(e, "Failed to export document"),
59-
ServerRejectionCallback = items => _logger.LogInformation("Server rejection: {Rejection}", items.First().Item2),
60-
//GetMapping = (inferenceId, _) => // language=json
61-
GetMapping = () => // language=json
62-
$$"""
63-
{
64-
"properties": {
65-
"title": { "type": "text" },
66-
"body": {
67-
"type": "text"
68-
}
69-
}
70-
}
71-
"""
120+
OutboundBufferMaxSize = 100,
121+
ExportMaxConcurrency = IndexNumThreads,
122+
ExportMaxRetries = 3
72123
};
73-
_channel = new CatalogIndexChannel<DocumentationDocument>(options);
124+
options.SerializerContext = SourceGenerationContext.Default;
125+
options.ExportBufferCallback = () => _logger.LogInformation("Exported buffer to Elasticsearch");
126+
options.ExportExceptionCallback = e => _logger.LogError(e, "Failed to export document");
127+
options.ServerRejectionCallback = items => _logger.LogInformation("Server rejection: {Rejection}", items.First().Item2);
128+
_channel = NewChannel(options);
74129
_logger.LogInformation($"Bootstrapping {nameof(SemanticIndexChannel<DocumentationDocument>)} Elasticsearch target for indexing");
75130
_ = await _channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx);
76131
}
@@ -103,6 +158,7 @@ public void Dispose()
103158
_channel.Complete();
104159
_channel.Dispose();
105160
}
161+
106162
GC.SuppressFinalize(this);
107163
}
108164

@@ -134,6 +190,9 @@ public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext,
134190
Url = url,
135191
Body = body,
136192
Description = fileContext.SourceFile.YamlFrontMatter?.Description,
193+
Abstract = !string.IsNullOrEmpty(body)
194+
? body[..Math.Min(body.Length, 400)]
195+
: string.Empty,
137196
Applies = fileContext.SourceFile.YamlFrontMatter?.AppliesTo,
138197
};
139198
return await TryWrite(doc, ctx);

0 commit comments

Comments
 (0)