Skip to content

Commit cb72d5e

Browse files
authored
Finish LLM text exporter (#1417)
* Reorganize Elastic.Markdown so that code is grouped by purpose, not type * LLMText export resolves includes and substitutions * Ensure the LLM exporter writes output files * Tweak output paths of Markdown files * Ensure we emit LLM data as a zip too * Blind Windows test fix attempt
1 parent d708737 commit cb72d5e

File tree

99 files changed

+659
-363
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+659
-363
lines changed

src/Elastic.Markdown/Slices/DescriptionGenerator.cs renamed to src/Elastic.Markdown/DescriptionGenerator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
using Markdig.Syntax;
88
using Markdig.Syntax.Inlines;
99

10-
namespace Elastic.Markdown.Slices;
10+
namespace Elastic.Markdown;
1111

1212
public interface IDescriptionGenerator
1313
{

src/Elastic.Markdown/DocumentationGenerator.cs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
using Elastic.Markdown.Exporters;
1616
using Elastic.Markdown.IO;
1717
using Elastic.Markdown.Links.CrossLinks;
18-
using Elastic.Markdown.Slices;
1918
using Markdig.Syntax;
2019
using Microsoft.Extensions.Logging;
2120

@@ -174,6 +173,7 @@ await Parallel.ForEachAsync(DocumentationSet.Files, ctx, async (file, token) =>
174173
_logger.LogInformation("-> Processed {ProcessedFiles}/{TotalFileCount} files", processedFiles, totalFileCount);
175174
});
176175
_logger.LogInformation("-> Processed {ProcessedFileCount}/{TotalFileCount} files", processedFileCount, totalFileCount);
176+
177177
}
178178

179179
private void HintUnusedSubstitutionKeys()
@@ -246,7 +246,14 @@ private async Task ProcessFile(HashSet<string> offendingFiles, DocumentationFile
246246
foreach (var exporter in _markdownExporters)
247247
{
248248
var document = context.MarkdownDocument ??= await markdown.ParseFullAsync(ctx);
249-
_ = await exporter.ExportAsync(new MarkdownExportContext { Document = document, File = markdown }, ctx);
249+
_ = await exporter.ExportAsync(new MarkdownExportFileContext
250+
{
251+
BuildContext = Context,
252+
Resolvers = DocumentationSet.MarkdownParser.Resolvers,
253+
Document = document,
254+
SourceFile = markdown,
255+
DefaultOutputFile = outputFile
256+
}, ctx);
250257
}
251258
}
252259
}

src/Elastic.Markdown/Exporters/DocumentationFileExporter.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
using System.IO.Abstractions;
66
using Elastic.Documentation.Configuration;
77
using Elastic.Markdown.IO;
8-
using Elastic.Markdown.Slices;
98
using Markdig.Syntax;
109

1110
namespace Elastic.Markdown.Exporters;

src/Elastic.Markdown/Exporters/IMarkdownExporter.cs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,32 @@
22
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
33
// See the LICENSE file in the project root for more information
44

5+
using System.IO.Abstractions;
6+
using Elastic.Documentation.Configuration;
57
using Elastic.Markdown.IO;
8+
using Elastic.Markdown.Myst;
69
using Markdig.Syntax;
710

811
namespace Elastic.Markdown.Exporters;
912

10-
public class MarkdownExportContext
13+
14+
/// <summary>
/// Export context record that currently declares no members.
/// NOTE(review): nothing visible in this diff consumes it; per-file data lives on
/// <c>MarkdownExportFileContext</c> — confirm whether this type is still needed.
/// </summary>
public record MarkdownExportContext
15+
{
16+
}
17+
/// <summary>
/// Per-file data passed to <c>IMarkdownExporter.ExportAsync</c>.
/// All members except <c>LLMText</c> are <c>required</c> and init-only.
/// </summary>
public record MarkdownExportFileContext
1118
{
19+
/// <summary>The build context the file is exported under.</summary>
public required BuildContext BuildContext { get; init; }
20+
/// <summary>Parser resolvers made available to the exporter.</summary>
public required IParserResolvers Resolvers { get; init; }
1221
/// <summary>The parsed Markdown document for the file.</summary>
public required MarkdownDocument Document { get; init; }
13-
// Diff '-' line: this property is removed by the commit; SourceFile below takes its place.
public required MarkdownFile File { get; init; }
22+
/// <summary>The Markdown file being exported.</summary>
public required MarkdownFile SourceFile { get; init; }
23+
/// <summary>The output file handed to the exporter as the default write target.</summary>
public required IFileInfo DefaultOutputFile { get; init; }
1424
/// <summary>
/// Optional LLM text for the file. This is the only mutable member (it has a setter),
/// so an exporter can store a computed value on the context.
/// </summary>
public string? LLMText { get; set; }
1525
}
1626

1727
/// <summary>
/// Contract for exporters that receive Markdown files during a documentation build.
/// </summary>
public interface IMarkdownExporter
1828
{
1929
/// <summary>Start hook. NOTE(review): call site is not visible in this diff — presumably invoked before any export.</summary>
ValueTask StartAsync(Cancel ctx = default);
2030
/// <summary>Stop hook. NOTE(review): call site is not visible in this diff — presumably invoked after exporting ends.</summary>
ValueTask StopAsync(Cancel ctx = default);
21-
// Diff '-' line: this overload is removed by the commit and replaced by the one below.
ValueTask<bool> ExportAsync(MarkdownExportContext context, Cancel ctx);
31+
/// <summary>Exports a single Markdown file described by <paramref name="fileContext"/>.</summary>
/// <param name="fileContext">The per-file export data.</param>
/// <param name="ctx">Cancellation token.</param>
ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx);
32+
/// <summary>
/// Finishing hook that receives the output folder.
/// NOTE(review): presumably called once after all files were exported — confirm against the caller.
/// </summary>
/// <param name="outputFolder">The folder the build wrote its output to.</param>
/// <param name="ctx">Cancellation token.</param>
ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx);
2233
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Buffers;
6+
using System.IO.Abstractions;
7+
using System.IO.Compression;
8+
using System.Text;
9+
using Elastic.Documentation.Configuration;
10+
using Elastic.Markdown.Helpers;
11+
using Elastic.Markdown.Myst;
12+
using Elastic.Markdown.Myst.FrontMatter;
13+
14+
namespace Elastic.Markdown.Exporters;
15+
16+
/// <summary>
/// Exports every Markdown source file as LLM-friendly Markdown: <c>:::{include}</c> directives that
/// point at <c>_snippets</c> Markdown files are inlined and substitutions are replaced. When the export
/// finishes, all emitted <c>*.md</c> files are bundled into <c>docs/llm.zip</c>.
/// </summary>
public class LLMTextExporter : IMarkdownExporter
{
    /// <summary>No start-up work is needed for this exporter.</summary>
    public ValueTask StartAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <summary>No shutdown work is needed for this exporter.</summary>
    public ValueTask StopAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <summary>
    /// Writes the LLM text of a single Markdown file to the output folder.
    /// </summary>
    /// <remarks>
    /// The text is written to <see cref="MarkdownExportFileContext.DefaultOutputFile"/>, except for
    /// <c>index.md</c> files: the root <c>index.md</c> becomes <c>llms.md</c>, any other
    /// <c>index.md</c> becomes <c>&lt;parent-folder&gt;.md</c> next to its folder.
    /// The computed text is cached on <see cref="MarkdownExportFileContext.LLMText"/>.
    /// </remarks>
    /// <param name="fileContext">The per-file export data.</param>
    /// <param name="ctx">Cancellation token forwarded to the file write.</param>
    /// <returns><c>true</c> once the file has been written.</returns>
    public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx)
    {
        var source = fileContext.SourceFile.SourceFile;
        var llmText = fileContext.LLMText ??= ToLLMText(fileContext.BuildContext, fileContext.SourceFile.YamlFrontMatter, fileContext.Resolvers, source);

        // write to the output version of the Markdown file directly
        var outputFile = fileContext.DefaultOutputFile;
        // Write through the file system that owns the output file, not the one the source was read from.
        var fs = outputFile.FileSystem;
        if (outputFile.Name == "index.md")
        {
            var root = fileContext.BuildContext.DocumentationOutputDirectory;
            // Write to a file named after the parent folder
            if (outputFile.Directory!.FullName == root.FullName)
            {
                // TODO in FinishExportAsync find a way to generate llms.txt
                // e.g. should it embed all the links?
                outputFile = fs.FileInfo.New(Path.Combine(root.FullName, "llms.md"));
            }
            else
            {
                outputFile = fs.FileInfo.New(outputFile.Directory!.FullName + ".md");
            }
        }

        if (outputFile.Directory is { Exists: false })
            outputFile.Directory.Create();

        await fs.File.WriteAllTextAsync(outputFile.FullName, llmText, ctx);
        return true;
    }

    /// <summary>
    /// Bundles every <c>*.md</c> file below <c>&lt;outputFolder&gt;/docs</c> into
    /// <c>&lt;outputFolder&gt;/docs/llm.zip</c>.
    /// </summary>
    /// <remarks>
    /// An archive left behind by a previous build is overwritten. Entry names always use <c>/</c>
    /// as separator so the archive is portable across operating systems. Entries are created via
    /// <see cref="ZipArchive.CreateEntry(string)"/>, so they carry the archive creation time rather
    /// than the source file's last write time.
    /// </remarks>
    /// <param name="outputFolder">The root output folder of the build.</param>
    /// <param name="ctx">Cancellation token (not observed; the work is synchronous).</param>
    /// <returns>
    /// <c>true</c> when the archive was written; <c>false</c> when there is no <c>docs</c> folder
    /// and therefore nothing to archive.
    /// </returns>
    public ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx)
    {
        var fs = outputFolder.FileSystem;
        var outputDirectory = Path.Combine(outputFolder.FullName, "docs");
        if (!fs.Directory.Exists(outputDirectory))
            return ValueTask.FromResult(false);

        var zipPath = Path.Combine(outputDirectory, "llm.zip");
        // Enumerate before the archive is created; the "*.md" filter keeps llm.zip itself out.
        var markdownFiles = fs.Directory.GetFiles(outputDirectory, "*.md", SearchOption.AllDirectories);

        // File.Create truncates an existing archive, so repeated builds do not fail.
        using (var zipStream = fs.File.Create(zipPath))
        using (var zip = new ZipArchive(zipStream, ZipArchiveMode.Create))
        {
            foreach (var file in markdownFiles)
            {
                var entryName = Path.GetRelativePath(outputDirectory, file).Replace(Path.DirectorySeparatorChar, '/');
                var entry = zip.CreateEntry(entryName);
                using var entryStream = entry.Open();
                using var fileStream = fs.File.OpenRead(file);
                fileStream.CopyTo(entryStream);
            }
        }
        return ValueTask.FromResult(true);
    }

    /// <summary>
    /// Produces the LLM text for <paramref name="source"/>: the raw Markdown with snippet includes
    /// inlined and substitutions replaced.
    /// </summary>
    /// <param name="buildContext">The build context; supplies the documentation source directory.</param>
    /// <param name="frontMatter">The front matter of the file, if any, made available to substitution replacement.</param>
    /// <param name="resolvers">Resolvers handed to the parser state used for substitution replacement.</param>
    /// <param name="source">The Markdown file to read.</param>
    /// <returns>The expanded Markdown text.</returns>
    public static string ToLLMText(BuildContext buildContext, YamlFrontMatter? frontMatter, IParserResolvers resolvers, IFileInfo source)
    {
        var fs = source.FileSystem;
        var sb = DocumentationObjectPoolProvider.StringBuilderPool.Get();
        string full;
        try
        {
            Read(source, fs, sb, buildContext.DocumentationSourceDirectory);
            full = sb.ToString();
        }
        finally
        {
            // Hand the builder back even when reading fails.
            DocumentationObjectPoolProvider.StringBuilderPool.Return(sb);
        }

        var state = new ParserState(buildContext)
        {
            YamlFrontMatter = frontMatter,
            MarkdownSourcePath = source,
            CrossLinkResolver = resolvers.CrossLinkResolver,
            DocumentationFileLookup = resolvers.DocumentationFileLookup
        };
        return full.ReplaceSubstitutions(new ParserContext(state));
    }

    /// <summary>
    /// Appends the content of <paramref name="source"/> to <paramref name="sb"/>, replacing each
    /// <c>:::{include}</c> directive with the content of the referenced snippet (recursively).
    /// </summary>
    /// <remarks>
    /// Only includes that resolve to an existing <c>.md</c> file whose path contains <c>_snippets</c>
    /// are inlined; any other include directive is removed without replacement. Include paths starting
    /// with <c>/</c> are resolved against <paramref name="setDirectory"/>, all others against the folder
    /// of <paramref name="source"/>. Closing fences are matched as <c>:::</c> followed by
    /// <see cref="Environment.NewLine"/>. Cyclic includes are not detected here.
    /// </remarks>
    private static void Read(IFileInfo source, IFileSystem fs, StringBuilder sb, IDirectoryInfo setDirectory)
    {
        var text = fs.File.ReadAllText(source.FullName).AsSpan();
        var spanStart = ":::{include}".AsSpan();
        var include = SearchValues.Create([spanStart.ToString(), $":::{Environment.NewLine}"], StringComparison.OrdinalIgnoreCase);
        var fenceLength = 3 + Environment.NewLine.Length;
        // True while the next closing fence belongs to an include directive that was just consumed.
        var closesInclude = false;
        var startIndex = 0;
        int i;
        while ((i = text[startIndex..].IndexOfAny(include)) >= 0)
        {
            var cursor = startIndex + i;
            var marker = text[cursor..];
            // Same comparison as the search above, so every include hit is handled as an include.
            if (marker.StartsWith(spanStart, StringComparison.OrdinalIgnoreCase))
            {
                _ = sb.Append(text.Slice(startIndex, i).TrimEnd('\n'));

                var relativeFileStart = marker.IndexOf('}') + 1;
                var relativeFileEnd = marker.IndexOf('\n');
                // The directive may be the last line of the file without a trailing newline.
                if (relativeFileEnd < 0)
                    relativeFileEnd = marker.Length;
                var relativeFile = marker[relativeFileStart..relativeFileEnd].Trim();

                var includePath = relativeFile.StartsWith('/')
                    ? Path.Combine(setDirectory.FullName, relativeFile.TrimStart('/').ToString())
                    : Path.GetFullPath(Path.Combine(source.Directory!.FullName, relativeFile.ToString()));
                var includeSource = fs.FileInfo.New(includePath);

                // IFileInfo.Extension includes the leading period (".md", not "md").
                var isSnippet = string.Equals(includeSource.Extension, ".md", StringComparison.OrdinalIgnoreCase)
                    && includePath.Contains("_snippets");
                if (isSnippet && includeSource.Exists)
                {
                    // The preceding text had its trailing newlines trimmed; restore a blank line so the
                    // snippet does not get glued onto the previous line.
                    if (sb.Length > 0)
                        _ = sb.Append("\n\n");
                    Read(includeSource, fs, sb, setDirectory);
                }

                closesInclude = true;
                startIndex = Math.Min(text.Length, cursor + relativeFileEnd);
            }
            else
            {
                var fenceEnd = Math.Min(text.Length, cursor + fenceLength);
                // Only the fence that closes an include is dropped (together with the remainder of that
                // directive); every other closing fence and the text in front of it is real content.
                if (!closesInclude)
                    _ = sb.Append(text[startIndex..fenceEnd]);
                closesInclude = false;
                startIndex = fenceEnd;
            }
        }

        _ = sb.Append(text[startIndex..]);
    }
}

src/Elastic.Markdown/Slices/HtmlWriter.cs renamed to src/Elastic.Markdown/HtmlWriter.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@
1010
using Elastic.Documentation.Site.Navigation;
1111
using Elastic.Markdown.Extensions.DetectionRules;
1212
using Elastic.Markdown.IO;
13+
using Elastic.Markdown.Page;
1314
using Markdig.Syntax;
1415
using RazorSlices;
1516
using IFileInfo = System.IO.Abstractions.IFileInfo;
1617

17-
namespace Elastic.Markdown.Slices;
18+
namespace Elastic.Markdown;
1819

1920
public class HtmlWriter(
2021
DocumentationSet documentationSet,
@@ -99,7 +100,7 @@ private async Task<string> RenderLayout(MarkdownFile markdown, MarkdownDocument
99100
if (PositionalNavigation.MarkdownNavigationLookup.TryGetValue("docs-content://versions.md", out var item))
100101
allVersionsUrl = item.Url;
101102

102-
var slice = Index.Create(new IndexViewModel
103+
var slice = Page.Index.Create(new IndexViewModel
103104
{
104105
SiteName = siteName,
105106
DocSetName = DocumentationSet.Name,

src/Elastic.Markdown/IO/DocumentationFile.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
using Elastic.Documentation.Site;
66
using Elastic.Markdown.Myst;
77
using Elastic.Markdown.Myst.FrontMatter;
8-
using Elastic.Markdown.Slices;
98

109
namespace Elastic.Markdown.IO;
1110

src/Elastic.Markdown/IO/MarkdownFile.cs

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
using Elastic.Markdown.Links.CrossLinks;
1414
using Elastic.Markdown.Myst;
1515
using Elastic.Markdown.Myst.Directives;
16+
using Elastic.Markdown.Myst.Directives.Include;
1617
using Elastic.Markdown.Myst.FrontMatter;
1718
using Elastic.Markdown.Myst.InlineParsers;
18-
using Elastic.Markdown.Slices;
1919
using Markdig;
2020
using Markdig.Extensions.Yaml;
2121
using Markdig.Renderers.Roundtrip;
@@ -185,17 +185,6 @@ public async Task<MarkdownDocument> ParseFullAsync(Cancel ctx)
185185
return document;
186186
}
187187

188-
public static string ToLLMText(MarkdownDocument document)
189-
{
190-
using var sw = new StringWriter();
191-
var rr = new RoundtripRenderer(sw);
192-
rr.Write(document);
193-
var outputMarkdown = sw.ToString();
194-
195-
return outputMarkdown;
196-
197-
}
198-
199188
private IReadOnlyDictionary<string, string> GetSubstitutions()
200189
{
201190
var globalSubstitutions = _globalSubstitutions;

src/Elastic.Markdown/IO/Navigation/DocumentationGroup.cs

Lines changed: 0 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
// See the LICENSE file in the project root for more information
44

55
using System.Diagnostics;
6-
using System.Diagnostics.CodeAnalysis;
76
using Elastic.Documentation;
87
using Elastic.Documentation.Configuration;
98
using Elastic.Documentation.Configuration.TableOfContents;
@@ -12,84 +11,6 @@
1211

1312
namespace Elastic.Markdown.IO.Navigation;
1413

15-
[DebuggerDisplay("Current: {Model.RelativePath}")]
16-
public record FileNavigationItem(MarkdownFile Model, DocumentationGroup Group, bool Hidden = false) : ILeafNavigationItem<MarkdownFile>
17-
{
18-
public INodeNavigationItem<INavigationModel, INavigationItem>? Parent { get; set; } = Group;
19-
public IRootNavigationItem<INavigationModel, INavigationItem> NavigationRoot { get; } = Group.NavigationRoot;
20-
public string Url => Model.Url;
21-
public string NavigationTitle => Model.NavigationTitle;
22-
public int NavigationIndex { get; set; }
23-
}
24-
25-
public class TableOfContentsTreeCollector
26-
{
27-
private Dictionary<Uri, TableOfContentsTree> NestedTableOfContentsTrees { get; } = [];
28-
29-
public void Collect(Uri source, TableOfContentsTree tree) =>
30-
NestedTableOfContentsTrees[source] = tree;
31-
32-
public void Collect(TocReference tocReference, TableOfContentsTree tree) =>
33-
NestedTableOfContentsTrees[tocReference.Source] = tree;
34-
35-
public bool TryGetTableOfContentsTree(Uri source, [NotNullWhen(true)] out TableOfContentsTree? tree) =>
36-
NestedTableOfContentsTrees.TryGetValue(source, out tree);
37-
}
38-
39-
40-
[DebuggerDisplay("Toc >{Depth} {FolderName} {Source} ({NavigationItems.Count} items)")]
41-
public class TableOfContentsTree : DocumentationGroup, IRootNavigationItem<MarkdownFile, INavigationItem>
42-
{
43-
public Uri Source { get; }
44-
45-
public TableOfContentsTreeCollector TreeCollector { get; }
46-
47-
public TableOfContentsTree(
48-
Uri source,
49-
BuildContext context,
50-
NavigationLookups lookups,
51-
TableOfContentsTreeCollector treeCollector,
52-
ref int fileIndex)
53-
: base(".", treeCollector, context, lookups, source, ref fileIndex, 0, null, null)
54-
{
55-
TreeCollector = treeCollector;
56-
NavigationRoot = this;
57-
58-
Source = source;
59-
TreeCollector.Collect(source, this);
60-
61-
//edge case if a tree only holds a single group, ensure we collapse it down to the root (this)
62-
if (NavigationItems.Count == 1 && NavigationItems.First() is DocumentationGroup { NavigationItems.Count: 0 })
63-
NavigationItems = [];
64-
65-
66-
}
67-
68-
internal TableOfContentsTree(
69-
Uri source,
70-
string folderName,
71-
TableOfContentsTreeCollector treeCollector,
72-
BuildContext context,
73-
NavigationLookups lookups,
74-
ref int fileIndex,
75-
int depth,
76-
IRootNavigationItem<MarkdownFile, INavigationItem> toplevelTree,
77-
DocumentationGroup? parent
78-
) : base(folderName, treeCollector, context, lookups, source, ref fileIndex, depth, toplevelTree, parent)
79-
{
80-
Source = source;
81-
TreeCollector = treeCollector;
82-
NavigationRoot = this;
83-
TreeCollector.Collect(source, this);
84-
}
85-
86-
protected override IRootNavigationItem<MarkdownFile, INavigationItem> DefaultNavigation => this;
87-
88-
// We rely on IsPrimaryNavEnabled to determine if we should show the dropdown
89-
/// <inheritdoc />
90-
public bool IsUsingNavigationDropdown => false;
91-
}
92-
9314
[DebuggerDisplay("Group >{Depth} {FolderName} ({NavigationItems.Count} items)")]
9415
public class DocumentationGroup : INodeNavigationItem<MarkdownFile, INavigationItem>
9516
{
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Diagnostics;
6+
using Elastic.Documentation.Site.Navigation;
7+
8+
namespace Elastic.Markdown.IO.Navigation;
9+
10+
/// <summary>
/// Leaf navigation item whose model is a single <see cref="MarkdownFile"/>.
/// </summary>
/// <param name="Model">The Markdown file this item points at.</param>
/// <param name="Group">The group the item is created in; it seeds <see cref="Parent"/> and <see cref="NavigationRoot"/>.</param>
/// <param name="Hidden">Hidden flag of the item; defaults to <c>false</c>.</param>
[DebuggerDisplay("Current: {Model.RelativePath}")]
public record FileNavigationItem(MarkdownFile Model, DocumentationGroup Group, bool Hidden = false)
    : ILeafNavigationItem<MarkdownFile>
{
    /// <summary>The parent node; initialised to <c>Group</c> and settable afterwards.</summary>
    public INodeNavigationItem<INavigationModel, INavigationItem>? Parent { get; set; } = Group;

    /// <summary>The navigation root, captured from <c>Group.NavigationRoot</c> at construction.</summary>
    public IRootNavigationItem<INavigationModel, INavigationItem> NavigationRoot { get; } = Group.NavigationRoot;

    /// <summary>The URL of the underlying Markdown file.</summary>
    public string Url
    {
        get { return Model.Url; }
    }

    /// <summary>The navigation title of the underlying Markdown file.</summary>
    public string NavigationTitle
    {
        get { return Model.NavigationTitle; }
    }

    /// <summary>Index of this item within the navigation.</summary>
    public int NavigationIndex { get; set; }
}

0 commit comments

Comments
 (0)