|
| 1 | +// Licensed to Elasticsearch B.V under one or more agreements. |
| 2 | +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. |
| 3 | +// See the LICENSE file in the project root for more information |
| 4 | + |
| 5 | +using System.Buffers; |
| 6 | +using System.IO.Abstractions; |
| 7 | +using System.IO.Compression; |
| 8 | +using System.Text; |
| 9 | +using Elastic.Documentation.Configuration; |
| 10 | +using Elastic.Markdown.Helpers; |
| 11 | +using Elastic.Markdown.Myst; |
| 12 | +using Elastic.Markdown.Myst.FrontMatter; |
| 13 | + |
| 14 | +namespace Elastic.Markdown.Exporters; |
| 15 | + |
/// <summary>
/// Exports every markdown source file as plain, LLM-friendly markdown: <c>:::{include}</c> snippets are
/// inlined and substitutions are applied. When the export finishes, all exported <c>*.md</c> files are
/// bundled into a single <c>llm.zip</c> archive.
/// </summary>
public class LLMTextExporter : IMarkdownExporter
{
    /// <summary>Opening marker of the include directive that <see cref="Read"/> expands in place.</summary>
    private const string IncludeDirective = ":::{include}";

    /// <summary>Marker that closes a directive block.</summary>
    private const string ClosingFence = ":::";

    /// <inheritdoc />
    public ValueTask StartAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <inheritdoc />
    public ValueTask StopAsync(Cancel ctx = default) => ValueTask.CompletedTask;

    /// <summary>
    /// Writes the LLM text of a single source file next to its regular output.
    /// The text is computed once and cached on <c>fileContext.LLMText</c>.
    /// </summary>
    /// <remarks>
    /// <c>index.md</c> files are renamed: the one in the documentation output root becomes <c>llms.md</c>,
    /// any other becomes <c>&lt;parent-folder&gt;.md</c> placed beside that folder.
    /// </remarks>
    /// <returns>Always <c>true</c>.</returns>
    public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx)
    {
        var source = fileContext.SourceFile.SourceFile;
        var fs = source.FileSystem;
        var llmText = fileContext.LLMText ??= ToLLMText(fileContext.BuildContext, fileContext.SourceFile.YamlFrontMatter, fileContext.Resolvers, source);

        // write to the output version of the Markdown file directly
        var outputFile = fileContext.DefaultOutputFile;
        if (outputFile.Name == "index.md")
        {
            var root = fileContext.BuildContext.DocumentationOutputDirectory;
            var parent = outputFile.Directory!;
            // TODO in FinishExportAsync find a way to generate llms.txt
            // e.g. should it embed all the links?
            outputFile = parent.FullName == root.FullName
                ? fs.FileInfo.New(Path.Combine(root.FullName, "llms.md"))
                // Write to a file named after the parent folder
                : fs.FileInfo.New(parent.FullName + ".md");
        }

        if (outputFile.Directory is { Exists: false })
            outputFile.Directory.Create();

        await fs.File.WriteAllTextAsync(outputFile.FullName, llmText, ctx);
        return true;
    }

    /// <inheritdoc />
    public ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx)
    {
        var outputDirectory = Path.Combine(outputFolder.FullName, "docs");
        var zipPath = Path.Combine(outputDirectory, "llm.zip");

        // ZipArchiveMode.Create refuses to overwrite, so a leftover archive from a previous
        // build has to be removed first to keep repeated builds working.
        if (File.Exists(zipPath))
            File.Delete(zipPath);

        using (var zip = ZipFile.Open(zipPath, ZipArchiveMode.Create))
        {
            var markdownFiles = Directory.GetFiles(outputDirectory, "*.md", SearchOption.AllDirectories);

            foreach (var file in markdownFiles)
            {
                // Zip entry names use '/' regardless of the host OS path separator.
                var entryName = Path.GetRelativePath(outputDirectory, file).Replace(Path.DirectorySeparatorChar, '/');
                _ = zip.CreateEntryFromFile(file, entryName);
            }
        }
        return ValueTask.FromResult(true);
    }

    /// <summary>
    /// Builds the LLM text for <paramref name="source"/>: reads the file, inlines its snippet includes
    /// and then applies substitutions using a parser state built from the given front matter and resolvers.
    /// </summary>
    /// <param name="buildContext">Build context; supplies the documentation source directory used for root-relative includes.</param>
    /// <param name="frontMatter">Front matter of the source file, passed on to the parser state.</param>
    /// <param name="resolvers">Provides the cross link resolver and documentation file lookup for the parser state.</param>
    /// <param name="source">The markdown file to convert.</param>
    /// <returns>The file content with includes expanded and substitutions replaced.</returns>
    public static string ToLLMText(BuildContext buildContext, YamlFrontMatter? frontMatter, IParserResolvers resolvers, IFileInfo source)
    {
        var fs = source.FileSystem;
        var sb = DocumentationObjectPoolProvider.StringBuilderPool.Get();
        string full;
        try
        {
            Read(source, fs, sb, buildContext.DocumentationSourceDirectory);
            full = sb.ToString();
        }
        finally
        {
            // Hand the builder back even when reading fails so it is not lost to the pool.
            DocumentationObjectPoolProvider.StringBuilderPool.Return(sb);
        }

        var state = new ParserState(buildContext)
        {
            YamlFrontMatter = frontMatter,
            MarkdownSourcePath = source,
            CrossLinkResolver = resolvers.CrossLinkResolver,
            DocumentationFileLookup = resolvers.DocumentationFileLookup
        };
        return full.ReplaceSubstitutions(new ParserContext(state));
    }

    /// <summary>
    /// Appends the content of <paramref name="source"/> to <paramref name="sb"/>, replacing each
    /// <c>:::{include} path</c> directive (and the closing <c>:::</c> line directly after it) with the
    /// content of the referenced snippet. All other text, including other directives, is copied verbatim.
    /// </summary>
    /// <remarks>
    /// Included snippets are expanded recursively. There is no cycle detection here, so a snippet that
    /// includes itself would recurse without bound.
    /// </remarks>
    private static void Read(IFileInfo source, IFileSystem fs, StringBuilder sb, IDirectoryInfo setDirectory)
    {
        var text = fs.File.ReadAllText(source.FullName).AsSpan();
        var position = 0;
        while (position < text.Length)
        {
            var offset = text[position..].IndexOf(IncludeDirective.AsSpan(), StringComparison.Ordinal);
            if (offset < 0)
                break;

            // Everything up to the directive is regular content and is kept as-is.
            var directiveStart = position + offset;
            _ = sb.Append(text[position..directiveStart]);

            // The directive argument runs to the end of the line, or to the end of the file
            // when the directive is the very last line without a trailing line break.
            var directive = text[directiveStart..];
            var lineLength = directive.IndexOfAny('\r', '\n');
            if (lineLength < 0)
                lineLength = directive.Length;

            var relativeFile = directive[IncludeDirective.Length..lineLength].Trim();
            AppendInclude(relativeFile, source, fs, sb, setDirectory);

            // Continue after the directive line and drop the fence that closes the include.
            position = SkipLineBreak(text, directiveStart + lineLength);
            position = SkipClosingFence(text, position);
        }

        _ = sb.Append(text[position..]);
    }

    /// <summary>
    /// Resolves <paramref name="relativeFile"/> and appends its expanded content when it is an existing
    /// markdown file located under a <c>_snippets</c> path; any other include target is left out.
    /// </summary>
    private static void AppendInclude(ReadOnlySpan<char> relativeFile, IFileInfo source, IFileSystem fs, StringBuilder sb, IDirectoryInfo setDirectory)
    {
        if (relativeFile.IsEmpty)
            return;

        // A leading '/' addresses the documentation source root, anything else is relative to the including file.
        var includePath = relativeFile.StartsWith('/')
            ? Path.Combine(setDirectory.FullName, relativeFile.TrimStart('/').ToString())
            : Path.GetFullPath(Path.Combine(source.Directory!.FullName, relativeFile.ToString()));
        var includeSource = fs.FileInfo.New(includePath);

        // Extension includes the leading period.
        var isMarkdown = string.Equals(includeSource.Extension, ".md", StringComparison.OrdinalIgnoreCase);
        var isSnippet = includePath.Contains("_snippets", StringComparison.Ordinal);
        // A missing snippet is skipped rather than failing the whole export.
        if (!isMarkdown || !isSnippet || !includeSource.Exists)
            return;

        Read(includeSource, fs, sb, setDirectory);

        // Keep the text following the include from being glued onto the snippet's last line.
        if (sb.Length > 0 && sb[sb.Length - 1] != '\n')
            _ = sb.Append('\n');
    }

    /// <summary>Returns the index after a single <c>\n</c> or <c>\r\n</c> at <paramref name="index"/>, if there is one.</summary>
    private static int SkipLineBreak(ReadOnlySpan<char> text, int index)
    {
        if (index < text.Length && text[index] == '\r')
            index++;
        if (index < text.Length && text[index] == '\n')
            index++;
        return index;
    }

    /// <summary>
    /// Returns the index after the line at <paramref name="index"/> when that line consists solely of a
    /// closing <c>:::</c> fence (optionally followed by whitespace); otherwise returns <paramref name="index"/> unchanged.
    /// </summary>
    private static int SkipClosingFence(ReadOnlySpan<char> text, int index)
    {
        var rest = text[index..];
        if (!rest.StartsWith(ClosingFence.AsSpan(), StringComparison.Ordinal))
            return index;

        var lineLength = rest.IndexOfAny('\r', '\n');
        if (lineLength < 0)
            lineLength = rest.Length;

        // Longer fences or a new directive such as ":::{note}" are content, not the include's closing fence.
        if (!rest[ClosingFence.Length..lineLength].IsWhiteSpace())
            return index;

        return SkipLineBreak(text, index + lineLength);
    }
}
0 commit comments