Skip to content

Commit 26ab1dd

Browse files
committed
Finished example of LLAMA cloud
1 parent ee0aa28 commit 26ab1dd

File tree

1 file changed

+171
-2
lines changed

1 file changed

+171
-2
lines changed

src/KernelMemory.Extensions.ConsoleTest/Samples/CustomParsersSample.cs

Lines changed: 171 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@
55
using Microsoft.KernelMemory;
66
using Microsoft.KernelMemory.Context;
77
using Microsoft.KernelMemory.DataFormats;
8+
using Microsoft.KernelMemory.Diagnostics;
89
using Microsoft.KernelMemory.DocumentStorage.DevTools;
910
using Microsoft.KernelMemory.FileSystem.DevTools;
1011
using Microsoft.KernelMemory.Handlers;
1112
using Microsoft.KernelMemory.MemoryStorage.DevTools;
13+
using Microsoft.KernelMemory.Pipeline;
14+
using System.Security.Cryptography;
15+
using System.Text;
1216

1317
namespace SemanticMemory.Samples;
1418

@@ -48,6 +52,9 @@ public async Task RunSample(string fileToParse)
4852
TextPartitioningHandler textPartitioning = new("partition", orchestrator);
4953
await orchestrator.AddHandlerAsync(textPartitioning);
5054

55+
CustomSamplePartitioningHandler customMarkdownPartition = new("markdownpartition", orchestrator);
56+
await orchestrator.AddHandlerAsync(customMarkdownPartition);
57+
5158
GenerateEmbeddingsHandler textEmbedding = new("gen_embeddings", orchestrator);
5259
await orchestrator.AddHandlerAsync(textEmbedding);
5360

@@ -66,11 +73,12 @@ public async Task RunSample(string fileToParse)
6673
new TagCollection { { "example", "books" } })
6774
.AddUploadFile(fileName, fileName, fileToParse)
6875
.Then("extract")
69-
.Then("partition")
76+
//.Then("partition")
77+
.Then("markdownpartition")
7078
.Then("gen_embeddings")
7179
.Then("save_records");
7280

73-
contextProvider.AddLLamaCloudParserOptions(fileName, "This is a manual for Dreame vacuum cleaner, I need you to extract a series of sections that can be useful for an helpdesk to answer user questions. You will create sections where each sections contains a question and an answer taken from the text.");
81+
contextProvider.AddLLamaCloudParserOptions(fileName, "This is a manual for Dreame vacuum cleaner, I need you to extract a series of sections that can be useful for an helpdesk to answer user questions. You will create sections where each sections contains a question and an answer taken from the text. Each question will be separated with ---");
7482

7583
var pipeline = pipelineBuilder.Build();
7684
await orchestrator.RunPipelineAsync(pipeline);
@@ -162,3 +170,164 @@ private static IKernelMemoryBuilder CreateBasicKernelMemoryBuilder(
162170
return kernelMemoryBuilder;
163171
}
164172
}
173+
174+
/// <summary>
/// Sample pipeline handler that partitions extracted markdown text into chunks,
/// using lines starting with "---" as section separators (the separator the
/// LLamaCloud parser prompt asks the model to emit between question/answer pairs).
/// Each chunk is written back as a <see cref="DataPipeline.ArtifactTypes.TextPartition"/>
/// generated file so downstream steps (embeddings, save_records) can consume it.
/// </summary>
public sealed class CustomSamplePartitioningHandler : IPipelineStepHandler
{
    private readonly IPipelineOrchestrator _orchestrator;

    // Fix: log under this handler's own category. The original used
    // ILogger<TextPartitioningHandler>, a copy-paste leftover that misattributed
    // every log line to the built-in partitioning handler.
    private readonly ILogger<CustomSamplePartitioningHandler> _log;

    /// <inheritdoc />
    public string StepName { get; }

    /// <summary>
    /// Handler responsible for partitioning markdown text in small chunks separated by "---" lines.
    /// Note: stepName and other params are injected with DI.
    /// </summary>
    /// <param name="stepName">Pipeline step for which the handler will be invoked</param>
    /// <param name="orchestrator">Current orchestrator used by the pipeline, giving access to content and other helps.</param>
    /// <param name="loggerFactory">Application logger factory</param>
    public CustomSamplePartitioningHandler(
        string stepName,
        IPipelineOrchestrator orchestrator,
        ILoggerFactory? loggerFactory = null)
    {
        this.StepName = stepName;
        this._orchestrator = orchestrator;

        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<CustomSamplePartitioningHandler>();
        this._log.LogInformation("Handler '{0}' ready", stepName);
    }

    /// <inheritdoc />
    public async Task<(ReturnType returnType, DataPipeline updatedPipeline)> InvokeAsync(
        DataPipeline pipeline, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Markdown question Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);

        if (pipeline.Files.Count == 0)
        {
            this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
            return (ReturnType.Success, pipeline);
        }

        foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
        {
            // Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
            Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = [];

            foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
            {
                var file = generatedFile.Value;
                if (file.AlreadyProcessedBy(this))
                {
                    this._log.LogTrace("File {0} already processed by this handler", file.Name);
                    continue;
                }

                // Partition only the original extracted text, not other artifacts
                if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
                {
                    this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
                    continue;
                }

                BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
                string partitionsMimeType = MimeTypes.MarkDown;

                // Skip empty content. Also: partitionContent.ToString() throws an exception if there are no bytes.
                if (partitionContent.IsEmpty) { continue; }

                // Partition numbers are 1-based and shared across all segments of this file
                int partition = 1;
                switch (file.MimeType)
                {
                    case MimeTypes.MarkDown:
                    {
                        this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
                        string content = partitionContent.ToString();
                        partitionsMimeType = MimeTypes.MarkDown;

                        var sb = new StringBuilder(1024);
                        using (var reader = new StringReader(content))
                        {
                            string? line;
                            while ((line = reader.ReadLine()) != null)
                            {
                                if (string.IsNullOrWhiteSpace(line))
                                {
                                    continue;
                                }

                                // "---" marks the boundary between two question/answer sections.
                                // Ordinal comparison: the separator is a literal token, not linguistic text (CA1310).
                                if (line.StartsWith("---", StringComparison.Ordinal))
                                {
                                    // Fix: a leading or doubled separator used to produce an empty
                                    // partition file; only flush when there is accumulated text.
                                    if (sb.Length > 0)
                                    {
                                        partition = await this.AddSegmentAsync(pipeline, uploadedFile, newFiles, partitionsMimeType, partition, sb, cancellationToken).ConfigureAwait(false);
                                        sb.Clear();
                                    }

                                    continue;
                                }

                                sb.AppendLine(line);
                            }
                        }

                        // Write remaining content if any (text after the last separator)
                        if (sb.Length > 0)
                        {
                            await this.AddSegmentAsync(pipeline, uploadedFile, newFiles, partitionsMimeType, partition, sb, cancellationToken).ConfigureAwait(false);
                        }

                        break;
                    }

                    default:
                        this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
                        // Don't partition other files
                        continue;
                }
            }

            // Add new files to pipeline status
            foreach (var file in newFiles)
            {
                uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
            }
        }

        return (ReturnType.Success, pipeline);
    }

    /// <summary>
    /// Writes the accumulated segment text as a new partition file, records it in
    /// <paramref name="newFiles"/>, and returns the next partition number.
    /// </summary>
    /// <param name="pipeline">Pipeline the partition belongs to</param>
    /// <param name="uploadedFile">Original uploaded file the partition derives from</param>
    /// <param name="newFiles">Accumulator for files generated during the current loop</param>
    /// <param name="partitionsMimeType">MIME type to record for the partition</param>
    /// <param name="partition">1-based number of the partition being written</param>
    /// <param name="sb">Segment text accumulated so far (caller clears it afterwards)</param>
    /// <param name="cancellationToken">Token to cancel the storage write</param>
    /// <returns>The partition number to use for the next segment</returns>
    private async Task<int> AddSegmentAsync(DataPipeline pipeline, DataPipeline.FileDetails uploadedFile, Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles, string partitionsMimeType, int partition, StringBuilder sb, CancellationToken cancellationToken)
    {
        var destFile = uploadedFile.GetPartitionFileName(partition);
        var textData = new BinaryData(sb.ToString());
        await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);

        var destFileDetails = new DataPipeline.GeneratedFileDetails
        {
            Id = Guid.NewGuid().ToString("N"),
            ParentId = uploadedFile.Id,
            Name = destFile,
            Size = sb.Length,
            MimeType = partitionsMimeType,
            ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
            PartitionNumber = partition,
            SectionNumber = 1,
            Tags = pipeline.Tags,
            ContentSHA256 = textData.CalculateSHA256(),
        };
        newFiles.Add(destFile, destFileDetails);
        // Mark processed so a re-run of the pipeline does not duplicate this partition
        destFileDetails.MarkProcessedBy(this);
        partition++;
        return partition;
    }
}
325+
326+
internal static class BinaryDataExtensions
{
    /// <summary>
    /// Computes the SHA-256 digest of the payload and returns it as a lowercase hex string.
    /// </summary>
    /// <param name="binaryData">Payload to hash</param>
    /// <returns>64-character lowercase hexadecimal SHA-256 digest</returns>
    public static string CalculateSHA256(this BinaryData binaryData)
    {
        byte[] digest = SHA256.HashData(binaryData.ToMemory().Span);

        // Format each byte as two lowercase hex digits
        var hex = new StringBuilder(digest.Length * 2);
        foreach (byte b in digest)
        {
            hex.Append(b.ToString("x2"));
        }

        return hex.ToString();
    }
}

0 commit comments

Comments
 (0)