55using Microsoft . KernelMemory ;
66using Microsoft . KernelMemory . Context ;
77using Microsoft . KernelMemory . DataFormats ;
8+ using Microsoft . KernelMemory . Diagnostics ;
89using Microsoft . KernelMemory . DocumentStorage . DevTools ;
910using Microsoft . KernelMemory . FileSystem . DevTools ;
1011using Microsoft . KernelMemory . Handlers ;
1112using Microsoft . KernelMemory . MemoryStorage . DevTools ;
13+ using Microsoft . KernelMemory . Pipeline ;
14+ using System . Security . Cryptography ;
15+ using System . Text ;
1216
1317namespace SemanticMemory . Samples ;
1418
@@ -48,6 +52,9 @@ public async Task RunSample(string fileToParse)
4852 TextPartitioningHandler textPartitioning = new ( "partition" , orchestrator ) ;
4953 await orchestrator . AddHandlerAsync ( textPartitioning ) ;
5054
55+ CustomSamplePartitioningHandler customMarkdownPartition = new ( "markdownpartition" , orchestrator ) ;
56+ await orchestrator . AddHandlerAsync ( customMarkdownPartition ) ;
57+
5158 GenerateEmbeddingsHandler textEmbedding = new ( "gen_embeddings" , orchestrator ) ;
5259 await orchestrator . AddHandlerAsync ( textEmbedding ) ;
5360
@@ -66,11 +73,12 @@ public async Task RunSample(string fileToParse)
6673 new TagCollection { { "example" , "books" } } )
6774 . AddUploadFile ( fileName , fileName , fileToParse )
6875 . Then ( "extract" )
69- . Then ( "partition" )
76+ //.Then("partition")
77+ . Then ( "markdownpartition" )
7078 . Then ( "gen_embeddings" )
7179 . Then ( "save_records" ) ;
7280
73- contextProvider . AddLLamaCloudParserOptions ( fileName , "This is a manual for Dreame vacuum cleaner, I need you to extract a series of sections that can be useful for an helpdesk to answer user questions. You will create sections where each sections contains a question and an answer taken from the text." ) ;
81+ contextProvider . AddLLamaCloudParserOptions ( fileName , "This is a manual for Dreame vacuum cleaner, I need you to extract a series of sections that can be useful for an helpdesk to answer user questions. You will create sections where each sections contains a question and an answer taken from the text. Each question will be separated with --- " ) ;
7482
7583 var pipeline = pipelineBuilder . Build ( ) ;
7684 await orchestrator . RunPipelineAsync ( pipeline ) ;
@@ -162,3 +170,164 @@ private static IKernelMemoryBuilder CreateBasicKernelMemoryBuilder(
162170 return kernelMemoryBuilder ;
163171 }
164172}
173+
/// <summary>
/// Pipeline handler that partitions extracted Markdown text into chunks, splitting on
/// "---" separator lines (the separator the LLamaCloud parser prompt asks for).
/// Each non-empty section between separators becomes one TextPartition artifact.
/// </summary>
public sealed class CustomSamplePartitioningHandler : IPipelineStepHandler
{
    private readonly IPipelineOrchestrator _orchestrator;

    // Log category must be this handler's own type; the original copy-pasted
    // ILogger<TextPartitioningHandler>, which mislabeled the log source.
    private readonly ILogger<CustomSamplePartitioningHandler> _log;

    /// <inheritdoc />
    public string StepName { get; }

    /// <summary>
    /// Handler responsible for partitioning Markdown text into small chunks separated by "---" lines.
    /// Note: stepName and other params are injected with DI.
    /// </summary>
    /// <param name="stepName">Pipeline step for which the handler will be invoked</param>
    /// <param name="orchestrator">Current orchestrator used by the pipeline, giving access to content and other helpers.</param>
    /// <param name="loggerFactory">Application logger factory</param>
    public CustomSamplePartitioningHandler(
        string stepName,
        IPipelineOrchestrator orchestrator,
        ILoggerFactory? loggerFactory = null)
    {
        this.StepName = stepName;
        this._orchestrator = orchestrator;

        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<CustomSamplePartitioningHandler>();
        this._log.LogInformation("Handler '{0}' ready", stepName);
    }

    /// <inheritdoc />
    public async Task<(ReturnType returnType, DataPipeline updatedPipeline)> InvokeAsync(
        DataPipeline pipeline, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Markdown question Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);

        if (pipeline.Files.Count == 0)
        {
            this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
            return (ReturnType.Success, pipeline);
        }

        foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
        {
            // Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
            Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = [];

            foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
            {
                var file = generatedFile.Value;
                if (file.AlreadyProcessedBy(this))
                {
                    this._log.LogTrace("File {0} already processed by this handler", file.Name);
                    continue;
                }

                // Partition only the original text
                if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
                {
                    this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
                    continue;
                }

                // Use a different partitioning strategy depending on the file type
                BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
                string partitionsMimeType = MimeTypes.MarkDown;

                // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
                if (partitionContent.IsEmpty) { continue; }

                int partition = 1;
                switch (file.MimeType)
                {
                    case MimeTypes.MarkDown:
                    {
                        this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
                        string content = partitionContent.ToString();
                        partitionsMimeType = MimeTypes.MarkDown;

                        var sb = new StringBuilder(1024);
                        using (var reader = new StringReader(content))
                        {
                            string? line;
                            while ((line = reader.ReadLine()) != null)
                            {
                                if (string.IsNullOrWhiteSpace(line))
                                {
                                    continue;
                                }

                                // Ordinal comparison: "---" is a structural marker, not linguistic text (CA1310).
                                if (line.StartsWith("---", StringComparison.Ordinal))
                                {
                                    // Flush the accumulated section; a guard on sb.Length avoids writing
                                    // empty partition files when the document starts with a separator
                                    // or contains consecutive separators.
                                    if (sb.Length > 0)
                                    {
                                        partition = await this.AddSegmentAsync(pipeline, uploadedFile, newFiles, partitionsMimeType, partition, sb, cancellationToken).ConfigureAwait(false);
                                        sb.Clear();
                                    }

                                    continue;
                                }

                                sb.AppendLine(line);
                            }
                        }

                        // Write remaining content if any
                        if (sb.Length > 0)
                        {
                            await this.AddSegmentAsync(pipeline, uploadedFile, newFiles, partitionsMimeType, partition, sb, cancellationToken).ConfigureAwait(false);
                        }

                        break;
                    }

                    default:
                        this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
                        // Don't partition other files
                        continue;
                }
            }

            // Add new files to pipeline status
            foreach (var file in newFiles)
            {
                uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
            }
        }

        return (ReturnType.Success, pipeline);
    }

    /// <summary>
    /// Persists the accumulated section text as a new partition file, registers it in
    /// <paramref name="newFiles"/>, and returns the next partition number.
    /// </summary>
    /// <param name="pipeline">Pipeline the partition belongs to</param>
    /// <param name="uploadedFile">Source file the partition was extracted from</param>
    /// <param name="newFiles">Accumulator for generated files, merged into the pipeline after the loop</param>
    /// <param name="partitionsMimeType">MIME type recorded for the partition</param>
    /// <param name="partition">Current 1-based partition number</param>
    /// <param name="sb">Section content to persist</param>
    /// <param name="cancellationToken">Async cancellation token</param>
    /// <returns>The incremented partition number</returns>
    private async Task<int> AddSegmentAsync(DataPipeline pipeline, DataPipeline.FileDetails uploadedFile, Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles, string partitionsMimeType, int partition, StringBuilder sb, CancellationToken cancellationToken)
    {
        var destFile = uploadedFile.GetPartitionFileName(partition);
        var textData = new BinaryData(sb.ToString());
        await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);

        var destFileDetails = new DataPipeline.GeneratedFileDetails
        {
            Id = Guid.NewGuid().ToString("N"),
            ParentId = uploadedFile.Id,
            Name = destFile,
            Size = sb.Length,
            MimeType = partitionsMimeType,
            ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
            PartitionNumber = partition,
            SectionNumber = 1,
            Tags = pipeline.Tags,
            ContentSHA256 = textData.CalculateSHA256(),
        };
        newFiles.Add(destFile, destFileDetails);
        destFileDetails.MarkProcessedBy(this);
        partition++;
        return partition;
    }
}
325+
/// <summary>
/// Helper extensions for <see cref="BinaryData"/> payloads.
/// </summary>
internal static class BinaryDataExtensions
{
    /// <summary>
    /// Computes the SHA-256 digest of the payload and returns it as a lowercase hex string.
    /// </summary>
    /// <param name="binaryData">Payload to hash</param>
    /// <returns>64-character lowercase hexadecimal SHA-256 digest</returns>
    public static string CalculateSHA256(this BinaryData binaryData)
    {
        // Hash straight into a stack buffer; no intermediate heap array needed.
        Span<byte> digest = stackalloc byte[SHA256.HashSizeInBytes];
        SHA256.HashData(binaryData.ToMemory().Span, digest);
        return Convert.ToHexString(digest).ToLowerInvariant();
    }
}
0 commit comments