diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs new file mode 100644 index 00000000000..edda9283bce --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs @@ -0,0 +1,27 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.IO; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.DataIngestion; + +/// +/// Reads source content and converts it to an . +/// +/// The type of the source to read from. Sample: , , etc. +public interface IIngestionDocumentReader +{ + /// + /// Reads a source and converts it to an . + /// + /// The source to read. + /// The unique identifier for the document. + /// The media type of the source (if needed). + /// The token to monitor for cancellation requests. + /// A task representing the asynchronous read operation. + /// or is or empty. + Task ReadAsync(TSource source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default); +} diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs index 8bdb651321a..971170e57d5 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs @@ -12,7 +12,7 @@ namespace Microsoft.Extensions.DataIngestion; /// /// Reads source content and converts it to an . /// -public abstract class IngestionDocumentReader +public abstract class IngestionDocumentReader : IIngestionDocumentReader, IIngestionDocumentReader { /// /// Reads a file and converts it to an . @@ -24,7 +24,7 @@ public abstract class IngestionDocumentReader public Task ReadAsync(FileInfo source, CancellationToken cancellationToken = default) { string identifier = Throw.IfNull(source).FullName; // entire path is more unique than just part of it. - return ReadAsync(source, identifier, GetMediaType(source), cancellationToken); + return ReadAsync(source, identifier, source.GetMediaType(), cancellationToken); } /// @@ -42,7 +42,7 @@ public virtual async Task ReadAsync(FileInfo source, string i _ = Throw.IfNullOrEmpty(identifier); using FileStream stream = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 1, FileOptions.Asynchronous); - return await ReadAsync(stream, identifier, string.IsNullOrEmpty(mediaType) ? GetMediaType(source) : mediaType!, cancellationToken).ConfigureAwait(false); + return await ReadAsync(stream, identifier, string.IsNullOrEmpty(mediaType) ? source.GetMediaType() : mediaType!, cancellationToken).ConfigureAwait(false); } /// @@ -53,90 +53,5 @@ public virtual async Task ReadAsync(FileInfo source, string i /// The media type of the content. /// The token to monitor for cancellation requests. /// A task representing the asynchronous read operation. - public abstract Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default); - - private static string GetMediaType(FileInfo source) - => source.Extension switch - { - ".123" => "application/vnd.lotus-1-2-3", - ".602" => "application/x-t602", - ".abw" => "application/x-abiword", - ".bmp" => "image/bmp", - ".cgm" => "image/cgm", - ".csv" => "text/csv", - ".cwk" => "application/x-cwk", - ".dbf" => "application/vnd.dbf", - ".dif" => "application/x-dif", - ".doc" => "application/msword", - ".docm" => "application/vnd.ms-word.document.macroEnabled.12", - ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ".dot" => "application/msword", - ".dotm" => "application/vnd.ms-word.template.macroEnabled.12", - ".dotx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - ".epub" => "application/epub+zip", - ".et" => "application/vnd.ms-excel", - ".eth" => "application/ethos", - ".fods" => "application/vnd.oasis.opendocument.spreadsheet", - ".gif" => "image/gif", - ".htm" => "text/html", - ".html" => "text/html", - ".hwp" => "application/x-hwp", - ".jpeg" => "image/jpeg", - ".jpg" => "image/jpeg", - ".key" => "application/x-iwork-keynote-sffkey", - ".lwp" => "application/vnd.lotus-wordpro", - ".mcw" => "application/macwriteii", - ".mw" => "application/macwriteii", - ".numbers" => "application/x-iwork-numbers-sffnumbers", - ".ods" => "application/vnd.oasis.opendocument.spreadsheet", - ".pages" => "application/x-iwork-pages-sffpages", - ".pbd" => "application/x-pagemaker", - ".pdf" => "application/pdf", - ".png" => "image/png", - ".pot" => "application/vnd.ms-powerpoint", - ".potm" => "application/vnd.ms-powerpoint.template.macroEnabled.12", - ".potx" => "application/vnd.openxmlformats-officedocument.presentationml.template", - ".ppt" => "application/vnd.ms-powerpoint", - ".pptm" => "application/vnd.ms-powerpoint.presentation.macroEnabled.12", - ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ".prn" => "application/x-prn", - ".qpw" => "application/x-quattro-pro", - ".rtf" => "application/rtf", - ".sda" => "application/vnd.stardivision.draw", - ".sdd" => "application/vnd.stardivision.impress", - ".sdp" => "application/sdp", - ".sdw" => "application/vnd.stardivision.writer", - ".sgl" => "application/vnd.stardivision.writer", - ".slk" => "text/vnd.sylk", - ".sti" => "application/vnd.sun.xml.impress.template", - ".stw" => "application/vnd.sun.xml.writer.template", - ".svg" => "image/svg+xml", - ".sxg" => "application/vnd.sun.xml.writer.global", - ".sxi" => "application/vnd.sun.xml.impress", - ".sxw" => "application/vnd.sun.xml.writer", - ".sylk" => "text/vnd.sylk", - ".tiff" => "image/tiff", - ".tsv" => "text/tab-separated-values", - ".txt" => "text/plain", - ".uof" => "application/vnd.uoml+xml", - ".uop" => "application/vnd.openofficeorg.presentation", - ".uos1" => "application/vnd.uoml+xml", - ".uos2" => "application/vnd.uoml+xml", - ".uot" => "application/x-uo", - ".vor" => "application/vnd.stardivision.writer", - ".webp" => "image/webp", - ".wpd" => "application/wordperfect", - ".wps" => "application/vnd.ms-works", - ".wq1" => "application/x-lotus", - ".wq2" => "application/x-lotus", - ".xls" => "application/vnd.ms-excel", - ".xlsb" => "application/vnd.ms-excel.sheet.binary.macroEnabled.12", - ".xlsm" => "application/vnd.ms-excel.sheet.macroEnabled.12", - ".xlr" => "application/vnd.ms-works", - ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ".xlw" => "application/vnd.ms-excel", - ".xml" => "application/xml", - ".zabw" => "application/x-abiword", - _ => "application/octet-stream" - }; + public abstract Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default); } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj index f3f16874b4c..f2e895df244 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj @@ -15,6 +15,10 @@ $(NoWarn);S1694 + + + + diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index e6a14bfbf17..60e3a02bd94 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.DataIngestion; /// /// Reads documents by converting them to Markdown using the MarkItDown MCP server. /// -public class MarkItDownMcpReader : IngestionDocumentReader +public class MarkItDownMcpReader : IngestionDocumentReader, IIngestionDocumentReader { private readonly Uri _mcpServerUri; private readonly McpClientOptions? _options; @@ -65,7 +65,7 @@ public override async Task ReadAsync(FileInfo source, string } /// - public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); @@ -79,10 +79,18 @@ public override async Task ReadAsync(Stream source, string id #endif DataContent dataContent = new( ms.GetBuffer().AsMemory(0, (int)ms.Length), - string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType); + string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!); - string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false); + return await ReadAsync(dataContent, identifier, mediaType, cancellationToken).ConfigureAwait(false); + } + + /// + public async Task ReadAsync(DataContent source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(source); + _ = Throw.IfNullOrEmpty(identifier); + string markdown = await ConvertToMarkdownAsync(source, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs index 79b60f3ad5d..3ea255070a5 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs @@ -94,7 +94,7 @@ public override async Task ReadAsync(FileInfo source, string /// /// The contents of are copied to a temporary file. - public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md index 095011b77f1..4f846b09611 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md @@ -30,7 +30,7 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkItDownReader(new FileInfo(@"pathToMarkItDown.exe"), extractImages: true); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); ``` ### Creating a MarkItDownMcpReader for Data Ingestion (MCP Server) @@ -44,7 +44,7 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/mcp")); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); ``` The MarkItDown MCP server can be run using Docker: diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs index 1afabd03139..3581dd4bf9c 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs @@ -29,7 +29,7 @@ public override async Task ReadAsync(FileInfo source, string } /// - public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md index c6a2328699c..c8dc2af1968 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md @@ -27,7 +27,7 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkdownReader(); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); ``` ## Feedback & Contributing diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs index 4251bef6ae3..97aa3d86866 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs @@ -24,6 +24,7 @@ internal static class ProcessFiles internal static class ProcessSource { + internal const string ActivityName = "ProcessSource"; internal const string DocumentIdTagName = "rag.document.id"; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs new file mode 100644 index 00000000000..4a0931f4b57 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs @@ -0,0 +1,124 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; +using static Microsoft.Extensions.DataIngestion.DiagnosticsConstants; + +namespace Microsoft.Extensions.DataIngestion; + +#pragma warning disable IDE0058 // Expression value is never used +#pragma warning disable IDE0063 // Use simple 'using' statement +#pragma warning disable CA1031 // Do not catch general exception types + +/// +/// Provides a set of File System extension methods for the class. +/// +public static class FileSystemIngestionExtensions +{ + /// + /// Processes all files in the specified directory that match the given search pattern and option. + /// + /// The type of the chunk content. + /// The ingestion pipeline. + /// The directory to process. + /// The search pattern for file selection. + /// The search option for directory traversal. + /// The cancellation token for the operation. + /// A task representing the asynchronous operation. + public static async IAsyncEnumerable ProcessAsync( + this IngestionPipeline pipeline, + DirectoryInfo directory, string searchPattern = "*.*", + SearchOption searchOption = SearchOption.TopDirectoryOnly, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + Throw.IfNull(pipeline); + Throw.IfNull(directory); + Throw.IfNullOrEmpty(searchPattern); + Throw.IfOutOfRange((int)searchOption, (int)SearchOption.TopDirectoryOnly, (int)SearchOption.AllDirectories); + + using (Activity? rootActivity = pipeline.ActivitySource.StartActivity(ProcessDirectory.ActivityName)) + { + rootActivity?.SetTag(ProcessDirectory.DirectoryPathTagName, directory.FullName) + .SetTag(ProcessDirectory.SearchPatternTagName, searchPattern) + .SetTag(ProcessDirectory.SearchOptionTagName, searchOption.ToString()); + pipeline.Logger?.ProcessingDirectory(directory.FullName, searchPattern, searchOption); + + var files = directory.EnumerateFiles(searchPattern, searchOption); + await foreach (var ingestionResult in pipeline.ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false)) + { + yield return ingestionResult; + } + } + } + + /// + /// Processes the specified files. + /// + /// The type of the chunk content. + /// The ingestion pipeline. + /// The collection of files to process. + /// The cancellation token for the operation. + /// A task representing the asynchronous operation. + public static async IAsyncEnumerable ProcessAsync( + this IngestionPipeline pipeline, + IEnumerable files, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + Throw.IfNull(pipeline); + Throw.IfNull(files); + + using (Activity? rootActivity = pipeline.ActivitySource.StartActivity(ProcessFiles.ActivityName)) + { + await foreach (var ingestionResult in pipeline.ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false)) + { + yield return ingestionResult; + } + } + } + + private static async IAsyncEnumerable ProcessAsync( + this IngestionPipeline pipeline, + IEnumerable files, Activity? rootActivity, + [EnumeratorCancellation] CancellationToken cancellationToken) + { +#if NET + if (System.Linq.Enumerable.TryGetNonEnumeratedCount(files, out int count)) +#else + if (files is IReadOnlyCollection { Count: int count }) +#endif + { + rootActivity?.SetTag(ProcessFiles.FileCountTagName, count); + pipeline.Logger?.LogFileCount(count); + } + + foreach (FileInfo fileInfo in files) + { + using (Activity? processFileActivity = pipeline.ActivitySource.StartActivity(ProcessFile.ActivityName, ActivityKind.Internal, parentContext: rootActivity?.Context ?? default)) + { + processFileActivity?.SetTag(ProcessFile.FilePathTagName, fileInfo.FullName); + + Exception? failure = null; + IngestionDocument? document = null; + + try + { + document = await pipeline.ProcessAsync(fileInfo, fileInfo.FullName, fileInfo.GetMediaType(), cancellationToken).ConfigureAwait(false); + } + catch (Exception e) + { + failure = e; + } + + string documentId = document?.Identifier ?? fileInfo.FullName; + yield return new IngestionResult(documentId, document, failure); + } + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs index 1eeb94058ee..e130d8086a2 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Diagnostics; -using System.IO; using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; @@ -21,17 +20,18 @@ namespace Microsoft.Extensions.DataIngestion; /// /// Represents a pipeline for ingesting data from documents and processing it into chunks. /// -/// The type of the chunk content. -public sealed class IngestionPipeline : IDisposable +/// The type of the source content. +/// The type of the chunk content. +public sealed class IngestionPipeline : IDisposable { - private readonly IngestionDocumentReader _reader; - private readonly IngestionChunker _chunker; - private readonly IngestionChunkWriter _writer; - private readonly ActivitySource _activitySource; - private readonly ILogger? _logger; + internal readonly ActivitySource ActivitySource; + internal readonly ILogger? Logger; + private readonly IIngestionDocumentReader _reader; + private readonly IngestionChunker _chunker; + private readonly IngestionChunkWriter _writer; /// - /// Initializes a new instance of the class. + /// Initializes a new instance of the class. /// /// The reader for ingestion documents. /// The chunker to split documents into chunks. @@ -39,24 +39,24 @@ public sealed class IngestionPipeline : IDisposable /// The options for the ingestion pipeline. /// The logger factory for creating loggers. public IngestionPipeline( - IngestionDocumentReader reader, - IngestionChunker chunker, - IngestionChunkWriter writer, + IIngestionDocumentReader reader, + IngestionChunker chunker, + IngestionChunkWriter writer, IngestionPipelineOptions? options = default, ILoggerFactory? loggerFactory = default) { _reader = Throw.IfNull(reader); _chunker = Throw.IfNull(chunker); _writer = Throw.IfNull(writer); - _activitySource = new((options ?? new()).ActivitySourceName); - _logger = loggerFactory?.CreateLogger>(); + ActivitySource = new((options ?? new()).ActivitySourceName); + Logger = loggerFactory?.CreateLogger>(); } /// public void Dispose() { _writer.Dispose(); - _activitySource.Dispose(); + ActivitySource.Dispose(); } /// @@ -67,52 +67,39 @@ public void Dispose() /// /// Gets the chunk processors in the pipeline. /// - public IList> ChunkProcessors { get; } = []; + public IList> ChunkProcessors { get; } = []; /// - /// Processes all files in the specified directory that match the given search pattern and option. + /// Processes the specified input. /// - /// The directory to process. - /// The search pattern for file selection. - /// The search option for directory traversal. - /// The cancellation token for the operation. + /// The source input to process. + /// The unique documentIdentifier for the document. + /// The media type of the source. + /// The token to monitor for cancellation requests. /// A task representing the asynchronous operation. - public async IAsyncEnumerable ProcessAsync(DirectoryInfo directory, string searchPattern = "*.*", - SearchOption searchOption = SearchOption.TopDirectoryOnly, [EnumeratorCancellation] CancellationToken cancellationToken = default) + public async Task ProcessAsync(TSource source, string documentIdentifier, string? sourceMediaType = null, CancellationToken cancellationToken = default) { - Throw.IfNull(directory); - Throw.IfNullOrEmpty(searchPattern); - Throw.IfOutOfRange((int)searchOption, (int)SearchOption.TopDirectoryOnly, (int)SearchOption.AllDirectories); + Throw.IfNull(source); + Throw.IfNull(documentIdentifier); - using (Activity? rootActivity = _activitySource.StartActivity(ProcessDirectory.ActivityName)) + using (Activity? processActivity = ActivitySource.StartActivity(ProcessSource.ActivityName)) { - rootActivity?.SetTag(ProcessDirectory.DirectoryPathTagName, directory.FullName) - .SetTag(ProcessDirectory.SearchPatternTagName, searchPattern) - .SetTag(ProcessDirectory.SearchOptionTagName, searchOption.ToString()); - _logger?.ProcessingDirectory(directory.FullName, searchPattern, searchOption); - - await foreach (var ingestionResult in ProcessAsync(directory.EnumerateFiles(searchPattern, searchOption), rootActivity, cancellationToken).ConfigureAwait(false)) + IngestionDocument? document = null; + try { - yield return ingestionResult; - } - } - } + document = await _reader.ReadAsync(source, documentIdentifier, sourceMediaType, cancellationToken).ConfigureAwait(false); - /// - /// Processes the specified files. - /// - /// The collection of files to process. - /// The cancellation token for the operation. - /// A task representing the asynchronous operation. - public async IAsyncEnumerable ProcessAsync(IEnumerable files, [EnumeratorCancellation] CancellationToken cancellationToken = default) - { - Throw.IfNull(files); + processActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier); + Logger?.ReadDocument(document.Identifier); - using (Activity? rootActivity = _activitySource.StartActivity(ProcessFiles.ActivityName)) - { - await foreach (var ingestionResult in ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false)) + return await IngestAsync(document, processActivity, cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) { - yield return ingestionResult; + TraceException(processActivity, ex); + Logger?.IngestingFailed(ex, document?.Identifier ?? documentIdentifier); + + throw; } } } @@ -125,51 +112,6 @@ private static void TraceException(Activity? activity, Exception ex) .SetStatus(ActivityStatusCode.Error, ex.Message); } - private async IAsyncEnumerable ProcessAsync(IEnumerable files, Activity? rootActivity, - [EnumeratorCancellation] CancellationToken cancellationToken) - { -#if NET - if (System.Linq.Enumerable.TryGetNonEnumeratedCount(files, out int count)) -#else - if (files is IReadOnlyCollection { Count: int count }) -#endif - { - rootActivity?.SetTag(ProcessFiles.FileCountTagName, count); - _logger?.LogFileCount(count); - } - - foreach (FileInfo fileInfo in files) - { - using (Activity? processFileActivity = _activitySource.StartActivity(ProcessFile.ActivityName, ActivityKind.Internal, parentContext: rootActivity?.Context ?? default)) - { - processFileActivity?.SetTag(ProcessFile.FilePathTagName, fileInfo.FullName); - _logger?.ReadingFile(fileInfo.FullName, GetShortName(_reader)); - - IngestionDocument? document = null; - Exception? failure = null; - try - { - document = await _reader.ReadAsync(fileInfo, cancellationToken).ConfigureAwait(false); - - processFileActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier); - _logger?.ReadDocument(document.Identifier); - - document = await IngestAsync(document, processFileActivity, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - TraceException(processFileActivity, ex); - _logger?.IngestingFailed(ex, document?.Identifier ?? fileInfo.FullName); - - failure = ex; - } - - string documentId = document?.Identifier ?? fileInfo.FullName; - yield return new IngestionResult(documentId, document, failure); - } - } - } - private async Task IngestAsync(IngestionDocument document, Activity? parentActivity, CancellationToken cancellationToken) { foreach (IngestionDocumentProcessor processor in DocumentProcessors) @@ -180,15 +122,15 @@ private async Task IngestAsync(IngestionDocument document, Ac parentActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier); } - IAsyncEnumerable> chunks = _chunker.ProcessAsync(document, cancellationToken); + IAsyncEnumerable> chunks = _chunker.ProcessAsync(document, cancellationToken); foreach (var processor in ChunkProcessors) { chunks = processor.ProcessAsync(chunks, cancellationToken); } - _logger?.WritingChunks(GetShortName(_writer)); + Logger?.WritingChunks(GetShortName(_writer)); await _writer.WriteAsync(chunks, cancellationToken).ConfigureAwait(false); - _logger?.WroteChunks(document.Identifier); + Logger?.WroteChunks(document.Identifier); return document; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs new file mode 100644 index 00000000000..155bf6e58f2 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.IO; + +namespace Microsoft.Extensions.DataIngestion; + +internal static class MediaTypeProvider +{ + internal static string GetMediaType(this FileInfo source) + => source.Extension switch + { + ".123" => "application/vnd.lotus-1-2-3", + ".602" => "application/x-t602", + ".abw" => "application/x-abiword", + ".bmp" => "image/bmp", + ".cgm" => "image/cgm", + ".csv" => "text/csv", + ".cwk" => "application/x-cwk", + ".dbf" => "application/vnd.dbf", + ".dif" => "application/x-dif", + ".doc" => "application/msword", + ".docm" => "application/vnd.ms-word.document.macroEnabled.12", + ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".dot" => "application/msword", + ".dotm" => "application/vnd.ms-word.template.macroEnabled.12", + ".dotx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.template", + ".epub" => "application/epub+zip", + ".et" => "application/vnd.ms-excel", + ".eth" => "application/ethos", + ".fods" => "application/vnd.oasis.opendocument.spreadsheet", + ".gif" => "image/gif", + ".htm" => "text/html", + ".html" => "text/html", + ".hwp" => "application/x-hwp", + ".jpeg" => "image/jpeg", + ".jpg" => "image/jpeg", + ".key" => "application/x-iwork-keynote-sffkey", + ".lwp" => "application/vnd.lotus-wordpro", + ".mcw" => "application/macwriteii", + ".markdown" or ".md" => "text/markdown", + ".mw" => "application/macwriteii", + ".numbers" => "application/x-iwork-numbers-sffnumbers", + ".ods" => "application/vnd.oasis.opendocument.spreadsheet", + ".pages" => "application/x-iwork-pages-sffpages", + ".pbd" => "application/x-pagemaker", + ".pdf" => "application/pdf", + ".png" => "image/png", + ".pot" => "application/vnd.ms-powerpoint", + ".potm" => "application/vnd.ms-powerpoint.template.macroEnabled.12", + ".potx" => "application/vnd.openxmlformats-officedocument.presentationml.template", + ".ppt" => "application/vnd.ms-powerpoint", + ".pptm" => "application/vnd.ms-powerpoint.presentation.macroEnabled.12", + ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ".prn" => "application/x-prn", + ".qpw" => "application/x-quattro-pro", + ".rtf" => "application/rtf", + ".sda" => "application/vnd.stardivision.draw", + ".sdd" => "application/vnd.stardivision.impress", + ".sdp" => "application/sdp", + ".sdw" => "application/vnd.stardivision.writer", + ".sgl" => "application/vnd.stardivision.writer", + ".slk" => "text/vnd.sylk", + ".sti" => "application/vnd.sun.xml.impress.template", + ".stw" => "application/vnd.sun.xml.writer.template", + ".svg" => "image/svg+xml", + ".sxg" => "application/vnd.sun.xml.writer.global", + ".sxi" => "application/vnd.sun.xml.impress", + ".sxw" => "application/vnd.sun.xml.writer", + ".sylk" => "text/vnd.sylk", + ".tiff" => "image/tiff", + ".tsv" => "text/tab-separated-values", + ".txt" => "text/plain", + ".uof" => "application/vnd.uoml+xml", + ".uop" => "application/vnd.openofficeorg.presentation", + ".uos1" => "application/vnd.uoml+xml", + ".uos2" => "application/vnd.uoml+xml", + ".uot" => "application/x-uo", + ".vor" => "application/vnd.stardivision.writer", + ".webp" => "image/webp", + ".wpd" => "application/wordperfect", + ".wps" => "application/vnd.ms-works", + ".wq1" => "application/x-lotus", + ".wq2" => "application/x-lotus", + ".xls" => "application/vnd.ms-excel", + ".xlsb" => "application/vnd.ms-excel.sheet.binary.macroEnabled.12", + ".xlsm" => "application/vnd.ms-excel.sheet.macroEnabled.12", + ".xlr" => "application/vnd.ms-works", + ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".xlw" => "application/vnd.ms-excel", + ".xml" => "application/xml", + ".zabw" => "application/x-abiword", + _ => "application/octet-stream" + }; +} diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs index af23c29f13c..e896922a88a 100644 --- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs +++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs index 5122cde3e9a..50f802da014 100644 --- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs +++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs @@ -23,7 +23,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs index 2e1ef18494d..0f3233c9d26 100644 --- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs +++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs @@ -8,7 +8,7 @@ namespace ChatWithCustomData_CSharp.Web.Services.Ingestion; internal sealed class PdfPigReader : IngestionDocumentReader { - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { using var pdf = PdfDocument.Open(source); var document = new IngestionDocument(identifier); diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs index f2f0d85c458..2e9503bf498 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs @@ -86,7 +86,7 @@ public async Task CanProcessDocuments() using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount); - using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); List ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync(); Assert.Equal(_sampleFiles.Count, ingestionResults.Count); @@ -119,7 +119,7 @@ public async Task CanProcessDocumentsInDirectory() using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount); - using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); DirectoryInfo directory = new("TestFiles"); List ingestionResults = await pipeline.ProcessAsync(directory, "*.md").ToListAsync(); @@ -152,7 +152,7 @@ public async Task ChunksCanBeMoreThanJustText() TestEmbeddingGenerator embeddingGenerator = new(); using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount); - using IngestionPipeline pipeline = new(CreateReader(), new ImageChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(CreateReader(), new ImageChunker(), vectorStoreWriter); Assert.False(embeddingGenerator.WasCalled); var ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync(); @@ -189,7 +189,7 @@ public async Task SingleFailureDoesNotTearDownEntirePipeline() { int failed = 0; MarkdownReader workingReader = new(); - TestReader failingForFirstReader = new( + TestReader failingForFirstReader = new( (source, identifier, mediaType, cancellationToken) => failed++ == 0 ? Task.FromException(new ExpectedException()) : workingReader.ReadAsync(source, identifier, mediaType, cancellationToken)); @@ -201,7 +201,7 @@ public async Task SingleFailureDoesNotTearDownEntirePipeline() using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount); - using IngestionPipeline pipeline = new(failingForFirstReader, CreateChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(failingForFirstReader, CreateChunker(), vectorStoreWriter); await Verify(pipeline.ProcessAsync(_sampleFiles)); await Verify(pipeline.ProcessAsync(_sampleDirectory)); @@ -221,6 +221,40 @@ async Task Verify(IAsyncEnumerable results) } } + [Fact] + public async Task SourceCanBeAnything() + { + TestReader testReader = new((index, id, mediaType, ct) => + { + return new MarkdownReader().ReadAsync(_sampleFiles[index], _sampleFiles[index].FullName, mediaType, ct); + }); + + TestEmbeddingGenerator embeddingGenerator = new(); + using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); + using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount); + + using IngestionPipeline pipeline = new(testReader, CreateChunker(), vectorStoreWriter); + + for (int i = 0; i < _sampleFiles.Count; i++) + { + await pipeline.ProcessAsync(i, _sampleFiles[i].FullName); + } + + Assert.True(embeddingGenerator.WasCalled, "Embedding generator should have been called."); + + var retrieved = await vectorStoreWriter.VectorStoreCollection + .GetAsync(record => _sampleFiles.Any(info => info.FullName == (string)record["documentid"]!), top: 1000) + .ToListAsync(); + + Assert.NotEmpty(retrieved); + for (int i = 0; i < retrieved.Count; i++) + { + Assert.NotEqual(Guid.Empty, (Guid)retrieved[i]["key"]!); + Assert.NotEmpty((string)retrieved[i]["content"]!); + Assert.Contains((string)retrieved[i]["documentid"]!, _sampleFiles.Select(info => info.FullName)); + } + } + private static IngestionDocumentReader CreateReader() => new MarkdownReader(); private static IngestionChunker CreateChunker() => new HeaderChunker(new(TiktokenTokenizer.CreateForModel("gpt-4"))); diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs index 37142f8b20e..b1af4584a9d 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs @@ -34,7 +34,7 @@ public async Task ReadAsync_ThrowsWhenSourceIsNull() { var reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/sse")); - await Assert.ThrowsAsync("source", async () => await reader.ReadAsync(null!, "identifier")); + await Assert.ThrowsAsync("source", async () => await reader.ReadAsync((FileInfo)null!, "identifier")); await Assert.ThrowsAsync("source", async () => await reader.ReadAsync((Stream)null!, "identifier", "mediaType")); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs index aa039de5e31..72db4f1a6eb 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs @@ -2,21 +2,20 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; -using System.IO; using System.Threading; using System.Threading.Tasks; namespace Microsoft.Extensions.DataIngestion; -internal sealed class TestReader : IngestionDocumentReader +internal sealed class TestReader : IIngestionDocumentReader { - public TestReader(Func> readAsyncCallback) + public TestReader(Func> readAsyncCallback) { ReadAsyncCallback = readAsyncCallback; } - public Func> ReadAsyncCallback { get; } + public Func> ReadAsyncCallback { get; } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public Task ReadAsync(TSource source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => ReadAsyncCallback(source, identifier, mediaType, cancellationToken); } diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 9dd366a03a5..8fc82ea91f4 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs index 60fcdbdc128..27bf84b65cd 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs @@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs index d97b986b694..b7a0a8e5ce6 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs index 315a6ad3d53..44a44e4e337 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs @@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs index f6de539eb22..de7e1027d19 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs @@ -8,7 +8,7 @@ namespace aichatweb.Services.Ingestion; internal sealed class PdfPigReader : IngestionDocumentReader { - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { using var pdf = PdfDocument.Open(source); var document = new IngestionDocument(identifier); diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 9dd366a03a5..8fc82ea91f4 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs index 60fcdbdc128..27bf84b65cd 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs @@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 9dd366a03a5..8fc82ea91f4 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs index 60fcdbdc128..27bf84b65cd 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs @@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs index d97b986b694..b7a0a8e5ce6 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs @@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) IncrementalIngestion = false, }); - using var pipeline = new IngestionPipeline( + using var pipeline = new IngestionPipeline( reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs index 315a6ad3d53..44a44e4e337 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs @@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi return base.ReadAsync(source, identifier, mediaType, cancellationToken); } - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) => mediaType switch { "application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken), diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs index f6de539eb22..de7e1027d19 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs @@ -8,7 +8,7 @@ namespace aichatweb.Services.Ingestion; internal sealed class PdfPigReader : IngestionDocumentReader { - public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) + public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) { using var pdf = PdfDocument.Open(source); var document = new IngestionDocument(identifier);