diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs
new file mode 100644
index 00000000000..edda9283bce
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IIngestionDocumentReader.cs
@@ -0,0 +1,27 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.IO;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.DataIngestion;
+
+///
+/// Reads source content and converts it to an .
+///
+/// The type of the source to read from. Sample: , , etc.
+public interface IIngestionDocumentReader
+{
+ ///
+ /// Reads a source and converts it to an .
+ ///
+ /// The source to read.
+ /// The unique identifier for the document.
+ /// The media type of the source (if needed).
+ /// The token to monitor for cancellation requests.
+ /// A task representing the asynchronous read operation.
+ /// or is or empty.
+ Task ReadAsync(TSource source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default);
+}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs
index 8bdb651321a..971170e57d5 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionDocumentReader.cs
@@ -12,7 +12,7 @@ namespace Microsoft.Extensions.DataIngestion;
///
/// Reads source content and converts it to an .
///
-public abstract class IngestionDocumentReader
+public abstract class IngestionDocumentReader : IIngestionDocumentReader, IIngestionDocumentReader
{
///
/// Reads a file and converts it to an .
@@ -24,7 +24,7 @@ public abstract class IngestionDocumentReader
public Task ReadAsync(FileInfo source, CancellationToken cancellationToken = default)
{
string identifier = Throw.IfNull(source).FullName; // entire path is more unique than just part of it.
- return ReadAsync(source, identifier, GetMediaType(source), cancellationToken);
+ return ReadAsync(source, identifier, source.GetMediaType(), cancellationToken);
}
///
@@ -42,7 +42,7 @@ public virtual async Task ReadAsync(FileInfo source, string i
_ = Throw.IfNullOrEmpty(identifier);
using FileStream stream = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 1, FileOptions.Asynchronous);
- return await ReadAsync(stream, identifier, string.IsNullOrEmpty(mediaType) ? GetMediaType(source) : mediaType!, cancellationToken).ConfigureAwait(false);
+ return await ReadAsync(stream, identifier, string.IsNullOrEmpty(mediaType) ? source.GetMediaType() : mediaType!, cancellationToken).ConfigureAwait(false);
}
///
@@ -53,90 +53,5 @@ public virtual async Task ReadAsync(FileInfo source, string i
/// The media type of the content.
/// The token to monitor for cancellation requests.
/// A task representing the asynchronous read operation.
- public abstract Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default);
-
- private static string GetMediaType(FileInfo source)
- => source.Extension switch
- {
- ".123" => "application/vnd.lotus-1-2-3",
- ".602" => "application/x-t602",
- ".abw" => "application/x-abiword",
- ".bmp" => "image/bmp",
- ".cgm" => "image/cgm",
- ".csv" => "text/csv",
- ".cwk" => "application/x-cwk",
- ".dbf" => "application/vnd.dbf",
- ".dif" => "application/x-dif",
- ".doc" => "application/msword",
- ".docm" => "application/vnd.ms-word.document.macroEnabled.12",
- ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ".dot" => "application/msword",
- ".dotm" => "application/vnd.ms-word.template.macroEnabled.12",
- ".dotx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
- ".epub" => "application/epub+zip",
- ".et" => "application/vnd.ms-excel",
- ".eth" => "application/ethos",
- ".fods" => "application/vnd.oasis.opendocument.spreadsheet",
- ".gif" => "image/gif",
- ".htm" => "text/html",
- ".html" => "text/html",
- ".hwp" => "application/x-hwp",
- ".jpeg" => "image/jpeg",
- ".jpg" => "image/jpeg",
- ".key" => "application/x-iwork-keynote-sffkey",
- ".lwp" => "application/vnd.lotus-wordpro",
- ".mcw" => "application/macwriteii",
- ".mw" => "application/macwriteii",
- ".numbers" => "application/x-iwork-numbers-sffnumbers",
- ".ods" => "application/vnd.oasis.opendocument.spreadsheet",
- ".pages" => "application/x-iwork-pages-sffpages",
- ".pbd" => "application/x-pagemaker",
- ".pdf" => "application/pdf",
- ".png" => "image/png",
- ".pot" => "application/vnd.ms-powerpoint",
- ".potm" => "application/vnd.ms-powerpoint.template.macroEnabled.12",
- ".potx" => "application/vnd.openxmlformats-officedocument.presentationml.template",
- ".ppt" => "application/vnd.ms-powerpoint",
- ".pptm" => "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
- ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- ".prn" => "application/x-prn",
- ".qpw" => "application/x-quattro-pro",
- ".rtf" => "application/rtf",
- ".sda" => "application/vnd.stardivision.draw",
- ".sdd" => "application/vnd.stardivision.impress",
- ".sdp" => "application/sdp",
- ".sdw" => "application/vnd.stardivision.writer",
- ".sgl" => "application/vnd.stardivision.writer",
- ".slk" => "text/vnd.sylk",
- ".sti" => "application/vnd.sun.xml.impress.template",
- ".stw" => "application/vnd.sun.xml.writer.template",
- ".svg" => "image/svg+xml",
- ".sxg" => "application/vnd.sun.xml.writer.global",
- ".sxi" => "application/vnd.sun.xml.impress",
- ".sxw" => "application/vnd.sun.xml.writer",
- ".sylk" => "text/vnd.sylk",
- ".tiff" => "image/tiff",
- ".tsv" => "text/tab-separated-values",
- ".txt" => "text/plain",
- ".uof" => "application/vnd.uoml+xml",
- ".uop" => "application/vnd.openofficeorg.presentation",
- ".uos1" => "application/vnd.uoml+xml",
- ".uos2" => "application/vnd.uoml+xml",
- ".uot" => "application/x-uo",
- ".vor" => "application/vnd.stardivision.writer",
- ".webp" => "image/webp",
- ".wpd" => "application/wordperfect",
- ".wps" => "application/vnd.ms-works",
- ".wq1" => "application/x-lotus",
- ".wq2" => "application/x-lotus",
- ".xls" => "application/vnd.ms-excel",
- ".xlsb" => "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
- ".xlsm" => "application/vnd.ms-excel.sheet.macroEnabled.12",
- ".xlr" => "application/vnd.ms-works",
- ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
- ".xlw" => "application/vnd.ms-excel",
- ".xml" => "application/xml",
- ".zabw" => "application/x-abiword",
- _ => "application/octet-stream"
- };
+ public abstract Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default);
}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj
index f3f16874b4c..f2e895df244 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/Microsoft.Extensions.DataIngestion.Abstractions.csproj
@@ -15,6 +15,10 @@
$(NoWarn);S1694
+
+
+
+
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs
index e6a14bfbf17..60e3a02bd94 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs
@@ -16,7 +16,7 @@ namespace Microsoft.Extensions.DataIngestion;
///
/// Reads documents by converting them to Markdown using the MarkItDown MCP server.
///
-public class MarkItDownMcpReader : IngestionDocumentReader
+public class MarkItDownMcpReader : IngestionDocumentReader, IIngestionDocumentReader
{
private readonly Uri _mcpServerUri;
private readonly McpClientOptions? _options;
@@ -65,7 +65,7 @@ public override async Task ReadAsync(FileInfo source, string
}
///
- public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(source);
_ = Throw.IfNullOrEmpty(identifier);
@@ -79,10 +79,18 @@ public override async Task ReadAsync(Stream source, string id
#endif
DataContent dataContent = new(
ms.GetBuffer().AsMemory(0, (int)ms.Length),
- string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType);
+ string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!);
- string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false);
+ return await ReadAsync(dataContent, identifier, mediaType, cancellationToken).ConfigureAwait(false);
+ }
+
+ ///
+ public async Task ReadAsync(DataContent source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
+ {
+ _ = Throw.IfNull(source);
+ _ = Throw.IfNullOrEmpty(identifier);
+ string markdown = await ConvertToMarkdownAsync(source, cancellationToken).ConfigureAwait(false);
return MarkdownParser.Parse(markdown, identifier);
}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs
index 79b60f3ad5d..3ea255070a5 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownReader.cs
@@ -94,7 +94,7 @@ public override async Task ReadAsync(FileInfo source, string
///
/// The contents of are copied to a temporary file.
- public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(source);
_ = Throw.IfNullOrEmpty(identifier);
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md
index 095011b77f1..4f846b09611 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md
@@ -30,7 +30,7 @@ using Microsoft.Extensions.DataIngestion;
IngestionDocumentReader reader =
new MarkItDownReader(new FileInfo(@"pathToMarkItDown.exe"), extractImages: true);
-using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
+using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
```
### Creating a MarkItDownMcpReader for Data Ingestion (MCP Server)
@@ -44,7 +44,7 @@ using Microsoft.Extensions.DataIngestion;
IngestionDocumentReader reader =
new MarkItDownMcpReader(new Uri("http://localhost:3001/mcp"));
-using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
+using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
```
The MarkItDown MCP server can be run using Docker:
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs
index 1afabd03139..3581dd4bf9c 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/MarkdownReader.cs
@@ -29,7 +29,7 @@ public override async Task ReadAsync(FileInfo source, string
}
///
- public override async Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override async Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(source);
_ = Throw.IfNullOrEmpty(identifier);
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md
index c6a2328699c..c8dc2af1968 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md
@@ -27,7 +27,7 @@ using Microsoft.Extensions.DataIngestion;
IngestionDocumentReader reader = new MarkdownReader();
-using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
+using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter());
```
## Feedback & Contributing
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs
index 4251bef6ae3..97aa3d86866 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs
@@ -24,6 +24,7 @@ internal static class ProcessFiles
internal static class ProcessSource
{
+ internal const string ActivityName = "ProcessSource";
internal const string DocumentIdTagName = "rag.document.id";
}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs
new file mode 100644
index 00000000000..4a0931f4b57
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion/FileSystemIngestionExtensions.cs
@@ -0,0 +1,124 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Shared.Diagnostics;
+using static Microsoft.Extensions.DataIngestion.DiagnosticsConstants;
+
+namespace Microsoft.Extensions.DataIngestion;
+
+#pragma warning disable IDE0058 // Expression value is never used
+#pragma warning disable IDE0063 // Use simple 'using' statement
+#pragma warning disable CA1031 // Do not catch general exception types
+
+///
+/// Provides a set of File System extension methods for the class.
+///
+public static class FileSystemIngestionExtensions
+{
+ ///
+ /// Processes all files in the specified directory that match the given search pattern and option.
+ ///
+ /// The type of the chunk content.
+ /// The ingestion pipeline.
+ /// The directory to process.
+ /// The search pattern for file selection.
+ /// The search option for directory traversal.
+ /// The cancellation token for the operation.
+ /// A task representing the asynchronous operation.
+ public static async IAsyncEnumerable ProcessAsync(
+ this IngestionPipeline pipeline,
+ DirectoryInfo directory, string searchPattern = "*.*",
+ SearchOption searchOption = SearchOption.TopDirectoryOnly,
+ [EnumeratorCancellation] CancellationToken cancellationToken = default)
+ {
+ Throw.IfNull(pipeline);
+ Throw.IfNull(directory);
+ Throw.IfNullOrEmpty(searchPattern);
+ Throw.IfOutOfRange((int)searchOption, (int)SearchOption.TopDirectoryOnly, (int)SearchOption.AllDirectories);
+
+ using (Activity? rootActivity = pipeline.ActivitySource.StartActivity(ProcessDirectory.ActivityName))
+ {
+ rootActivity?.SetTag(ProcessDirectory.DirectoryPathTagName, directory.FullName)
+ .SetTag(ProcessDirectory.SearchPatternTagName, searchPattern)
+ .SetTag(ProcessDirectory.SearchOptionTagName, searchOption.ToString());
+ pipeline.Logger?.ProcessingDirectory(directory.FullName, searchPattern, searchOption);
+
+ var files = directory.EnumerateFiles(searchPattern, searchOption);
+ await foreach (var ingestionResult in pipeline.ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false))
+ {
+ yield return ingestionResult;
+ }
+ }
+ }
+
+ ///
+ /// Processes the specified files.
+ ///
+ /// The type of the chunk content.
+ /// The ingestion pipeline.
+ /// The collection of files to process.
+ /// The cancellation token for the operation.
+ /// A task representing the asynchronous operation.
+ public static async IAsyncEnumerable ProcessAsync(
+ this IngestionPipeline pipeline,
+ IEnumerable files,
+ [EnumeratorCancellation] CancellationToken cancellationToken = default)
+ {
+ Throw.IfNull(pipeline);
+ Throw.IfNull(files);
+
+ using (Activity? rootActivity = pipeline.ActivitySource.StartActivity(ProcessFiles.ActivityName))
+ {
+ await foreach (var ingestionResult in pipeline.ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false))
+ {
+ yield return ingestionResult;
+ }
+ }
+ }
+
+ private static async IAsyncEnumerable ProcessAsync(
+ this IngestionPipeline pipeline,
+ IEnumerable files, Activity? rootActivity,
+ [EnumeratorCancellation] CancellationToken cancellationToken)
+ {
+#if NET
+ if (System.Linq.Enumerable.TryGetNonEnumeratedCount(files, out int count))
+#else
+ if (files is IReadOnlyCollection { Count: int count })
+#endif
+ {
+ rootActivity?.SetTag(ProcessFiles.FileCountTagName, count);
+ pipeline.Logger?.LogFileCount(count);
+ }
+
+ foreach (FileInfo fileInfo in files)
+ {
+ using (Activity? processFileActivity = pipeline.ActivitySource.StartActivity(ProcessFile.ActivityName, ActivityKind.Internal, parentContext: rootActivity?.Context ?? default))
+ {
+ processFileActivity?.SetTag(ProcessFile.FilePathTagName, fileInfo.FullName);
+
+ Exception? failure = null;
+ IngestionDocument? document = null;
+
+ try
+ {
+ document = await pipeline.ProcessAsync(fileInfo, fileInfo.FullName, fileInfo.GetMediaType(), cancellationToken).ConfigureAwait(false);
+ }
+ catch (Exception e)
+ {
+ failure = e;
+ }
+
+ string documentId = document?.Identifier ?? fileInfo.FullName;
+ yield return new IngestionResult(documentId, document, failure);
+ }
+ }
+ }
+}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs
index 1eeb94058ee..e130d8086a2 100644
--- a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs
@@ -4,7 +4,6 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
-using System.IO;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;
@@ -21,17 +20,18 @@ namespace Microsoft.Extensions.DataIngestion;
///
/// Represents a pipeline for ingesting data from documents and processing it into chunks.
///
-/// The type of the chunk content.
-public sealed class IngestionPipeline : IDisposable
+/// The type of the source content.
+/// The type of the chunk content.
+public sealed class IngestionPipeline : IDisposable
{
- private readonly IngestionDocumentReader _reader;
- private readonly IngestionChunker _chunker;
- private readonly IngestionChunkWriter _writer;
- private readonly ActivitySource _activitySource;
- private readonly ILogger? _logger;
+ internal readonly ActivitySource ActivitySource;
+ internal readonly ILogger? Logger;
+ private readonly IIngestionDocumentReader _reader;
+ private readonly IngestionChunker _chunker;
+ private readonly IngestionChunkWriter _writer;
///
- /// Initializes a new instance of the class.
+ /// Initializes a new instance of the class.
///
/// The reader for ingestion documents.
/// The chunker to split documents into chunks.
@@ -39,24 +39,24 @@ public sealed class IngestionPipeline : IDisposable
/// The options for the ingestion pipeline.
/// The logger factory for creating loggers.
public IngestionPipeline(
- IngestionDocumentReader reader,
- IngestionChunker chunker,
- IngestionChunkWriter writer,
+ IIngestionDocumentReader reader,
+ IngestionChunker chunker,
+ IngestionChunkWriter writer,
IngestionPipelineOptions? options = default,
ILoggerFactory? loggerFactory = default)
{
_reader = Throw.IfNull(reader);
_chunker = Throw.IfNull(chunker);
_writer = Throw.IfNull(writer);
- _activitySource = new((options ?? new()).ActivitySourceName);
- _logger = loggerFactory?.CreateLogger>();
+ ActivitySource = new((options ?? new()).ActivitySourceName);
+ Logger = loggerFactory?.CreateLogger>();
}
///
public void Dispose()
{
_writer.Dispose();
- _activitySource.Dispose();
+ ActivitySource.Dispose();
}
///
@@ -67,52 +67,39 @@ public void Dispose()
///
/// Gets the chunk processors in the pipeline.
///
- public IList> ChunkProcessors { get; } = [];
+ public IList> ChunkProcessors { get; } = [];
///
- /// Processes all files in the specified directory that match the given search pattern and option.
+ /// Processes the specified input.
///
- /// The directory to process.
- /// The search pattern for file selection.
- /// The search option for directory traversal.
- /// The cancellation token for the operation.
+ /// The source input to process.
+ /// The unique documentIdentifier for the document.
+ /// The media type of the source.
+ /// The token to monitor for cancellation requests.
/// A task representing the asynchronous operation.
- public async IAsyncEnumerable ProcessAsync(DirectoryInfo directory, string searchPattern = "*.*",
- SearchOption searchOption = SearchOption.TopDirectoryOnly, [EnumeratorCancellation] CancellationToken cancellationToken = default)
+ public async Task ProcessAsync(TSource source, string documentIdentifier, string? sourceMediaType = null, CancellationToken cancellationToken = default)
{
- Throw.IfNull(directory);
- Throw.IfNullOrEmpty(searchPattern);
- Throw.IfOutOfRange((int)searchOption, (int)SearchOption.TopDirectoryOnly, (int)SearchOption.AllDirectories);
+ Throw.IfNull(source);
+ Throw.IfNull(documentIdentifier);
- using (Activity? rootActivity = _activitySource.StartActivity(ProcessDirectory.ActivityName))
+ using (Activity? processActivity = ActivitySource.StartActivity(ProcessSource.ActivityName))
{
- rootActivity?.SetTag(ProcessDirectory.DirectoryPathTagName, directory.FullName)
- .SetTag(ProcessDirectory.SearchPatternTagName, searchPattern)
- .SetTag(ProcessDirectory.SearchOptionTagName, searchOption.ToString());
- _logger?.ProcessingDirectory(directory.FullName, searchPattern, searchOption);
-
- await foreach (var ingestionResult in ProcessAsync(directory.EnumerateFiles(searchPattern, searchOption), rootActivity, cancellationToken).ConfigureAwait(false))
+ IngestionDocument? document = null;
+ try
{
- yield return ingestionResult;
- }
- }
- }
+ document = await _reader.ReadAsync(source, documentIdentifier, sourceMediaType, cancellationToken).ConfigureAwait(false);
- ///
- /// Processes the specified files.
- ///
- /// The collection of files to process.
- /// The cancellation token for the operation.
- /// A task representing the asynchronous operation.
- public async IAsyncEnumerable ProcessAsync(IEnumerable files, [EnumeratorCancellation] CancellationToken cancellationToken = default)
- {
- Throw.IfNull(files);
+ processActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier);
+ Logger?.ReadDocument(document.Identifier);
- using (Activity? rootActivity = _activitySource.StartActivity(ProcessFiles.ActivityName))
- {
- await foreach (var ingestionResult in ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false))
+ return await IngestAsync(document, processActivity, cancellationToken).ConfigureAwait(false);
+ }
+ catch (Exception ex)
{
- yield return ingestionResult;
+ TraceException(processActivity, ex);
+ Logger?.IngestingFailed(ex, document?.Identifier ?? documentIdentifier);
+
+ throw;
}
}
}
@@ -125,51 +112,6 @@ private static void TraceException(Activity? activity, Exception ex)
.SetStatus(ActivityStatusCode.Error, ex.Message);
}
- private async IAsyncEnumerable ProcessAsync(IEnumerable files, Activity? rootActivity,
- [EnumeratorCancellation] CancellationToken cancellationToken)
- {
-#if NET
- if (System.Linq.Enumerable.TryGetNonEnumeratedCount(files, out int count))
-#else
- if (files is IReadOnlyCollection { Count: int count })
-#endif
- {
- rootActivity?.SetTag(ProcessFiles.FileCountTagName, count);
- _logger?.LogFileCount(count);
- }
-
- foreach (FileInfo fileInfo in files)
- {
- using (Activity? processFileActivity = _activitySource.StartActivity(ProcessFile.ActivityName, ActivityKind.Internal, parentContext: rootActivity?.Context ?? default))
- {
- processFileActivity?.SetTag(ProcessFile.FilePathTagName, fileInfo.FullName);
- _logger?.ReadingFile(fileInfo.FullName, GetShortName(_reader));
-
- IngestionDocument? document = null;
- Exception? failure = null;
- try
- {
- document = await _reader.ReadAsync(fileInfo, cancellationToken).ConfigureAwait(false);
-
- processFileActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier);
- _logger?.ReadDocument(document.Identifier);
-
- document = await IngestAsync(document, processFileActivity, cancellationToken).ConfigureAwait(false);
- }
- catch (Exception ex)
- {
- TraceException(processFileActivity, ex);
- _logger?.IngestingFailed(ex, document?.Identifier ?? fileInfo.FullName);
-
- failure = ex;
- }
-
- string documentId = document?.Identifier ?? fileInfo.FullName;
- yield return new IngestionResult(documentId, document, failure);
- }
- }
- }
-
private async Task IngestAsync(IngestionDocument document, Activity? parentActivity, CancellationToken cancellationToken)
{
foreach (IngestionDocumentProcessor processor in DocumentProcessors)
@@ -180,15 +122,15 @@ private async Task IngestAsync(IngestionDocument document, Ac
parentActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier);
}
- IAsyncEnumerable> chunks = _chunker.ProcessAsync(document, cancellationToken);
+ IAsyncEnumerable> chunks = _chunker.ProcessAsync(document, cancellationToken);
foreach (var processor in ChunkProcessors)
{
chunks = processor.ProcessAsync(chunks, cancellationToken);
}
- _logger?.WritingChunks(GetShortName(_writer));
+ Logger?.WritingChunks(GetShortName(_writer));
await _writer.WriteAsync(chunks, cancellationToken).ConfigureAwait(false);
- _logger?.WroteChunks(document.Identifier);
+ Logger?.WroteChunks(document.Identifier);
return document;
}
diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs
new file mode 100644
index 00000000000..155bf6e58f2
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Utils/MediaTypeProvider.cs
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.IO;
+
+namespace Microsoft.Extensions.DataIngestion;
+
+internal static class MediaTypeProvider
+{
+ internal static string GetMediaType(this FileInfo source)
+ => source.Extension switch
+ {
+ ".123" => "application/vnd.lotus-1-2-3",
+ ".602" => "application/x-t602",
+ ".abw" => "application/x-abiword",
+ ".bmp" => "image/bmp",
+ ".cgm" => "image/cgm",
+ ".csv" => "text/csv",
+ ".cwk" => "application/x-cwk",
+ ".dbf" => "application/vnd.dbf",
+ ".dif" => "application/x-dif",
+ ".doc" => "application/msword",
+ ".docm" => "application/vnd.ms-word.document.macroEnabled.12",
+ ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ".dot" => "application/msword",
+ ".dotm" => "application/vnd.ms-word.template.macroEnabled.12",
+ ".dotx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+ ".epub" => "application/epub+zip",
+ ".et" => "application/vnd.ms-excel",
+ ".eth" => "application/ethos",
+ ".fods" => "application/vnd.oasis.opendocument.spreadsheet",
+ ".gif" => "image/gif",
+ ".htm" => "text/html",
+ ".html" => "text/html",
+ ".hwp" => "application/x-hwp",
+ ".jpeg" => "image/jpeg",
+ ".jpg" => "image/jpeg",
+ ".key" => "application/x-iwork-keynote-sffkey",
+ ".lwp" => "application/vnd.lotus-wordpro",
+ ".mcw" => "application/macwriteii",
+ ".markdown" or ".md" => "text/markdown",
+ ".mw" => "application/macwriteii",
+ ".numbers" => "application/x-iwork-numbers-sffnumbers",
+ ".ods" => "application/vnd.oasis.opendocument.spreadsheet",
+ ".pages" => "application/x-iwork-pages-sffpages",
+ ".pbd" => "application/x-pagemaker",
+ ".pdf" => "application/pdf",
+ ".png" => "image/png",
+ ".pot" => "application/vnd.ms-powerpoint",
+ ".potm" => "application/vnd.ms-powerpoint.template.macroEnabled.12",
+ ".potx" => "application/vnd.openxmlformats-officedocument.presentationml.template",
+ ".ppt" => "application/vnd.ms-powerpoint",
+ ".pptm" => "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
+ ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ ".prn" => "application/x-prn",
+ ".qpw" => "application/x-quattro-pro",
+ ".rtf" => "application/rtf",
+ ".sda" => "application/vnd.stardivision.draw",
+ ".sdd" => "application/vnd.stardivision.impress",
+ ".sdp" => "application/sdp",
+ ".sdw" => "application/vnd.stardivision.writer",
+ ".sgl" => "application/vnd.stardivision.writer",
+ ".slk" => "text/vnd.sylk",
+ ".sti" => "application/vnd.sun.xml.impress.template",
+ ".stw" => "application/vnd.sun.xml.writer.template",
+ ".svg" => "image/svg+xml",
+ ".sxg" => "application/vnd.sun.xml.writer.global",
+ ".sxi" => "application/vnd.sun.xml.impress",
+ ".sxw" => "application/vnd.sun.xml.writer",
+ ".sylk" => "text/vnd.sylk",
+ ".tiff" => "image/tiff",
+ ".tsv" => "text/tab-separated-values",
+ ".txt" => "text/plain",
+ ".uof" => "application/vnd.uoml+xml",
+ ".uop" => "application/vnd.openofficeorg.presentation",
+ ".uos1" => "application/vnd.uoml+xml",
+ ".uos2" => "application/vnd.uoml+xml",
+ ".uot" => "application/x-uo",
+ ".vor" => "application/vnd.stardivision.writer",
+ ".webp" => "image/webp",
+ ".wpd" => "application/wordperfect",
+ ".wps" => "application/vnd.ms-works",
+ ".wq1" => "application/x-lotus",
+ ".wq2" => "application/x-lotus",
+ ".xls" => "application/vnd.ms-excel",
+ ".xlsb" => "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
+ ".xlsm" => "application/vnd.ms-excel.sheet.macroEnabled.12",
+ ".xlr" => "application/vnd.ms-works",
+ ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ".xlw" => "application/vnd.ms-excel",
+ ".xml" => "application/xml",
+ ".zabw" => "application/x-abiword",
+ _ => "application/octet-stream"
+ };
+}
diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs
index af23c29f13c..e896922a88a 100644
--- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs
+++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs
index 5122cde3e9a..50f802da014 100644
--- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs
+++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/DocumentReader.cs
@@ -23,7 +23,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs
index 2e1ef18494d..0f3233c9d26 100644
--- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs
+++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Services/Ingestion/PdfPigReader.cs
@@ -8,7 +8,7 @@ namespace ChatWithCustomData_CSharp.Web.Services.Ingestion;
internal sealed class PdfPigReader : IngestionDocumentReader
{
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
using var pdf = PdfDocument.Open(source);
var document = new IngestionDocument(identifier);
diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs
index f2f0d85c458..2e9503bf498 100644
--- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs
+++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs
@@ -86,7 +86,7 @@ public async Task CanProcessDocuments()
using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator });
using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount);
- using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter);
+ using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter);
List ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync();
Assert.Equal(_sampleFiles.Count, ingestionResults.Count);
@@ -119,7 +119,7 @@ public async Task CanProcessDocumentsInDirectory()
using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator });
using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount);
- using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter);
+ using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter);
DirectoryInfo directory = new("TestFiles");
List ingestionResults = await pipeline.ProcessAsync(directory, "*.md").ToListAsync();
@@ -152,7 +152,7 @@ public async Task ChunksCanBeMoreThanJustText()
TestEmbeddingGenerator embeddingGenerator = new();
using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator });
using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount);
- using IngestionPipeline pipeline = new(CreateReader(), new ImageChunker(), vectorStoreWriter);
+ using IngestionPipeline pipeline = new(CreateReader(), new ImageChunker(), vectorStoreWriter);
Assert.False(embeddingGenerator.WasCalled);
var ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync();
@@ -189,7 +189,7 @@ public async Task SingleFailureDoesNotTearDownEntirePipeline()
{
int failed = 0;
MarkdownReader workingReader = new();
- TestReader failingForFirstReader = new(
+ TestReader failingForFirstReader = new(
(source, identifier, mediaType, cancellationToken) => failed++ == 0
? Task.FromException(new ExpectedException())
: workingReader.ReadAsync(source, identifier, mediaType, cancellationToken));
@@ -201,7 +201,7 @@ public async Task SingleFailureDoesNotTearDownEntirePipeline()
using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator });
using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount);
- using IngestionPipeline pipeline = new(failingForFirstReader, CreateChunker(), vectorStoreWriter);
+ using IngestionPipeline pipeline = new(failingForFirstReader, CreateChunker(), vectorStoreWriter);
await Verify(pipeline.ProcessAsync(_sampleFiles));
await Verify(pipeline.ProcessAsync(_sampleDirectory));
@@ -221,6 +221,40 @@ async Task Verify(IAsyncEnumerable results)
}
}
+ [Fact]
+ public async Task SourceCanBeAnything()
+ {
+ TestReader testReader = new((index, id, mediaType, ct) =>
+ {
+ return new MarkdownReader().ReadAsync(_sampleFiles[index], _sampleFiles[index].FullName, mediaType, ct);
+ });
+
+ TestEmbeddingGenerator embeddingGenerator = new();
+ using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator });
+ using VectorStoreWriter vectorStoreWriter = new(testVectorStore, dimensionCount: TestEmbeddingGenerator.DimensionCount);
+
+ using IngestionPipeline pipeline = new(testReader, CreateChunker(), vectorStoreWriter);
+
+ for (int i = 0; i < _sampleFiles.Count; i++)
+ {
+ await pipeline.ProcessAsync(i, _sampleFiles[i].FullName);
+ }
+
+ Assert.True(embeddingGenerator.WasCalled, "Embedding generator should have been called.");
+
+ var retrieved = await vectorStoreWriter.VectorStoreCollection
+ .GetAsync(record => _sampleFiles.Any(info => info.FullName == (string)record["documentid"]!), top: 1000)
+ .ToListAsync();
+
+ Assert.NotEmpty(retrieved);
+ for (int i = 0; i < retrieved.Count; i++)
+ {
+ Assert.NotEqual(Guid.Empty, (Guid)retrieved[i]["key"]!);
+ Assert.NotEmpty((string)retrieved[i]["content"]!);
+ Assert.Contains((string)retrieved[i]["documentid"]!, _sampleFiles.Select(info => info.FullName));
+ }
+ }
+
private static IngestionDocumentReader CreateReader() => new MarkdownReader();
private static IngestionChunker CreateChunker() => new HeaderChunker(new(TiktokenTokenizer.CreateForModel("gpt-4")));
diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs
index 37142f8b20e..b1af4584a9d 100644
--- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs
+++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs
@@ -34,7 +34,7 @@ public async Task ReadAsync_ThrowsWhenSourceIsNull()
{
var reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/sse"));
- await Assert.ThrowsAsync("source", async () => await reader.ReadAsync(null!, "identifier"));
+ await Assert.ThrowsAsync("source", async () => await reader.ReadAsync((FileInfo)null!, "identifier"));
await Assert.ThrowsAsync("source", async () => await reader.ReadAsync((Stream)null!, "identifier", "mediaType"));
}
diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs
index aa039de5e31..72db4f1a6eb 100644
--- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs
+++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestReader.cs
@@ -2,21 +2,20 @@
// The .NET Foundation licenses this file to you under the MIT license.
using System;
-using System.IO;
using System.Threading;
using System.Threading.Tasks;
namespace Microsoft.Extensions.DataIngestion;
-internal sealed class TestReader : IngestionDocumentReader
+internal sealed class TestReader : IIngestionDocumentReader
{
- public TestReader(Func> readAsyncCallback)
+ public TestReader(Func> readAsyncCallback)
{
ReadAsyncCallback = readAsyncCallback;
}
- public Func> ReadAsyncCallback { get; }
+ public Func> ReadAsyncCallback { get; }
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public Task ReadAsync(TSource source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> ReadAsyncCallback(source, identifier, mediaType, cancellationToken);
}
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
index 9dd366a03a5..8fc82ea91f4 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
index 60fcdbdc128..27bf84b65cd 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.AzureOpenAI_Qdrant_Aspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
@@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs
index d97b986b694..b7a0a8e5ce6 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs
index 315a6ad3d53..44a44e4e337 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/DocumentReader.cs
@@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
index f6de539eb22..de7e1027d19 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Basic.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
@@ -8,7 +8,7 @@ namespace aichatweb.Services.Ingestion;
internal sealed class PdfPigReader : IngestionDocumentReader
{
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
using var pdf = PdfDocument.Open(source);
var document = new IngestionDocument(identifier);
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
index 9dd366a03a5..8fc82ea91f4 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
index 60fcdbdc128..27bf84b65cd 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.BasicAspire.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
@@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
index 9dd366a03a5..8fc82ea91f4 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
index 60fcdbdc128..27bf84b65cd 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.Ollama_Qdrant.verified/aichatweb/aichatweb.Web/Services/Ingestion/DocumentReader.cs
@@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs
index d97b986b694..b7a0a8e5ce6 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DataIngestor.cs
@@ -21,7 +21,7 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern)
IncrementalIngestion = false,
});
- using var pipeline = new IngestionPipeline(
+ using var pipeline = new IngestionPipeline(
reader: new DocumentReader(directory),
chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))),
writer: writer,
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs
index 315a6ad3d53..44a44e4e337 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/DocumentReader.cs
@@ -19,7 +19,7 @@ public override Task ReadAsync(FileInfo source, string identi
return base.ReadAsync(source, identifier, mediaType, cancellationToken);
}
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
=> mediaType switch
{
"application/pdf" => _pdfReader.ReadAsync(source, identifier, mediaType, cancellationToken),
diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
index f6de539eb22..de7e1027d19 100644
--- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
+++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb.OpenAI_AzureAISearch.verified/aichatweb/Services/Ingestion/PdfPigReader.cs
@@ -8,7 +8,7 @@ namespace aichatweb.Services.Ingestion;
internal sealed class PdfPigReader : IngestionDocumentReader
{
- public override Task ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
+ public override Task ReadAsync(Stream source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
using var pdf = PdfDocument.Open(source);
var document = new IngestionDocument(identifier);