Skip to content

Commit e173bb8

Browse files
committed
Better parsing
1 parent ece722c commit e173bb8

File tree

5 files changed

+179
-44
lines changed

5 files changed

+179
-44
lines changed

Directory.Packages.props

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,40 @@
1-
<Project>
2-
<PropertyGroup>
3-
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
4-
</PropertyGroup>
5-
<ItemGroup>
6-
<PackageVersion Include="Alkampfer.KernelMemory.ElasticSearch" Version="0.9.1" />
7-
<PackageVersion Include="Azure.AI.OpenAI" Version="2.1.0-beta.1" />
8-
<PackageVersion Include="CommandDotNet.Spectre" Version="3.0.2" />
9-
<PackageVersion Include="Microsoft.Extensions.Http" Version="8.0.1" />
10-
<PackageVersion Include="Microsoft.Extensions.Http.Resilience" Version="8.9.1" />
11-
<PackageVersion Include="Microsoft.Extensions.Logging.Console" Version="8.0.1" />
12-
<PackageVersion Include="Microsoft.Extensions.Logging.Debug" Version="8.0.1" />
13-
<PackageVersion Include="Microsoft.KernelMemory.Abstractions" Version="0.94.241201.1" />
14-
<PackageVersion Include="Microsoft.KernelMemory.AI.AzureOpenAI" Version="0.94.241201.1" />
15-
<PackageVersion Include="Microsoft.KernelMemory.Core" Version="0.94.241201.1" />
16-
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="0.22.0-preview.24378.1" />
17-
<PackageVersion Include="Microsoft.SemanticKernel" Version="1.26.0" />
18-
<PackageVersion Include="Microsoft.SemanticKernel.PromptTemplates.Handlebars" Version="1.26.0" />
19-
<PackageVersion Include="Microsoft.SemanticKernel.Yaml" Version="1.26.0" />
20-
<PackageVersion Include="Microsoft.SemanticKernel.Abstractions" Version="1.26.0" />
21-
<PackageVersion Include="Microsoft.SemanticKernel.Core" Version="1.26.0" />
22-
<PackageVersion Include="Polly.Core" Version="8.4.2" />
23-
<PackageVersion Include="TiktokenSharp" Version="1.1.4" />
24-
<PackageVersion Include="Microsoft.SourceLink.GitHub" Version="8.0.0" />
25-
</ItemGroup>
26-
<!-- Test related assemblies -->
27-
<ItemGroup>
28-
<PackageVersion Include="coverlet.msbuild" Version="6.0.2" />
29-
<PackageVersion Include="fasterflect" Version="3.0.0" />
30-
<PackageVersion Include="Microsoft.Extensions.DependencyInjection" Version="8.0.1" />
31-
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.11.1" />
32-
<PackageVersion Include="xunit" Version="2.9.2" />
33-
<PackageVersion Include="xunit.abstractions" Version="2.0.3" />
34-
<PackageVersion Include="Moq" Version="4.20.72" />
35-
<PackageVersion Include="Xunit.DependencyInjection" Version="9.7.0" />
36-
<PackageVersion Include="xunit.runner.visualstudio" Version="2.8.2" />
37-
<PackageVersion Include="coverlet.collector" Version="6.0.2" />
38-
</ItemGroup>
1+
<Project>
2+
<PropertyGroup>
3+
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
4+
</PropertyGroup>
5+
<ItemGroup>
6+
<PackageVersion Include="Alkampfer.KernelMemory.ElasticSearch" Version="0.9.1" />
7+
<PackageVersion Include="Azure.AI.OpenAI" Version="2.1.0-beta.1" />
8+
<PackageVersion Include="CommandDotNet.Spectre" Version="3.0.2" />
9+
<PackageVersion Include="Microsoft.Extensions.Http" Version="8.0.1" />
10+
<PackageVersion Include="Microsoft.Extensions.Http.Resilience" Version="8.9.1" />
11+
<PackageVersion Include="Microsoft.Extensions.Logging.Console" Version="8.0.1" />
12+
<PackageVersion Include="Microsoft.Extensions.Logging.Debug" Version="8.0.1" />
13+
<PackageVersion Include="Microsoft.KernelMemory.Abstractions" Version="0.94.241201.1" />
14+
<PackageVersion Include="Microsoft.KernelMemory.AI.AzureOpenAI" Version="0.94.241201.1" />
15+
<PackageVersion Include="Microsoft.KernelMemory.Core" Version="0.94.241201.1" />
16+
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="0.22.0-preview.24378.1" />
17+
<PackageVersion Include="Microsoft.SemanticKernel" Version="1.26.0" />
18+
<PackageVersion Include="Microsoft.SemanticKernel.PromptTemplates.Handlebars" Version="1.26.0" />
19+
<PackageVersion Include="Microsoft.SemanticKernel.Yaml" Version="1.26.0" />
20+
<PackageVersion Include="Microsoft.SemanticKernel.Abstractions" Version="1.26.0" />
21+
<PackageVersion Include="Microsoft.SemanticKernel.Core" Version="1.26.0" />
22+
<PackageVersion Include="PdfPig" Version="0.1.9" />
23+
<PackageVersion Include="Polly.Core" Version="8.4.2" />
24+
<PackageVersion Include="TiktokenSharp" Version="1.1.4" />
25+
<PackageVersion Include="Microsoft.SourceLink.GitHub" Version="8.0.0" />
26+
</ItemGroup>
27+
<!-- Test related assemblies -->
28+
<ItemGroup>
29+
<PackageVersion Include="coverlet.msbuild" Version="6.0.2" />
30+
<PackageVersion Include="fasterflect" Version="3.0.0" />
31+
<PackageVersion Include="Microsoft.Extensions.DependencyInjection" Version="8.0.1" />
32+
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.11.1" />
33+
<PackageVersion Include="xunit" Version="2.9.2" />
34+
<PackageVersion Include="xunit.abstractions" Version="2.0.3" />
35+
<PackageVersion Include="Moq" Version="4.20.72" />
36+
<PackageVersion Include="Xunit.DependencyInjection" Version="9.7.0" />
37+
<PackageVersion Include="xunit.runner.visualstudio" Version="2.8.2" />
38+
<PackageVersion Include="coverlet.collector" Version="6.0.2" />
39+
</ItemGroup>
3940
</Project>

src/KernelMemory.Extensions.ConsoleTest/KernelMemory.Extensions.ConsoleTest.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
<PackageReference Include="CommandDotNet.Spectre" />
1616
<PackageReference Include="Alkampfer.KernelMemory.ElasticSearch" />
1717
<PackageReference Include="Microsoft.KernelMemory.AI.AzureOpenAI" />
18+
<PackageReference Include="PdfPig" />
1819
</ItemGroup>
1920

2021
</Project>

src/KernelMemory.Extensions.ConsoleTest/Samples/ContextualRetrievalSample.cs

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,17 @@
55
using Microsoft.Extensions.Logging;
66
using Microsoft.KernelMemory;
77
using Microsoft.KernelMemory.DataFormats;
8+
using Microsoft.KernelMemory.DataFormats.Image;
9+
using Microsoft.KernelMemory.DataFormats.Office;
10+
using Microsoft.KernelMemory.DataFormats.Pdf;
11+
using Microsoft.KernelMemory.DataFormats.Text;
12+
using Microsoft.KernelMemory.DataFormats.WebPages;
813
using Microsoft.KernelMemory.DocumentStorage.DevTools;
914
using Microsoft.KernelMemory.FileSystem.DevTools;
1015
using Microsoft.KernelMemory.Handlers;
1116
using Microsoft.KernelMemory.MemoryStorage.DevTools;
1217
using Spectre.Console;
18+
using KernelMemory.Extensions.DocumentExtraction;
1319

1420
namespace SemanticMemory.Samples;
1521

@@ -24,7 +30,22 @@ public async Task RunSample(string bookPdf)
2430
.AddDebug()
2531
);
2632
//do not forget to add decoders
27-
services.AddDefaultContentDecoders();
33+
//services.AddDefaultContentDecoders();
34+
//you can add decoder directly in this way
35+
#pragma warning disable KMEXP00 // 'TextDecoder' is for evaluation purposes only and is subject to change or removal in future updates.
36+
// services.AddSingleton<IContentDecoder, TextDecoder>();
37+
// services.AddSingleton<IContentDecoder, MarkDownDecoder>();
38+
// services.AddSingleton<IContentDecoder, HtmlDecoder>();
39+
40+
// //services.AddSingleton<IContentDecoder, PdfDecoder>();
41+
// //services.AddSingleton<IContentDecoder, PdfStructuredDocumentDecoder>();
42+
43+
// services.AddSingleton<IContentDecoder, ImageDecoder>();
44+
// services.AddSingleton<IContentDecoder, MsExcelDecoder>();
45+
// services.AddSingleton<IContentDecoder, MsPowerPointDecoder>();
46+
// services.AddSingleton<IContentDecoder, MsWordDecoder>();
47+
#pragma warning restore KMEXP00
48+
2849
services.AddHttpClient<RawAnthropicHttpClient>()
2950
.AddStandardResilienceHandler(options =>
3051
{
@@ -38,7 +59,7 @@ public async Task RunSample(string bookPdf)
3859
throw new Exception("ANTHROPIC_API_KEY is not set");
3960
}
4061

41-
var config = new AnthropicTextGenerationConfiguration()
62+
var config = new AnthropicTextGenerationConfiguration()
4263
{
4364
ApiKey = anthropicApiKey,
4465
};
@@ -147,14 +168,14 @@ private static IKernelMemoryBuilder CreateBasicKernelMemoryBuilder(
147168
kernelMemoryBuilder
148169
.WithSimpleFileStorage(new SimpleFileStorageConfig()
149170
{
150-
//Directory = "/tmp/km/storage",
151-
Directory = "c:\\temp\\km2\\storage",
171+
Directory = "/tmp/km/storage",
172+
//Directory = "c:\\temp\\km2\\storage",
152173
StorageType = FileSystemTypes.Disk
153174
})
154175
.WithSimpleVectorDb(new SimpleVectorDbConfig()
155176
{
156-
//Directory = "/tmp/km/vectorstorage",
157-
Directory = "c:\\temp\\km2\\vectorstorage",
177+
Directory = "/tmp/km/vectorstorage",
178+
//Directory = "c:\\temp\\km2\\vectorstorage",
158179
StorageType = FileSystemTypes.Disk
159180
});
160181

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
using System;
2+
using System.IO;
3+
using System.Threading;
4+
using System.Threading.Tasks;
5+
using Microsoft.Extensions.Logging;
6+
using Microsoft.KernelMemory.DataFormats;
7+
using Microsoft.KernelMemory.Diagnostics;
8+
using Microsoft.KernelMemory.Pipeline;
9+
10+
namespace KernelMemory.Extensions.DocumentExtraction;
11+
12+
/// <summary>
/// <see cref="IContentDecoder"/> for PDF content that delegates the actual text
/// extraction to <see cref="StructuredUglyToadPdfDecoder"/>.
/// </summary>
public class PdfStructuredDocumentDecoder : IContentDecoder
{
    private readonly ILogger<PdfStructuredDocumentDecoder> _log;
    private readonly StructuredUglyToadPdfDecoder _uglyToadStructured;

    /// <summary>
    /// Creates the decoder.
    /// </summary>
    /// <param name="loggerFactory">
    /// Factory used to create the logger; when null, <see cref="DefaultLogger.Factory"/> is used.
    /// </param>
    public PdfStructuredDocumentDecoder(ILoggerFactory? loggerFactory = null)
    {
        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<PdfStructuredDocumentDecoder>();
        this._uglyToadStructured = new StructuredUglyToadPdfDecoder();
    }

    /// <inheritdoc />
    public bool SupportsMimeType(string mimeType)
    {
        return mimeType != null && mimeType.StartsWith(MimeTypes.Pdf, StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(string filename, CancellationToken cancellationToken = default)
    {
        // Await inside the using scope so the file stream is only disposed
        // after the decoding task has completed.
        using var stream = File.OpenRead(filename);
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancellationToken = default)
    {
        // Await inside the using scope so the stream is only disposed
        // after the decoding task has completed.
        using var stream = data.ToStream();
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Extracting structured text from PDF file");

        // The extraction itself is synchronous, so the result is wrapped
        // in an already-completed task.
        var result = this._uglyToadStructured.DecodePdf(data);

        return Task.FromResult(result);
    }
}
53+
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
using System;
2+
using System.IO;
3+
using System.Text;
4+
using Microsoft.KernelMemory.DataFormats;
5+
using Microsoft.KernelMemory.Pipeline;
6+
using UglyToad.PdfPig;
7+
using UglyToad.PdfPig.Content;
8+
9+
namespace KernelMemory.Extensions.DocumentExtraction;
10+
11+
/// <summary>
/// Extracts the text of a PDF with PdfPig, producing one section per page and
/// rebuilding line breaks from the vertical position of consecutive words.
/// </summary>
public class StructuredUglyToadPdfDecoder
{
    /// <summary>
    /// Minimum absolute difference between the baseline Y coordinates of the first
    /// letters of two consecutive words for the second word to be placed on a new line.
    /// </summary>
    private const int LineBreakBaselineThreshold = 3;

    /// <summary>
    /// Reads a PDF from <paramref name="stream"/> and returns its text as plain-text content.
    /// </summary>
    /// <param name="stream">Stream containing the PDF document.</param>
    /// <returns>
    /// A <see cref="FileContent"/> with mime type <see cref="MimeTypes.PlainText"/> holding
    /// one <see cref="FileSection"/> per page, numbered with the page number.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public FileContent DecodePdf(Stream stream)
    {
        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        var result = new FileContent(MimeTypes.PlainText);
        using var document = PdfDocument.Open(stream);

        var sb = new StringBuilder(1000);
        foreach (var page in document.GetPages())
        {
            // Reset per page: the first word of a page has no predecessor, so the
            // page text must not start with a separator derived from the previous page.
            Word? previous = null;

            foreach (var word in page.GetWords())
            {
                if (previous is not null)
                {
                    if (StartsNewLine(previous, word))
                    {
                        sb.AppendLine();
                    }
                    else
                    {
                        sb.Append(' ');
                    }
                }

                sb.Append(word.Text);
                previous = word;
            }

            result.Sections.Add(new FileSection(page.Number, sb.ToString(), false));
            sb.Clear();
        }

        return result;
    }

    /// <summary>
    /// Tells whether <paramref name="current"/> sits on a different line than
    /// <paramref name="previous"/>, judged by the baseline of their first letters.
    /// Words without letters cannot be compared and are treated as being on the same line.
    /// </summary>
    private static bool StartsNewLine(Word previous, Word current)
    {
        if (previous.Letters.Count == 0 || current.Letters.Count == 0)
        {
            return false;
        }

        var baselineGap = Math.Abs(previous.Letters[0].StartBaseLine.Y - current.Letters[0].StartBaseLine.Y);
        return baselineGap > LineBreakBaselineThreshold;
    }
}

0 commit comments

Comments
 (0)