Skip to content

Commit 04b269e

Browse files
committed
Better Extraction from pdf
1 parent e173bb8 commit 04b269e

File tree

5 files changed

+50
-5
lines changed

5 files changed

+50
-5
lines changed

src/KernelMemory.Extensions.ConsoleTest/Helper/TextCleanerHandler.cs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
using Microsoft.Extensions.Logging;
1+
using KernelMemory.Extensions.Helper;
2+
using Microsoft.Extensions.Logging;
23
using Microsoft.KernelMemory.Diagnostics;
34
using Microsoft.KernelMemory.Pipeline;
5+
using System.Reflection.Emit;
46
using System.Text;
57

68
namespace KernelMemory.Extensions.ConsoleTest.Helper;
@@ -69,6 +71,12 @@ public TextCleanerHandler(
6971
{
7072
newContent.Append(c);
7173
}
74+
75+
//now handle ligatures
76+
if (LigatureHelper.IsLigature(c))
77+
{
78+
newContent.Append(LigatureHelper.ExpandLigature(c));
79+
}
7280
}
7381
await _orchestrator.WriteTextFileAsync(pipeline, file.Name, newContent.ToString(), cancellationToken).ConfigureAwait(false);
7482

src/KernelMemory.Extensions.ConsoleTest/Samples/ContextualRetrievalSample.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ private static IKernelMemoryBuilder CreateBasicKernelMemoryBuilder(
165165
.WithAzureOpenAITextGeneration(chatConfig)
166166
.WithAzureOpenAITextEmbeddingGeneration(embeddingConfig);
167167

168+
kernelMemoryBuilder.WithContentDecoder<PdfStructuredDocumentDecoder>();
169+
168170
kernelMemoryBuilder
169171
.WithSimpleFileStorage(new SimpleFileStorageConfig()
170172
{

src/KernelMemory.Extensions/DocumentExtraction/PdfStructuredDocumentDecoder.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,14 @@ public Task<FileContent> DecodeAsync(string filename, CancellationToken cancella
3737
public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancellationToken = default)
3838
{
3939
using var stream = data.ToStream();
40-
return this.DecodeAsync(stream, cancellationToken);
40+
return DecodeAsync(stream, cancellationToken);
4141
}
4242

4343
/// <inheritdoc />
4444
public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellationToken = default)
4545
{
46-
this._log.LogDebug("Extracting structured text from PDF file");
47-
throw new NotImplementedException();
46+
_log.LogDebug("Extracting structured text from PDF file");
4847
var result = _uglyToadStructured.DecodePdf(data);
49-
5048
return Task.FromResult(result);
5149
}
5250
}

src/KernelMemory.Extensions/DocumentExtraction/StructuredUglyToadPdfDecoder.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.IO;
34
using System.Text;
5+
using DocumentFormat.OpenXml.ExtendedProperties;
46
using Microsoft.KernelMemory.DataFormats;
57
using Microsoft.KernelMemory.Pipeline;
68
using UglyToad.PdfPig;
@@ -17,6 +19,7 @@ public FileContent DecodePdf(Stream stream)
1719

1820
Word previous = null;
1921
var sb = new StringBuilder(1000);
22+
2023
foreach (var page in document.GetPages())
2124
{
2225
foreach (var word in page.GetWords())
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
namespace KernelMemory.Extensions.Helper;
5+
6+
/// <summary>
/// Detects single-character typographic ligatures and expands them into
/// their plain-letter equivalents.
/// </summary>
public static class LigatureHelper
{
    // Ligature code point -> plain-text expansion.
    // Keys are written as \u escapes on purpose: literal ligature glyphs are
    // easily rewritten into their multi-letter form by editors or text
    // normalization, which would make a single-char lookup impossible to match.
    private static readonly Dictionary<char, string> Ligatures = new()
    {
        // Latin ligatures
        { '\uFB00', "ff" },  // U+FB00 LATIN SMALL LIGATURE FF
        { '\uFB01', "fi" },  // U+FB01 LATIN SMALL LIGATURE FI
        { '\uFB02', "fl" },  // U+FB02 LATIN SMALL LIGATURE FL
        { '\uFB03', "ffi" }, // U+FB03 LATIN SMALL LIGATURE FFI
        { '\uFB04', "ffl" }, // U+FB04 LATIN SMALL LIGATURE FFL
        { '\uFB05', "st" },  // U+FB05 LATIN SMALL LIGATURE LONG S T
        { '\uFB06', "st" },  // U+FB06 LATIN SMALL LIGATURE ST

        // Additional typographic ligatures
        { '\u2114', "lb" },  // U+2114 L B BAR SYMBOL
        { '\u211E', "Rx" },  // U+211E PRESCRIPTION TAKE
    };

    /// <summary>
    /// Tells whether <paramref name="c"/> is one of the known ligature characters.
    /// </summary>
    /// <param name="c">Character to test.</param>
    /// <returns><c>true</c> when an expansion exists for <paramref name="c"/>; otherwise <c>false</c>.</returns>
    public static bool IsLigature(char c)
    {
        return Ligatures.ContainsKey(c);
    }

    /// <summary>
    /// Returns the plain-letter expansion of a ligature character.
    /// </summary>
    /// <param name="c">Ligature character to expand.</param>
    /// <returns>The expansion, e.g. <c>"fi"</c> for U+FB01.</returns>
    /// <exception cref="KeyNotFoundException">
    /// <paramref name="c"/> is not a known ligature; check with <see cref="IsLigature"/> first.
    /// </exception>
    public static string ExpandLigature(char c)
    {
        // Single lookup instead of ContainsKey + indexer.
        if (Ligatures.TryGetValue(c, out var expansion))
        {
            return expansion;
        }

        throw new KeyNotFoundException(
            $"Character U+{(int)c:X4} is not a known ligature.");
    }
}

0 commit comments

Comments
 (0)