-
Notifications
You must be signed in to change notification settings - Fork 13
Description
We currently use Bytescout to convert PDF to text. We are now switching to MuPDF.NET, but the extracted text does not match the output we got from Bytescout.
Below is my existing method, which uses Bytescout:
/// <summary>
/// Converts a PDF stream to plain text using the Bytescout XML extractor.
/// The PDF is first extracted to XML; the inner text of every non-empty child
/// node of each "row" element is then written on its own line.
/// </summary>
/// <param name="pdf">Stream containing the PDF document to convert.</param>
/// <returns>
/// A new <see cref="MemoryStream"/> holding the extracted text, positioned at
/// the beginning. The caller owns the stream.
/// </returns>
private MemoryStream ConvertPdfToTxt(Stream pdf)
{
    string doc;

    // The extractor is only needed until the XML string has been produced;
    // dispose it right away so its resources are released even if
    // extraction throws.
    using (XMLExtractor extractor = new()
    {
        RegistrationName = config["RegistrationName"],
        RegistrationKey = config["RegistrationKey"],
        AutoAlignColumnsToHeader = true,
        ConsiderVerticalBorders = true,
        ExtractAnnotations = true,
        ExtractColumnByColumn = true,
        ExtractInvisibleText = true,
        ExtractShadowLikeText = true
    })
    {
        extractor.LoadDocumentFromStream(pdf);
        doc = extractor.GetXML();
    }

    XmlDocument xml = new();
    xml.LoadXml(doc);

    StringBuilder sbRow = new();
    foreach (XmlNode row in xml.GetElementsByTagName("row"))
    {
        foreach (XmlNode column in row.ChildNodes)
        {
            // Read InnerText once per cell instead of once per check.
            string cellText = column.InnerText;

            // Skip blank cells and any cell containing the "PDF Extractor" marker text.
            if (cellText.Length == 0
                || cellText.Contains("PDF Extractor", StringComparison.Ordinal))
            {
                continue;
            }

            // One cell value per line keeps the resulting text easy to read.
            sbRow.AppendLine(cellText);
        }
    }

    // The writer is intentionally not disposed: disposing it would close the
    // MemoryStream that is handed back to the caller. Flush() pushes all
    // buffered text into the stream before it is rewound.
    var stream = new MemoryStream();
    var writer = new StreamWriter(stream);
    writer.Write(sbRow.ToString());
    writer.Flush();
    stream.Position = 0;
    return stream;
}
I have tried the following code using MuPDF.NET:
/// <summary>
/// Extracts the text of every page of the sample PDF under the web root's
/// "assets" folder using MuPDF.NET and returns it as a stream.
/// As a side effect, the text of the first page is also written to
/// "assets/output.txt".
/// </summary>
/// <returns>
/// A new <see cref="MemoryStream"/> holding the text of all pages (one
/// <c>AppendLine</c> per page), positioned at the beginning. The caller owns
/// the stream.
/// </returns>
private MemoryStream ConvertPdfToTxt2()
{
    var sbRow = new StringBuilder();

    // Load the PDF document using MuPDF.NET.
    var doc = new MuPDF.NET.Document(Path.Combine(hostingEnvironment.WebRootPath, "assets", "sample.pdf"));
    try
    {
        // Dump the first page's text to a file next to the sample.
        MuPDF.NET.TextPage textPage = doc.LoadPage(0).GetTextPage();
        string extractedText = textPage.ExtractText();
        string outputPath = Path.Combine(hostingEnvironment.WebRootPath, "assets", "output.txt");
        System.IO.File.WriteAllText(outputPath, extractedText);

        // Collect the text of every page in document order.
        for (int pageIndex = 0; pageIndex < doc.PageCount; pageIndex++)
        {
            var page = doc.LoadPage(pageIndex);
            var text = page.GetText();
            sbRow.AppendLine(text);
        }
    }
    finally
    {
        // Always release the document (and the underlying file), even when
        // extraction or the file write throws.
        doc.Close();
    }

    // The writer is intentionally not disposed: disposing it would close the
    // MemoryStream that is handed back to the caller. Flush() pushes all
    // buffered text into the stream before it is rewound.
    var stream = new MemoryStream();
    var writer = new StreamWriter(stream);
    writer.Write(sbRow.ToString());
    writer.Flush();
    stream.Position = 0;
    return stream;
}