Skip to content

Mupdf.net #158

@chiragmobifly

Description

@chiragmobifly

We are currently using Bytescout to convert PDF to text and are now switching to MuPDF.NET, but we are not getting the same output that Bytescout produces.
Below is my existing method:

/// <summary>
/// Converts a PDF stream to plain text via the Bytescout XML extractor: the document is
/// exported as XML and the inner text of every child node of each <c>row</c> element is
/// written on its own line.
/// </summary>
/// <param name="pdf">Stream containing the PDF document to convert.</param>
/// <returns>
/// A new <see cref="MemoryStream"/> holding the extracted text, positioned at the start.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="pdf"/> is <c>null</c>.</exception>
private MemoryStream ConvertPdfToTxt(Stream pdf)
{
    if (pdf is null)
    {
        throw new ArgumentNullException(nameof(pdf));
    }

    // The extractor is only needed to produce the XML string; dispose it as soon as
    // that string is available instead of leaking it.
    string xmlText;
    using (XMLExtractor extractor = new()
    {
        RegistrationName = config["RegistrationName"],
        RegistrationKey = config["RegistrationKey"],
        AutoAlignColumnsToHeader = true,
        ConsiderVerticalBorders = true,
        ExtractAnnotations = true,
        ExtractColumnByColumn = true,
        ExtractInvisibleText = true,
        ExtractShadowLikeText = true
    })
    {
        extractor.LoadDocumentFromStream(pdf);
        xmlText = extractor.GetXML();
    }

    XmlDocument xml = new();
    xml.LoadXml(xmlText);

    StringBuilder text = new();
    foreach (XmlNode row in xml.GetElementsByTagName("row"))
    {
        foreach (XmlNode column in row.ChildNodes)
        {
            string cell = column.InnerText;

            // Skip blank cells and any cell mentioning "PDF Extractor"
            // (presumably the SDK's own watermark text -- confirm).
            if (cell.Length == 0 || cell.Contains("PDF Extractor", StringComparison.Ordinal))
            {
                continue;
            }

            // One cell per line keeps the resulting text file easy to read.
            text.AppendLine(cell);
        }
    }

    var stream = new MemoryStream();

    // leaveOpen: disposing the writer flushes it without closing the stream we return.
    using (var writer = new StreamWriter(stream, leaveOpen: true))
    {
        writer.Write(text.ToString());
    }

    stream.Position = 0;
    return stream;
}

I have tried the code below using MuPDF.NET:
/// <summary>
/// Extracts the text of every page of <c>assets/sample.pdf</c> (under the web root) with
/// MuPDF.NET and returns it as a stream. As a side effect, the text of the first page is
/// also written to <c>assets/output.txt</c>.
/// </summary>
/// <returns>
/// A new <see cref="MemoryStream"/> holding the text of all pages, each page followed by a
/// line break, positioned at the start.
/// </returns>
private MemoryStream ConvertPdfToTxt2()
{
    string assetsDir = Path.Combine(hostingEnvironment.WebRootPath, "assets");

    // Dispose the document when done so its underlying handle is released.
    using var doc = new MuPDF.NET.Document(Path.Combine(assetsDir, "sample.pdf"));

    // NOTE(review): this first-page dump looks like leftover debugging output and it
    // writes into the web root -- confirm whether it is still needed.
    MuPDF.NET.TextPage textPage = doc.LoadPage(0).GetTextPage();
    string firstPageText = textPage.ExtractText();
    System.IO.File.WriteAllText(Path.Combine(assetsDir, "output.txt"), firstPageText);

    // Collect the text of every page, one page after another.
    var allText = new StringBuilder();
    for (int pageIndex = 0; pageIndex < doc.PageCount; pageIndex++)
    {
        var page = doc.LoadPage(pageIndex);
        allText.AppendLine(page.GetText());
    }

    var stream = new MemoryStream();

    // leaveOpen: disposing the writer flushes it without closing the stream we return.
    using (var writer = new StreamWriter(stream, leaveOpen: true))
    {
        writer.Write(allText.ToString());
    }

    // Rewind so the caller can read from the beginning.
    stream.Position = 0;
    return stream;
}

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions