Mupdf.net

We currently use Bytescout to convert PDF to text and are now switching to MuPDF.NET, but the extracted text does not match the output we get from Bytescout.
Below is my existing method:

/// <summary>
/// Extracts the text of a PDF with the Bytescout XML extractor and returns it as a text stream.
/// </summary>
/// <param name="pdf">Stream containing the PDF document to load into the extractor.</param>
/// <returns>
/// A <see cref="MemoryStream"/> positioned at 0 that holds, on its own line, the inner text of
/// every non-empty column of every <c>row</c> element in the extractor's XML output.
/// </returns>
private MemoryStream ConvertPdfToTxt(Stream pdf)
        {
            // Dispose the extractor when the method exits so the loaded document is released.
            using XMLExtractor extractor = new()
            {
                RegistrationName = config["RegistrationName"],
                RegistrationKey = config["RegistrationKey"],
                AutoAlignColumnsToHeader = true,
                ConsiderVerticalBorders = true,
                ExtractAnnotations = true,
                ExtractColumnByColumn = true,
                ExtractInvisibleText = true,
                ExtractShadowLikeText = true
            };

            // Load the PDF and parse the extractor's XML representation of it.
            extractor.LoadDocumentFromStream(pdf);
            XmlDocument xml = new();
            xml.LoadXml(extractor.GetXML());

            StringBuilder text = new();
            foreach (XmlNode row in xml.GetElementsByTagName("row"))
            {
                foreach (XmlNode column in row.ChildNodes)
                {
                    string value = column.InnerText;

                    // Blank cells contribute nothing. Cells containing the literal "PDF Extractor"
                    // are dropped as well (presumably the SDK's own marker text - confirm).
                    if (value.Length == 0 || value.Contains("PDF Extractor", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // Each remaining cell goes on its own line for ease of reading the txt output.
                    text.AppendLine(value);
                }
            }

            var stream = new MemoryStream();

            // leaveOpen keeps the MemoryStream usable after the writer is disposed (and flushed).
            using (var writer = new StreamWriter(stream, leaveOpen: true))
            {
                writer.Write(text.ToString());
            }

            // Rewind so the caller can read from the beginning.
            stream.Position = 0;
            return stream;
        }

I have tried the code below using MuPDF.NET:
/// <summary>
/// Extracts the text of every page of <c>assets/sample.pdf</c> (under the web root) with MuPDF.NET
/// and returns it as a text stream.
/// </summary>
/// <remarks>
/// As a side effect, the text of the first page is also written to <c>assets/output.txt</c>
/// under the web root.
/// </remarks>
/// <returns>
/// A <see cref="MemoryStream"/> positioned at 0 containing the text of each page, each followed
/// by a line break.
/// </returns>
private MemoryStream ConvertPdfToTxt2()
        {
            string pdfPath = Path.Combine(hostingEnvironment.WebRootPath, "assets", "sample.pdf");
            string outputPath = Path.Combine(hostingEnvironment.WebRootPath, "assets", "output.txt");

            // Dispose the document when the method exits so the underlying file is released.
            using var doc = new MuPDF.NET.Document(pdfPath);

            // NOTE(review): this dumps the first page's text into the web root on every call;
            // it looks like leftover debugging output - confirm whether it is still wanted.
            MuPDF.NET.TextPage textPage = doc.LoadPage(0).GetTextPage();
            string extractedText = textPage.ExtractText();
            System.IO.File.WriteAllText(outputPath, extractedText);

            // Collect the text of every page, one page after another.
            var allText = new StringBuilder();
            for (int pageIndex = 0; pageIndex < doc.PageCount; pageIndex++)
            {
                var page = doc.LoadPage(pageIndex);
                allText.AppendLine(page.GetText());
            }

            var stream = new MemoryStream();

            // leaveOpen keeps the MemoryStream usable after the writer is disposed (and flushed).
            using (var writer = new StreamWriter(stream, leaveOpen: true))
            {
                writer.Write(allText.ToString());
            }

            // Reset the position of the stream to the beginning for future reading.
            stream.Position = 0;
            return stream;
        }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Mupdf.net #158

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Mupdf.net #158

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions