Address 4439

JorjMcKie · JorjMcKie · commit e869c8763f8c · 2025-04-10T10:30:22.000-04:00
Address 4439 (typo in init of Xml class).
Also replaced unnecessary use of `math.sqrt()` with Python's built-in exponentiation operator.

Add more details to pymupdf4llm
diff --git a/docs/pymupdf4llm/api.rst b/docs/pymupdf4llm/api.rst
@@ -16,7 +16,7 @@ The |PyMuPDF4LLM| API
 
     Prints the version of the library.
 
-.. method:: to_markdown(doc: pymupdf.Document | str, *, pages: list | range | None = None, hdr_info: Any = None, write_images: bool = False, embed_images: bool = False, dpi: int = 150, filename=None, image_path="", image_format="png", image_size_limit=0.05, force_text=True, margins=0, page_chunks: bool = False, page_width: float = 612, page_height: float = None, table_strategy="lines_strict", graphics_limit: int = None, ignore_code: bool = False, extract_words: bool = False, show_progress: bool = False, use_glyphs=False) -> str | list[dict]
+.. method:: to_markdown(doc: pymupdf.Document | str, *, pages: list | range | None = None, hdr_info: Any = None, write_images: bool = False, embed_images: bool = False, ignore_images: bool = False, ignore_graphics: bool = False, dpi: int = 150, filename=None, image_path="", image_format="png", image_size_limit=0.05, force_text=True, margins=0, page_chunks: bool = False, page_width: float = 612, page_height: float = None, table_strategy="lines_strict", graphics_limit: int = None, ignore_code: bool = False, extract_words: bool = False, show_progress: bool = False, use_glyphs=False) -> str | list[dict]
 
     Read the pages of the file and outputs the text of its pages in |Markdown| format. How this should happen in detail can be influenced by a number of parameters. Please note that there exists **support for building page chunks** from the |Markdown|  text.
 
@@ -30,6 +30,10 @@ The |PyMuPDF4LLM| API
 
     :arg bool embed_images: like `write_images`, but images will be included in the markdown text as base64-encoded strings. Ignores `write_images` and `image_path` if used. This may drastically increase the size of your markdown text.
 
+    :arg bool ignore_images: (New in v.0.0.20) Disregard images on the page. This may help detecting text correctly when pages are very crowded (often the case for documents representing presentation slides). Also speeds up processing time.
+
+    :arg bool ignore_graphics: (New in v.0.0.20) Disregard vector graphics on the page. This may help detecting text correctly when pages are very crowded (often the case for documents representing presentation slides). Also speeds up processing time. Vector graphics are still used for table detection.
+
     :arg float image_size_limit: this must be a positive value less than 1. Images are ignored if `width / page.rect.width <= image_size_limit` or `height / page.rect.height <= image_size_limit`. For instance, the default value 0.05 means that to be considered for inclusion, an image's width and height must be larger than 5% of the page's width and height, respectively.
 
     :arg int dpi: specify the desired image resolution in dots per inch. Relevant only if `write_images=True`. Default value is 150.
@@ -62,13 +66,13 @@ The |PyMuPDF4LLM| API
 
         - **"words"** - if `extract_words=True` was used. This is a list of tuples `(x0, y0, x1, y1, "wordstring", bno, lno, wno)` as delivered by `page.get_text("words")`. The **sequence** of these tuples however is the same as produced in the markdown text string and thus honors multi-column text. This is also true for text in tables: words are extracted in the sequence of table row cells.
 
-    :arg str filename: (New in v.0.0.19) Overwrites or sets the desired image file name of written images. Useful when the document is provided as a memory object (which has no inherent name).
+    :arg str filename: (New in v.0.0.19) Overwrites or sets the desired image file name of written images. Useful when the document is provided as a memory object (which has no inherent file name).
     
-    :arg float page_width: specify a desired page width. This is ignored for documents with a fixed page width like PDF, XPS etc. **Reflowable** documents however, like e-books, office or text files have no fixed page dimensions and by default are assumed to have Letter format width (612) and an **"infinite"** page height. This means that the full document is treated as one large page.
+    :arg float page_width: specify a desired page width. This is ignored for documents with a fixed page width like PDF, XPS etc. **Reflowable** documents however, like e-books, office [#f2]_ or text files have no fixed page dimensions and by default are assumed to have Letter format width (612) and an **"infinite"** page height. This means that the **full document is treated as one large page.**
 
     :arg float page_height: specify a desired page height. For relevance see the `page_width` parameter. If using the default `None`, the document will appear as one large page with a width of `page_width`. Consequently in this case, no markdown page separators will occur (except the final one), respectively only one page chunk will be returned.
 
-    :arg str table_strategy: table detection strategy. Default is `"lines_strict"` which ignores background colors. In some occasions, other strategies may be more successful, for example `"lines"` which uses all vector graphics objects for detection.
+    :arg str table_strategy: `table detection strategy <https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables>`_. Default is `"lines_strict"` which ignores background colors. On some occasions, other strategies may be more successful, for example `"lines"` which uses all vector graphics objects for detection. **Changed in v0.0.19:** A value of `None` will not perform any table detection at all. This may be useful when you know that your document contains no tables. Execution time savings can be significant.
 
     :arg int graphics_limit: use this to limit dealing with excess amounts of vector graphics elements. Scientific documents, or pages simulating text via graphics commands may contain tens of thousands of these objects. As vector graphics are analyzed for multiple purposes, runtime may quickly become intolerable. With this parameter, all vector graphics will be ignored if their count exceeds the threshold. **Changed in v0.0.19:** The page will still be processed, and text, tables and images should be extracted.
 
@@ -97,6 +101,129 @@ The |PyMuPDF4LLM| API
 ----
 
 
+.. class:: IdentifyHeaders
+
+    .. method:: __init__(self, doc: pymupdf.Document | str, *, pages: list | range | None = None, body_limit: float = 11, max_levels: int = 6)
+
+        Create an object which maps text font sizes to the respective number of '#' characters which are used by Markdown syntax to indicate header levels. The object is created by scanning the document for font size "popularity". The most popular font size and all smaller sizes are used for body text. Larger font sizes are mapped to the respective header levels - which correspond to the HTML tags `<h1>` to `<h6>`.
+
+        All font sizes are rounded to integer values.
+
+        If more than 6 header levels would be required, then the largest number smaller than the `<h6>` font size is used for body text.
+
+        Please note that creating the object will read and inspect the text of the entire document - independently of reading the document again in the `to_markdown()` method subsequently. Method `to_markdown()` by default **will create this object** if you do not override its `hdr_info=None` parameter.
+        
+
+        :arg Document,str doc: the file, to be specified either as a file path string, or as a |PyMuPDF| Document (created via `pymupdf.open`). In order to use `pathlib.Path` specifications, Python file-like objects, documents in memory etc. you **must** use a |PyMuPDF| Document.
+
+        :arg list pages: optional, the pages to consider. If omitted all pages are processed.
+
+        :arg float body_limit: the default font size limit for body text. Only used when the document scan does not deliver valid information.
+
+        :arg int max_levels: the maximum number of header levels to be used. Valid values are in `range(1, 7)`. The default is 6, which corresponds to the HTML tags `<h1>` to `<h6>`. A smaller value will limit the number of generated header levels. For instance, a value of 3 will only generate header tags "#", "##" and "###". Body text will be assumed for all font sizes smaller than the one corresponding to "###".
+
+
+    .. method:: get_header_id(self, span: dict, page=None) -> str
+    
+        Return appropriate markdown header prefix. This is either "" or a string of "#" characters followed by a space.
+
+        Given a text span from a "dict" extraction, determine the
+        markdown header prefix string of 0 to n concatenated '#' characters.
+
+        :arg dict span: a dictionary containing the text span information. This is the same dictionary as returned by `page.get_text("dict")`.
+
+        :arg Page page: the owning page object. This can be used when additional information needs to be extracted.
+
+        :returns: either "" (an empty string) or a string of "#" characters followed by a space.
+
+    .. attribute:: header_id
+    
+        A dictionary mapping (integer) font sizes to Markdown header strings like ``{14: '# ', 12: '## '}``. The dictionary is created by the `IdentifyHeaders` constructor. The keys are the font sizes of the text spans in the document. The values are the respective header strings.
+
+    .. attribute:: body_limit
+
+        An integer value indicating the font size limit for body text. This is computed as ``min(header_id.keys()) - 1``. In the above example, body_limit would be 11.
+
+
+    **How to limit header levels (example)**
+    
+    Limit the generated header levels to 3::
+
+        import pymupdf, pymupdf4llm
+
+        filename = "input.pdf"
+        doc = pymupdf.open(filename)  # use a Document for subsequent processing
+        my_headers = pymupdf4llm.IdentifyHeaders(doc, max_levels=3)  # generate header info
+        md_text = pymupdf4llm.to_markdown(doc, hdr_info=my_headers)
+
+
+    **How to provide your own header logic (example 1)**
+    
+    Provide your own function which uses pre-determined, fixed font sizes::
+
+        import pymupdf, pymupdf4llm
+
+        filename = "input.pdf"
+        doc = pymupdf.open(filename)  # use a Document for subsequent processing
+
+        def my_headers(span, page=None):
+            """
+            Provide some custom header logic.
+            This is a callable which accepts a text span and the page.
+            Could be extended to check for other properties of the span, for
+            instance the font name, text color and other attributes.
+            """
+            # header level is h1 if font size is larger than 14
+            # header level is h2 if font size is larger than 10
+            # otherwise it is body text
+            if span["size"] > 14:
+                return "# "
+            elif span["size"] > 10:
+                return "## "
+            else:
+                return ""
+        
+        # this will *NOT* scan the document for font sizes!
+        md_text = pymupdf4llm.to_markdown(doc, hdr_info=my_headers)
+
+    **How to provide your own header logic (example 2)**
+    
+    This user function uses the document's Table of Contents -- under the assumption that the bookmark text is also present as a header line on the page (which certainly need not be the case!)::
+
+        import pymupdf, pymupdf4llm
+
+        filename = "input.pdf"
+        doc = pymupdf.open(filename)  # use a Document for subsequent processing
+        TOC = doc.get_toc()  # get the table of contents for determining headers
+
+        def my_headers(span, page=None):
+            """
+            Provide some custom header logic (experimental!).
+            This callable checks whether the span text matches any of the
+            TOC titles on this page.
+            If so, use TOC hierarchy level as header level.
+            """
+            # TOC items on this page:
+            toc = [t for t in TOC if t[-1] == page.number + 1]
+
+            if not toc:  # no TOC items on this page
+                return ""
+
+            # look for a match in the TOC items
+            for lvl, title, _ in toc:
+                if span["text"].startswith(title):
+                    return "#" * lvl + " "
+                if title.startswith(span["text"]):
+                    return "#" * lvl + " "
+            
+            return ""
+        
+        # this will *NOT* scan the document for font sizes!
+        md_text = pymupdf4llm.to_markdown(doc, hdr_info=my_headers)
+
+----
+
+
 .. class:: pdf_markdown_reader.PDFMarkdownReader
 
     .. method:: load_data(file_path: Union[Path, str], extra_info: Optional[Dict] = None, **load_kwargs: Any) -> List[LlamaIndexDocument]
@@ -115,6 +242,8 @@ For a list of changes, please see file `CHANGES.md <https://github.com/pymupdf/R
 
 .. [#f1] `LlamaIndex documentation <https://docs.llamaindex.ai/en/stable/>`_
 
+.. [#f2] When using PyMuPDF-Pro, supported office documents are converted internally into a PDF-like format. Therefore, they **will have fixed page dimensions** and be no longer "reflowable". Consequently, the page width and page height specifications will be ignored as well in these cases.
+
 
 
 
diff --git a/docs/pymupdf4llm/index.rst b/docs/pymupdf4llm/index.rst
@@ -142,10 +142,4 @@ Blogs
 - `Building a RAG Chatbot GUI with the ChatGPT API and PyMuPDF <https://artifex.com/blog/building-a-rag-chatbot-gui-with-the-chatgpt-api-and-pymupdf>`_
 - `RAG/LLM and PDF: Conversion to Markdown Text with PyMuPDF <https://artifex.com/blog/rag-llm-and-pdf-conversion-to-markdown-text-with-pymupdf>`_
 
-
-
-
-
-
-
 .. include:: ../footer.rst
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -13,6 +13,8 @@ This tutorial will show you the use of |PyMuPDF|, :title:`MuPDF` in :title:`Pyth
 
 Because :title:`MuPDF` supports not only PDF, but also XPS, OpenXPS, CBZ, CBR, FB2 and EPUB formats, so does PyMuPDF [#f1]_. Nevertheless, for the sake of brevity we will only talk about PDF files. At places where indeed only PDF files are supported, this will be mentioned explicitly.
 
+In addition to this introduction, please do visit PyMuPDF's `YouTube Channel <https://www.youtube.com/@PyMuPDF>`_ which covers most of the following in the form of YouTube "Shorts" and longer videos.
+
 Importing the Bindings
 ==========================
 The Python bindings to MuPDF are made available by this import statement. We also show here how your version can be checked::
diff --git a/src/__init__.py b/src/__init__.py
@@ -2066,12 +2066,12 @@ def __enter__(self):
     def __exit__(self, *args):
         pass
 
-    def __init__( self, rhs):
-        if isinstance( rhs, mupdf.FzXml):
+    def __init__(self, rhs):
+        if isinstance(rhs, mupdf.FzXml):
             self.this = rhs
-        elif isinstance( str):
-            buff = mupdf.fz_new_buffer_from_copied_data( rhs)
-            self.this = mupdf.fz_parse_xml_from_html5( buff)
+        elif isinstance(rhs, str):
+            buff = mupdf.fz_new_buffer_from_copied_data(rhs)
+            self.this = mupdf.fz_parse_xml_from_html5(buff)
         else:
             assert 0, f'Unsupported type for rhs: {type(rhs)}'
     
@@ -6642,7 +6642,7 @@ def uri(self):
 class Matrix:
 
     def __abs__(self):
-        return math.sqrt(sum([c*c for c in self]))
+        return (sum([c*c for c in self])) ** 0.5
 
     def __add__(self, m):
         if hasattr(m, "__float__"):
@@ -10638,7 +10638,7 @@ def __del__(self):
 class Point:
 
     def __abs__(self):
-        return math.sqrt(self.x * self.x + self.y * self.y)
+        return (self.x * self.x + self.y * self.y) ** 0.5
 
     def __add__(self, p):
         if hasattr(p, "__float__"):
@@ -10749,7 +10749,7 @@ def abs_unit(self):
         s = self.x * self.x + self.y * self.y
         if s < EPSILON:
             return Point(0,0)
-        s = math.sqrt(s)
+        s = s ** 0.5
         return Point(abs(self.x) / s, abs(self.y) / s)
 
     def distance_to(self, *args):
@@ -10815,7 +10815,7 @@ def unit(self):
         s = self.x * self.x + self.y * self.y
         if s < EPSILON:
             return Point(0,0)
-        s = math.sqrt(s)
+        s = s ** 0.5
         return Point(self.x / s, self.y / s)
 
     __div__ = __truediv__
@@ -11276,7 +11276,7 @@ def morph(self, p, m):
         return self.quad.morph(p, m)
 
     def norm(self):
-        return math.sqrt(sum([c*c for c in self]))
+        return (sum([c*c for c in self])) ** 0.5
 
     def normalize(self):
         """Replace rectangle with its finite version."""
@@ -13222,7 +13222,7 @@ def morph(self, p, m):
         return self.quad.morph(p, m)
 
     def norm(self):
-        return math.sqrt(sum([c*c for c in self]))
+        return (sum([c*c for c in self])) ** 0.5
 
     def normalize(self):
         """Replace rectangle with its valid version."""
@@ -18664,7 +18664,7 @@ def jm_trace_text_span(dev, span, type_, ctm, colorspace, color, alpha, seqno):
     
     mat = mupdf.fz_concat(span.trm(), ctm)  # text transformation matrix
     dir = mupdf.fz_transform_vector(mupdf.fz_make_point(1, 0), mat) # writing direction
-    fsize = math.sqrt(dir.x * dir.x + dir.y * dir.y) # font size
+    fsize = (dir.x * dir.x + dir.y * dir.y) ** 0.5 # font size
 
     dir = mupdf.fz_normalize_vector(dir)