Remove dependency on MuPDF version

JorjMcKie · JorjMcKie · commit f57cb6ba305f · 2025-05-02T07:54:43.000-04:00
Add all MuPDF STEXT flags up to v1.26.0 to PyMuPDF.
Use hard-coded values for flags that are unknown in an earlier MuPDF version that we still want / need to support.
The intention is to switch to MuPDF's symbolic names as soon as we drop support for the corresponding version.
Flag bits representing current MuPDF features can always be used because they are ignored by older MuPDF versions.

Also removed some duplicate definitions.
diff --git a/docs/vars.rst b/docs/vars.rst
@@ -253,7 +253,7 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si
 
 .. py:data:: TEXT_COLLECT_STRUCTURE
 
-    256 -- Not supported.
+    256 -- Not supported yet.
 
 .. py:data:: TEXT_ACCURATE_BBOXES
 
@@ -264,17 +264,45 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si
 
 .. py:data:: TEXT_COLLECT_VECTORS
 
-    1024 -- Not supported.
+    1024 -- Not supported yet.
 
 .. py:data:: TEXT_IGNORE_ACTUALTEXT
 
-    2048 -- Ignore built-in differences between text appearing in e.g. PDF viewers versus text stored in the PDF. See :ref:`AdobeManual`, page 615 for background. If set, the **stored** ("replacement" text) is ignored in favor of the displayed text.
+    2048 -- Ignore built-in differences between text appearing in e.g. PDF viewers versus text stored in the PDF. See :ref:`AdobeManual`, page 615 for background. If set, the **stored** ("replacement" text) is ignored in favor of the **displayed** text.
 
 .. py:data:: TEXT_STEXT_SEGMENT
 
     4096 -- Attempt to segment page into different regions.
 
-The following constants represent the default combinations of the above for text extraction and searching:
+.. py:data:: TEXT_STEXT_PARAGRAPH_BREAK
+
+    8192 -- Not supported yet.
+
+.. py:data:: TEXT_STEXT_TABLE_HUNT
+
+    16384 -- Not supported yet.
+
+.. py:data:: TEXT_COLLECT_STYLES
+
+    32768 -- Detect underlined and strikeout text. Also detect and handle faked bold text in most cases.
+
+.. py:data:: TEXT_GID_FOR_UNKNOWN_UNICODE
+
+    65536 -- An alternative to `TEXT_CID_FOR_UNKNOWN_UNICODE` that uses the GID (glyph ID) instead of the CID (character ID). Both flags should never be used together, because results are undefined.
+
+.. py:data:: TEXT_CLIP_RECT
+
+    1 << 17 -- Not supported yet.
+
+.. py:data:: TEXT_ACCURATE_ASCENDERS
+
+    1 << 18 -- Not supported yet.
+
+.. py:data:: TEXT_ACCURATE_SIDE_BEARINGS
+
+    1 << 19 -- Not supported yet.
+
+The following constants represent default combinations of the above for text extraction and searching:
 
 .. py:data:: TEXTFLAGS_TEXT
 
diff --git a/src/__init__.py b/src/__init__.py
@@ -13516,18 +13516,18 @@ def width(self):
 TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS
 TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP
 TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
-if mupdf_version_tuple >= (1, 25):
-    TEXT_COLLECT_STRUCTURE = mupdf.FZ_STEXT_COLLECT_STRUCTURE
-    TEXT_ACCURATE_BBOXES = mupdf.FZ_STEXT_ACCURATE_BBOXES
-    TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS
-    TEXT_IGNORE_ACTUALTEXT = mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
-    TEXT_STEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT
-else:
-    TEXT_COLLECT_STRUCTURE = 256
-    TEXT_ACCURATE_BBOXES = 512
-    TEXT_COLLECT_VECTORS = 1024
-    TEXT_IGNORE_ACTUALTEXT = 2048
-    TEXT_STEXT_SEGMENT = 4096
+TEXT_COLLECT_STRUCTURE = 256          # mupdf.FZ_STEXT_COLLECT_STRUCTURE
+TEXT_ACCURATE_BBOXES = 512            # mupdf.FZ_STEXT_ACCURATE_BBOXES
+TEXT_COLLECT_VECTORS = 1024           # mupdf.FZ_STEXT_COLLECT_VECTORS
+TEXT_IGNORE_ACTUALTEXT = 2048         # mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
+TEXT_STEXT_SEGMENT = 4096             # mupdf.FZ_STEXT_SEGMENT
+TEXT_STEXT_PARAGRAPH_BREAK = 8192     # mupdf.FZ_STEXT_PARAGRAPH_BREAK
+TEXT_STEXT_TABLE_HUNT = 16384         # mupdf.FZ_STEXT_TABLE_HUNT
+TEXT_COLLECT_STYLES = 32768           # mupdf.FZ_STEXT_COLLECT_STYLES
+TEXT_GID_FOR_UNKNOWN_UNICODE = 65536  # mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
+TEXT_CLIP_RECT =  1 << 17             # mupdf.FZ_STEXT_CLIP_RECT
+TEXT_ACCURATE_ASCENDERS = 1 << 18     # mupdf.FZ_STEXT_ACCURATE_ASCENDERS
+TEXT_ACCURATE_SIDE_BEARINGS = 1 << 19 # mupdf.FZ_STEXT_ACCURATE_SIDE_BEARINGS
 
 TEXTFLAGS_WORDS = (0
         | TEXT_PRESERVE_LIGATURES
@@ -13620,14 +13620,6 @@ def width(self):
 PDF_BM_Screen = "Screen"
 PDF_BM_SoftLight = "Softlight"
 
-# General text flags
-TEXT_FONT_SUPERSCRIPT = 1
-TEXT_FONT_ITALIC = 2
-TEXT_FONT_SERIFED = 4
-TEXT_FONT_MONOSPACED = 8
-TEXT_FONT_BOLD = 16
-
-
 annot_skel = {
         "goto1": lambda a, b, c, d, e: f"<</A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>/Rect[{e}]/BS<</W 0>>/Subtype/Link>>",
         "goto2": lambda a, b: f"<</A<</S/GoTo/D{a}>>/Rect[{b}]/BS<</W 0>>/Subtype/Link>>",