Skip to content

Commit cbc0cad

Browse files
Updated some TEXT_* constants.
Renamed two TEXT_ constants for consistency with MuPDF, but preserved the old names too: TEXT_CID_FOR_UNKNOWN_UNICODE => TEXT_USE_CID_FOR_UNKNOWN_UNICODE; TEXT_STEXT_SEGMENT => TEXT_SEGMENT. Added some new TEXT_ constants, defined only when using MuPDF 1.26 or later: TEXT_PARAGRAPH_BREAK, TEXT_TABLE_HUNT, TEXT_COLLECT_STYLES, TEXT_USE_GID_FOR_UNKNOWN_UNICODE, TEXT_CLIP_RECT, TEXT_ACCURATE_ASCENDERS, TEXT_ACCURATE_SIDE_BEARINGS. Removed duplicate definitions of TEXT_FONT_* constants.
1 parent 93be238 commit cbc0cad

File tree

4 files changed

+37
-31
lines changed

4 files changed

+37
-31
lines changed

docs/locales/ja/LC_MESSAGES/vars.po

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -553,15 +553,15 @@ msgstr "以下の定数は、テキスト抽出と検索のための上記のデ
553553
#: d81bde381b3b43d08761afe43ea6dc5e dd264103844e4a3eb274501e5b205714
554554
msgid ""
555555
"`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP "
556-
"| TEXT_CID_FOR_UNKNOWN_UNICODE`"
556+
"| TEXT_USE_CID_FOR_UNKNOWN_UNICODE`"
557557
msgstr ""
558558

559559
#: ../../vars.rst:245 ../../vars.rst:249 ../../vars.rst:253 ../../vars.rst:257
560560
#: 12741fdffb57448cafbcf1fc3e55f15a 7dc1f5cf362a43a2a12062f475fb2b8b
561561
#: a59651b1c41c4785b081fbdcb08378e5 fc6dcf6ae5c445d9b8bac3bb88627aa8
562562
msgid ""
563563
"`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP "
564-
"| TEXT_PRESERVE_IMAGES | TEXT_CID_FOR_UNKNOWN_UNICODE`"
564+
"| TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`"
565565
msgstr ""
566566

567567
#: ../../vars.rst:265 a0dbba8829ba49d8b201c159e50c25ed

docs/vars.rst

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si
247247
248248
64 -- Characters entirely outside a page's **mediabox** or contained in other "clipped" areas will be ignored. This is default in PyMuPDF.
249249

250-
.. py:data:: TEXT_CID_FOR_UNKNOWN_UNICODE
250+
.. py:data:: TEXT_USE_CID_FOR_UNKNOWN_UNICODE
251251
252252
128 -- Use raw character codes instead of U+FFFD. This is the default for **text extraction** in PyMuPDF. If you **want to detect** when encoding information is missing or uncertain, toggle this flag and scan for the presence of U+FFFD (= `chr(0xfffd)`) code points in the resulting text.
253253

@@ -270,43 +270,43 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si
270270
271271
2048 -- Ignore built-in differences between text appearing in e.g. PDF viewers versus text stored in the PDF. See :ref:`AdobeManual`, page 615 for background. If set, the **stored** ("replacement" text) is ignored in favor of the displayed text.
272272

273-
.. py:data:: TEXT_STEXT_SEGMENT
273+
.. py:data:: TEXT_SEGMENT
274274
275275
4096 -- Attempt to segment page into different regions.
276276

277277
The following constants represent the default combinations of the above for text extraction and searching:
278278

279279
.. py:data:: TEXTFLAGS_TEXT
280280
281-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_CID_FOR_UNKNOWN_UNICODE`
281+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
282282

283283
.. py:data:: TEXTFLAGS_WORDS
284284
285-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_CID_FOR_UNKNOWN_UNICODE`
285+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
286286

287287
.. py:data:: TEXTFLAGS_BLOCKS
288288
289-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_CID_FOR_UNKNOWN_UNICODE`
289+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
290290

291291
.. py:data:: TEXTFLAGS_DICT
292292
293-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_CID_FOR_UNKNOWN_UNICODE`
293+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
294294

295295
.. py:data:: TEXTFLAGS_RAWDICT
296296
297-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_CID_FOR_UNKNOWN_UNICODE`
297+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
298298

299299
.. py:data:: TEXTFLAGS_HTML
300300
301-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_CID_FOR_UNKNOWN_UNICODE`
301+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
302302

303303
.. py:data:: TEXTFLAGS_XHTML
304304
305-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_CID_FOR_UNKNOWN_UNICODE`
305+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
306306

307307
.. py:data:: TEXTFLAGS_XML
308308
309-
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_CID_FOR_UNKNOWN_UNICODE`
309+
`TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE`
310310

311311
.. py:data:: TEXTFLAGS_SEARCH
312312

src/__init__.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13468,33 +13468,46 @@ def width(self):
1346813468
TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE
1346913469
TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS
1347013470
TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP
13471-
TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
13471+
TEXT_USE_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
1347213472
TEXT_COLLECT_STRUCTURE = mupdf.FZ_STEXT_COLLECT_STRUCTURE
1347313473
TEXT_ACCURATE_BBOXES = mupdf.FZ_STEXT_ACCURATE_BBOXES
1347413474
TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS
1347513475
TEXT_IGNORE_ACTUALTEXT = mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
13476-
TEXT_STEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT
13476+
TEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT
13477+
13478+
if mupdf_version_tuple >= (1, 26):
13479+
TEXT_PARAGRAPH_BREAK = mupdf.FZ_STEXT_PARAGRAPH_BREAK
13480+
TEXT_TABLE_HUNT = mupdf.FZ_STEXT_TABLE_HUNT
13481+
TEXT_COLLECT_STYLES = mupdf.FZ_STEXT_COLLECT_STYLES
13482+
TEXT_USE_GID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
13483+
TEXT_CLIP_RECT = mupdf.FZ_STEXT_CLIP_RECT
13484+
TEXT_ACCURATE_ASCENDERS = mupdf.FZ_STEXT_ACCURATE_ASCENDERS
13485+
TEXT_ACCURATE_SIDE_BEARINGS = mupdf.FZ_STEXT_ACCURATE_SIDE_BEARINGS
13486+
13487+
# 2025-05-07: Non-standard names preserved for backwards compatibility.
13488+
TEXT_STEXT_SEGMENT = TEXT_SEGMENT
13489+
TEXT_CID_FOR_UNKNOWN_UNICODE = TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1347713490

1347813491
TEXTFLAGS_WORDS = (0
1347913492
| TEXT_PRESERVE_LIGATURES
1348013493
| TEXT_PRESERVE_WHITESPACE
1348113494
| TEXT_MEDIABOX_CLIP
13482-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13495+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1348313496
)
1348413497

1348513498
TEXTFLAGS_BLOCKS = (0
1348613499
| TEXT_PRESERVE_LIGATURES
1348713500
| TEXT_PRESERVE_WHITESPACE
1348813501
| TEXT_MEDIABOX_CLIP
13489-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13502+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1349013503
)
1349113504

1349213505
TEXTFLAGS_DICT = (0
1349313506
| TEXT_PRESERVE_LIGATURES
1349413507
| TEXT_PRESERVE_WHITESPACE
1349513508
| TEXT_MEDIABOX_CLIP
1349613509
| TEXT_PRESERVE_IMAGES
13497-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13510+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1349813511
)
1349913512

1350013513
TEXTFLAGS_RAWDICT = TEXTFLAGS_DICT
@@ -13503,37 +13516,37 @@ def width(self):
1350313516
| TEXT_PRESERVE_WHITESPACE
1350413517
| TEXT_MEDIABOX_CLIP
1350513518
| TEXT_DEHYPHENATE
13506-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13519+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1350713520
)
1350813521

1350913522
TEXTFLAGS_HTML = (0
1351013523
| TEXT_PRESERVE_LIGATURES
1351113524
| TEXT_PRESERVE_WHITESPACE
1351213525
| TEXT_MEDIABOX_CLIP
1351313526
| TEXT_PRESERVE_IMAGES
13514-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13527+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1351513528
)
1351613529

1351713530
TEXTFLAGS_XHTML = (0
1351813531
| TEXT_PRESERVE_LIGATURES
1351913532
| TEXT_PRESERVE_WHITESPACE
1352013533
| TEXT_MEDIABOX_CLIP
1352113534
| TEXT_PRESERVE_IMAGES
13522-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13535+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1352313536
)
1352413537

1352513538
TEXTFLAGS_XML = (0
1352613539
| TEXT_PRESERVE_LIGATURES
1352713540
| TEXT_PRESERVE_WHITESPACE
1352813541
| TEXT_MEDIABOX_CLIP
13529-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13542+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1353013543
)
1353113544

1353213545
TEXTFLAGS_TEXT = (0
1353313546
| TEXT_PRESERVE_LIGATURES
1353413547
| TEXT_PRESERVE_WHITESPACE
1353513548
| TEXT_MEDIABOX_CLIP
13536-
| TEXT_CID_FOR_UNKNOWN_UNICODE
13549+
| TEXT_USE_CID_FOR_UNKNOWN_UNICODE
1353713550
)
1353813551

1353913552
# Simple text encoding options
@@ -13566,13 +13579,6 @@ def width(self):
1356613579
PDF_BM_Screen = "Screen"
1356713580
PDF_BM_SoftLight = "Softlight"
1356813581

13569-
# General text flags
13570-
TEXT_FONT_SUPERSCRIPT = 1
13571-
TEXT_FONT_ITALIC = 2
13572-
TEXT_FONT_SERIFED = 4
13573-
TEXT_FONT_MONOSPACED = 8
13574-
TEXT_FONT_BOLD = 16
13575-
1357613582

1357713583
annot_skel = {
1357813584
"goto1": lambda a, b, c, d, e: f"<</A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>/Rect[{e}]/BS<</W 0>>/Subtype/Link>>",

tests/test_textextract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def get(flags=None):
197197
text_none, n_fffd_none = get()
198198
text_0, n_fffd_0 = get(flags0)
199199

200-
text_1, n_fffd_1 = get(flags0 | pymupdf.TEXT_CID_FOR_UNKNOWN_UNICODE)
200+
text_1, n_fffd_1 = get(flags0 | pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE)
201201

202202
assert n_fffd_none == n_fffd_good
203203
assert n_fffd_0 == n_fffd_bad
@@ -434,7 +434,7 @@ def test_4139():
434434
flags = (0
435435
| pymupdf.TEXT_PRESERVE_IMAGES
436436
| pymupdf.TEXT_PRESERVE_WHITESPACE
437-
| pymupdf.TEXT_CID_FOR_UNKNOWN_UNICODE
437+
| pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE
438438
)
439439
with pymupdf.open(path) as document:
440440
page = document[0]

0 commit comments

Comments
 (0)