Skip to content

Commit ff1bdbb

Browse files
JorjMcKiejulian-smith-artifex-com
authored andcommitted
Re-Implement select method
This re-implements the Document method "select()" based on the new MuPDF function "pdf_rearrange_pages()". This is a more complete (and faster) implementation of what needs to be done here: not only are the pages rearranged, but consequential changes are also made to the table of contents, to links pointing to removed pages, and to affected entries in the Optional Content definitions. Update __init__.py
1 parent 0fd9594 commit ff1bdbb

File tree

7 files changed

+71
-447
lines changed

7 files changed

+71
-447
lines changed

changes.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@ Change Log
22
==========
33

44

5+
**Changes in version 1.23.23 (2024-02-14)**
6+
7+
* Fixed issues:
8+
9+
* **Fixed** `3150 <https://github.com/pymupdf/PyMuPDF/issues/3150>`_: doc.select() hangs on this doc.
10+
11+
12+
* Other:
13+
14+
* Replaced major code portions previously supporting `Document.select()` with the MuPDF function `pdf_rearrange_pages()`, which is faster and performs that task more thoroughly.
15+
16+
517
**Changes in version 1.23.22 (2024-02-12)**
618

719
* Fixed issues:

src/__init__.py

Lines changed: 13 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -5385,16 +5385,21 @@ def select(self, pyliste):
53855385
raise ValueError("is no PDF")
53865386
if not hasattr(pyliste, "__getitem__"):
53875387
raise ValueError("sequence required")
5388-
if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):
5388+
5389+
valid_range = range(len(self))
5390+
if (len(pyliste) == 0
5391+
or min(pyliste) not in valid_range
5392+
or max(pyliste) not in valid_range
5393+
):
53895394
raise ValueError("bad page number(s)")
5390-
# preparatory stuff:
5391-
# (1) get underlying pdf document,
5392-
# (2) transform Python list into integer array
5395+
5396+
# get underlying pdf document,
53935397
pdf = _as_pdf_document(self)
5394-
# call retainpages (code copy of fz_clean_file.c)
5395-
retainpages(pdf, pyliste)
5396-
if pdf.m_internal.rev_page_map:
5397-
mupdf.ll_pdf_drop_page_tree(pdf.m_internal)
5398+
5399+
# create page sub-pdf via extra.rearrange_pages2
5400+
extra.rearrange_pages2(pdf, tuple(pyliste))
5401+
5402+
# remove any existing pages with their kids
53985403
self._reset_page_refs()
53995404

54005405
def set_language(self, language=None):
@@ -20862,116 +20867,6 @@ def repair_mono_font(page: "Page", font: "Font") -> None:
2086220867
log("Cannot set width for '%s' in xref %i" % (font.name, xref))
2086320868

2086420869

20865-
def retainpage(doc, parent, kids, page):
20866-
'''
20867-
Recreate page tree to only retain specified pages.
20868-
'''
20869-
pageref = mupdf.pdf_lookup_page_obj(doc, page)
20870-
mupdf.pdf_flatten_inheritable_page_items(pageref)
20871-
mupdf.pdf_dict_put(pageref, PDF_NAME('Parent'), parent)
20872-
# Store page object in new kids array
20873-
mupdf.pdf_array_push(kids, pageref)
20874-
20875-
20876-
def retainpages(doc, liste):
20877-
'''
20878-
This is called by PyMuPDF:
20879-
liste = page numbers to retain
20880-
'''
20881-
argc = len(liste)
20882-
pagecount = mupdf.pdf_count_pages(doc)
20883-
20884-
# Keep only pages/type and (reduced) dest entries to avoid
20885-
# references to dropped pages
20886-
oldroot = mupdf.pdf_dict_get(mupdf.pdf_trailer(doc), PDF_NAME('Root'))
20887-
pages = mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages'))
20888-
olddests = mupdf.pdf_load_name_tree(doc, PDF_NAME('Dests'))
20889-
outlines = mupdf.pdf_dict_get(oldroot, PDF_NAME('Outlines'))
20890-
ocproperties = mupdf.pdf_dict_get(oldroot, PDF_NAME('OCProperties'))
20891-
names_list = None
20892-
20893-
root = mupdf.pdf_new_dict(doc, 3)
20894-
mupdf.pdf_dict_put(root, PDF_NAME('Type'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Type')))
20895-
mupdf.pdf_dict_put(root, PDF_NAME('Pages'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages')))
20896-
if outlines.m_internal:
20897-
mupdf.pdf_dict_put(root, PDF_NAME('Outlines'), outlines)
20898-
if ocproperties.m_internal:
20899-
mupdf.pdf_dict_put(root, PDF_NAME('OCProperties'), ocproperties)
20900-
20901-
mupdf.pdf_update_object(doc, mupdf.pdf_to_num(oldroot), root)
20902-
20903-
# Create a new kids array with only the pages we want to keep
20904-
kids = mupdf.pdf_new_array(doc, 1)
20905-
20906-
# Retain pages specified
20907-
for page in range(argc):
20908-
i = liste[page]
20909-
if i < 0 or i >= pagecount:
20910-
RAISEPY(MSG_BAD_PAGENO, PyExc_ValueError)
20911-
retainpage(doc, pages, kids, i)
20912-
20913-
# Update page count and kids array
20914-
countobj = mupdf.pdf_new_int(mupdf.pdf_array_len(kids))
20915-
mupdf.pdf_dict_put(pages, PDF_NAME('Count'), countobj)
20916-
mupdf.pdf_dict_put(pages, PDF_NAME('Kids'), kids)
20917-
20918-
pagecount = mupdf.pdf_count_pages(doc)
20919-
page_object_nums = []
20920-
for i in range(pagecount):
20921-
pageref = mupdf.pdf_lookup_page_obj(doc, i)
20922-
page_object_nums.append(mupdf.pdf_to_num(pageref))
20923-
20924-
# If we had an old Dests tree (now reformed as an olddests dictionary),
20925-
# keep any entries in there that point to valid pages.
20926-
# This may mean we keep more than we need, but it is safe at least.
20927-
if olddests:
20928-
names = mupdf.pdf_new_dict(doc, 1)
20929-
dests = mupdf.pdf_new_dict(doc, 1)
20930-
len_ = mupdf.pdf_dict_len(olddests)
20931-
20932-
names_list = mupdf.pdf_new_array(doc, 32)
20933-
20934-
for i in range(len_):
20935-
key = mupdf.pdf_dict_get_key(olddests, i)
20936-
val = mupdf.pdf_dict_get_val(olddests, i)
20937-
dest = mupdf.pdf_dict_get(val, PDF_NAME('D'))
20938-
20939-
dest = mupdf.pdf_array_get(dest if dest.m_internal else val, 0)
20940-
# fixme: need dest_is_valid_page.
20941-
if dest_is_valid_page(dest, page_object_nums, pagecount):
20942-
key_str = mupdf.pdf_new_string(mupdf.pdf_to_name(key), len(mupdf.pdf_to_name(key)))
20943-
mupdf.pdf_array_push(names_list, key_str)
20944-
mupdf.pdf_array_push(names_list, val)
20945-
20946-
mupdf.pdf_dict_put(dests, PDF_NAME('Names'), names_list)
20947-
mupdf.pdf_dict_put(names, PDF_NAME('Dests'), dests)
20948-
mupdf.pdf_dict_put(root, PDF_NAME('Names'), names)
20949-
20950-
# Edit each pages /Annot list to remove any links pointing to nowhere.
20951-
for i in range(pagecount):
20952-
pageref = mupdf.pdf_lookup_page_obj(doc, i)
20953-
annots = mupdf.pdf_dict_get(pageref, PDF_NAME('Annots'))
20954-
len_ = mupdf.pdf_array_len(annots)
20955-
j = 0
20956-
while 1:
20957-
if j >= len_:
20958-
break
20959-
o = mupdf.pdf_array_get(annots, j)
20960-
20961-
if not mupdf.pdf_name_eq(mupdf.pdf_dict_get(o, PDF_NAME('Subtype')), PDF_NAME('Link')):
20962-
continue
20963-
20964-
if not dest_is_valid(o, pagecount, page_object_nums, names_list):
20965-
# Remove this annotation
20966-
mupdf.pdf_array_delete(annots, j)
20967-
len_ -= 1
20968-
j -= 1
20969-
j += 1
20970-
20971-
if strip_outlines( doc, outlines, pagecount, page_object_nums, names_list) == 0:
20972-
mupdf.pdf_dict_del(root, PDF_NAME('Outlines'))
20973-
20974-
2097520870
def sRGB_to_pdf(srgb: int) -> tuple:
2097620871
"""Convert sRGB color code to a PDF color triple.
2097720872

src/extra.i

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,21 @@ PyObject* JM_EscapeStrFromBuffer(fz_buffer* buff)
166166
return val;
167167
}
168168

169+
// Rearrange the pages of `doc` by handing the page numbers contained in
// `new_pages` to mupdf::pdf_rearrange_pages().
//
// doc:        the PDF document to modify.
// new_pages:  Python tuple of integer page numbers. The Python caller
//             (Document.select) passes tuple(pyliste) after checking that the
//             sequence is non-empty and that its min/max are valid page numbers.
//
// The temporary page-number array is released on every exit path, including
// when pdf_rearrange_pages() throws; the exception is then re-thrown unchanged.
// Allocation failure surfaces as the exception thrown by operator new[]
// instead of a write through a NULL pointer.
void rearrange_pages2(
        mupdf::PdfDocument& doc,
        PyObject *new_pages
        )
{
    const int count = (int) PyTuple_Size(new_pages);
    // Never request a negative-sized array; `count` itself is still passed on
    // unchanged below so the callee sees the same value as before.
    int *pages = new int[count > 0 ? count : 0];
    try
    {
        for (int i = 0; i < count; i++)
        {
            // NOTE(review): PyLong_AsLong() signals failure by returning -1 with
            // a Python error set; that is not checked here - presumably the
            // caller guarantees integer items. TODO confirm.
            pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(new_pages, (Py_ssize_t) i));
        }
        mupdf::pdf_rearrange_pages(doc, count, pages);
    }
    catch (...)
    {
        // Do not leak the buffer when the MuPDF call fails.
        delete[] pages;
        throw;
    }
    delete[] pages;
}
183+
169184

170185
//----------------------------------------------------------------------------
171186
// Deep-copies a source page to the target.
@@ -4515,3 +4530,5 @@ fz_image* fz_new_image_from_compressed_buffer(
45154530
fz_compressed_buffer *buffer,
45164531
fz_image *mask
45174532
);
4533+
4534+
void rearrange_pages2( mupdf::PdfDocument& doc, PyObject *new_pages);

src_classic/fitz_old.i

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2297,7 +2297,8 @@ if not self.is_pdf:
22972297
if not hasattr(pyliste, "__getitem__"):
22982298
raise ValueError("sequence required")
22992299
if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):
2300-
raise ValueError("bad page number(s)")%}
2300+
raise ValueError("bad page number(s)")
2301+
pyliste = tuple(pyliste)%}
23012302
%pythonappend select %{self._reset_page_refs()%}
23022303
PyObject *select(PyObject *pyliste)
23032304
{
@@ -2306,17 +2307,23 @@ if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not
23062307
// (2) transform Python list into integer array
23072308

23082309
pdf_document *pdf = pdf_specifics(gctx, (fz_document *) $self);
2310+
int *pages = NULL;
23092311
fz_try(gctx) {
23102312
// call retainpages (code copy of fz_clean_file.c)
2311-
globals glo = {0};
2312-
glo.ctx = gctx;
2313-
glo.doc = pdf;
2314-
retainpages(gctx, &glo, pyliste);
2313+
int i, len = (int) PyTuple_Size(pyliste);
2314+
pages = fz_realloc_array(gctx, pages, len, int);
2315+
for (i = 0; i < len; i++) {
2316+
pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(pyliste, (Py_ssize_t) i));
2317+
}
2318+
pdf_rearrange_pages(gctx, pdf, len, pages);
23152319
if (pdf->rev_page_map)
23162320
{
23172321
pdf_drop_page_tree(gctx, pdf);
23182322
}
23192323
}
2324+
fz_always(gctx) {
2325+
fz_free(gctx, pages);
2326+
}
23202327
fz_catch(gctx) {
23212328
return NULL;
23222329
}

0 commit comments

Comments
 (0)