@@ -1683,15 +1683,185 @@ def do_widgets(
16831683 from_page : int = - 1 ,
16841684 to_page : int = - 1 ,
16851685 start_at : int = - 1 ,
1686+ join_duplicates = 0 ,
16861687) -> None :
1687- """Insert widgets contained in copied page range into destination PDF.
1688+ """Insert widgets of copied page range into target PDF.
16881689
1689- Parameter values **must** equal those of method insert_pdf(). Method
1690- insert_pdf() which must have been previously executed.
1690+ Parameter values **must** equal those of method insert_pdf() which
1691+ must have been previously executed.
16911692 """
16921693 if not src .is_form_pdf : # nothing to do: source PDF has no fields
16931694 return
16941695
def clean_kid_parents(acro_fields):
    """Re-point the "Parent" entry of every kid at its owning field.

    Args:
        acro_fields: the array object Root/AcroForm/Fields.

    For each entry of the array, every item found in that entry's "Kids"
    array gets its "Parent" key overwritten with the entry itself.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    parent_key = pymupdf.PDF_NAME("Parent")

    for field_no in range(acro_fields.pdf_array_len()):
        field = acro_fields.pdf_array_get(field_no)
        field_kids = field.pdf_dict_get(kids_key)
        kid_count = field_kids.pdf_array_len()
        if not kid_count:  # nothing to re-point for this field
            continue
        for kid_no in range(kid_count):
            field_kids.pdf_array_get(kid_no).pdf_dict_put(parent_key, field)
1704+
def join_widgets(pdf, acro_fields, xref1, xref2, name):
    """Combine two equally named widgets under one common parent field.

    Called for each pair of widgets having the same name.

    Args:
        pdf: target MuPDF document.
        acro_fields: the object Root/AcroForm/Fields.
        xref1, xref2: xrefs of the two widgets sharing the name.
        name: (str) the shared field name.

    Result:
        If one of the widgets already owns a "Kids" array, the other widget
        (or its kids) is appended to that array. Otherwise a new parent
        object carrying the name is created and both widgets become its kids.
    """
    key_t = pymupdf.PDF_NAME("T")
    key_kids = pymupdf.PDF_NAME("Kids")
    key_parent = pymupdf.PDF_NAME("Parent")

    def drop_from_fields(ind_obj):
        """Delete the array item matching 'ind_obj' from "AcroForm/Fields"."""
        acro_fields.pdf_array_delete(acro_fields.pdf_array_find(ind_obj))

    def re_target(tar_xref, tar_kids, src_xref, src_kids):
        """Merge the source widget into the "Kids" array of the target.

        Args:
            tar_xref, tar_kids: target widget and its "Kids" array.
            src_xref, src_kids: source widget and its "Kids" entry
                (which need not be an array).
        """
        # indirect references for both widgets
        tar_ind = mupdf.pdf_new_indirect(pdf, tar_xref, 0)
        src_ind = mupdf.pdf_new_indirect(pdf, src_xref, 0)

        # the source widget no longer is a member of "Fields"
        drop_from_fields(src_ind)

        if src_kids.pdf_is_array():
            # source has kids: each one moves over to the target's kids
            for pos in range(src_kids.pdf_array_len()):
                kid = src_kids.pdf_array_get(pos)
                kid.pdf_dict_put(key_parent, tar_ind)
                tar_kids.pdf_array_push(
                    mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                )
            return

        # source has no kids: it becomes a kid of the target itself,
        # losing its own name and pointing to the target as parent
        src_widget = mupdf.pdf_load_object(pdf, src_xref)
        src_widget.pdf_dict_del(key_t)
        src_widget.pdf_dict_put(key_parent, tar_ind)
        tar_kids.pdf_array_push(src_ind)

    def new_target(widget1, widget2):
        """Create a new parent object for two widgets without "Kids".

        The new object receives the name and a "Kids" array holding both
        widgets. "FT" and "AA" are moved from the first widget to the new
        object. Both widgets are removed from "AcroForm/Fields", the new
        object is appended there.
        """
        # the new parent: name plus (still empty) "Kids" array
        parent_dict = mupdf.pdf_new_dict(pdf, 5)
        parent_dict.pdf_dict_put_text_string(key_t, name)
        parent_kids = parent_dict.pdf_dict_put_array(key_kids, 2)
        parent_obj = mupdf.pdf_add_object(pdf, parent_dict)
        parent_ind = mupdf.pdf_new_indirect(pdf, parent_obj.pdf_to_num(), 0)

        # move these properties from the first widget to the parent
        for prop in ("FT", "AA"):
            prop_key = pymupdf.PDF_NAME(prop)
            value = widget1.pdf_dict_get(prop_key)
            widget1.pdf_dict_del(prop_key)
            parent_obj.pdf_dict_put(prop_key, value)

        # widgets give up their name and refer to the new parent
        for widget in (widget1, widget2):
            widget.pdf_dict_del(key_t)
            widget.pdf_dict_put(key_parent, parent_ind)

        # widgets enter the parent's "Kids" and leave "AcroForm/Fields"
        kid_inds = [mupdf.pdf_new_indirect(pdf, x, 0) for x in (xref1, xref2)]
        for kid_ind in kid_inds:
            parent_kids.pdf_array_push(kid_ind)
        for kid_ind in kid_inds:
            drop_from_fields(kid_ind)

        acro_fields.pdf_array_push(parent_ind)

    widget1 = mupdf.pdf_load_object(pdf, xref1)
    widget2 = mupdf.pdf_load_object(pdf, xref2)
    kids1 = widget1.pdf_dict_get(key_kids)
    kids2 = widget2.pdf_dict_get(key_kids)

    # prefer a widget that already owns a "Kids" array as the target
    if kids1.pdf_is_array():
        re_target(xref1, kids1, xref2, kids2)
    elif kids2.pdf_is_array():
        re_target(xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
    else:
        new_target(widget1, widget2)
1810+
def get_kids(parent, kids_list):
    """Return the xref list of leaf kids below a parent field.

    Args:
        parent: PDF object whose "Kids" array is walked.
        kids_list: list receiving the xrefs; call with an empty list.
            It is extended in place and is also the return value.

    Returns:
        'kids_list', extended by the xref of every kid that has no "Kids"
        array of its own. Kids that do own a "Kids" array are descended
        into instead of being recorded.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    kids = mupdf.pdf_dict_get(parent, kids_key)
    if not kids.pdf_is_array():
        return kids_list
    for i in range(kids.pdf_array_len()):
        kid = kids.pdf_array_get(i)
        # "Kids" is an array. Testing it with pdf_is_dict never matched, so
        # intermediate nodes were recorded as if they were leaf widgets.
        if mupdf.pdf_is_array(mupdf.pdf_dict_get(kid, kids_key)):
            get_kids(kid, kids_list)
        else:
            kids_list.append(kid.pdf_to_num())
    return kids_list
1826+
def kids_xrefs(widget):
    """Return the xref of a widget's "Parent" and that parent's leaf kids.

    Args:
        widget: PDF object of the widget.

    Returns:
        Tuple (parent_xref, kids_list). If the "Parent" lookup yields
        xref 0, the list is empty; otherwise it is what get_kids()
        delivers for the parent.
    """
    parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
    parent_xref = parent.pdf_to_num()
    leaf_xrefs = get_kids(parent, []) if parent_xref else []
    return parent_xref, leaf_xrefs
1836+
def deduplicate_names(pdf, acro_fields, join_duplicates=False):
    """Handle any widget name duplicates caused by the merge.

    Args:
        pdf: target MuPDF document.
        acro_fields: the array object Root/AcroForm/Fields.
        join_duplicates: if true, the first two fields sharing a name are
            combined via join_widgets(). Otherwise every field after the
            first one sharing a name is renamed to "<name> [<xref>]".

    Finally, clean_kid_parents() is executed for the "Fields" array.

    NOTE(review): in join mode only the first two fields with a given name
    are combined; a third or later duplicate is left untouched.
    """
    t_key = pymupdf.PDF_NAME("T")
    names = {}  # key is a field name, value the list of xrefs using it

    # collect names and xrefs of all items in "AcroForm/Fields"
    for i in range(mupdf.pdf_array_len(acro_fields)):
        wobject = mupdf.pdf_array_get(acro_fields, i)
        field_name = mupdf.pdf_dict_get_text_string(wobject, t_key)
        names.setdefault(field_name, []).append(wobject.pdf_to_num())

    for name, xrefs in names.items():
        if len(xrefs) < 2:  # name is unique
            continue
        if join_duplicates:  # combine fields with equal names
            join_widgets(pdf, acro_fields, xrefs[0], xrefs[1], name)
            continue
        # make field names unique: the first field keeps the name, every
        # further one gets its own xref appended (not just the second one)
        for xref in xrefs[1:]:
            wobject = mupdf.pdf_load_object(pdf, xref)
            wobject.pdf_dict_put_text_string(t_key, f"{name} [{xref}]")

    clean_kid_parents(acro_fields)
1864+
16951865 def get_acroform (doc ):
16961866 """Retrieve the AcroForm dictionary form a PDF."""
16971867 pdf = mupdf .pdf_document_from_fz_document (doc )
@@ -1702,56 +1872,79 @@ def get_acroform(doc):
17021872 srcpdf = mupdf .pdf_document_from_fz_document (src )
17031873
17041874 if tar .is_form_pdf :
1705- # target is a Form PDF, so use its AcroForm to include source fields
1875+ # target is a Form PDF, so use it to include source fields
17061876 acro = get_acroform (tar )
1707- # Important arrays of indirect objects
1708- tar_fields = mupdf .pdf_dict_get (acro , pymupdf .PDF_NAME ("Fields" ))
1709- tar_co = mupdf .pdf_dict_get (acro , pymupdf .PDF_NAME ("CO" ))
1710- if not mupdf .pdf_is_array (tar_co ):
1711- tar_co = mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("CO" ), 5 )
1877+ # Important arrays in AcroForm
1878+ acro_fields = acro .pdf_dict_get (pymupdf .PDF_NAME ("Fields" ))
1879+ tar_co = acro .pdf_dict_get (pymupdf .PDF_NAME ("CO" ))
1880+ if not tar_co .pdf_is_array ():
1881+ tar_co = acro .pdf_dict_put_array (pymupdf .PDF_NAME ("CO" ), 5 )
17121882 else :
17131883 # target is no Form PDF, so copy over source AcroForm
17141884 acro = mupdf .pdf_deep_copy_obj (get_acroform (src )) # make a copy
17151885
17161886 # Clear "Fields" and "CO" arrays: will be populated by page fields.
17171887 # This is required to avoid copying unneeded objects.
1718- mupdf .pdf_dict_del (acro , pymupdf .PDF_NAME ("Fields" ))
1719- mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("Fields" ), 5 )
1720- mupdf .pdf_dict_del (acro , pymupdf .PDF_NAME ("CO" ))
1721- mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("CO" ), 5 )
1888+ acro .pdf_dict_del (pymupdf .PDF_NAME ("Fields" ))
1889+ acro .pdf_dict_put_array (pymupdf .PDF_NAME ("Fields" ), 5 )
1890+ acro .pdf_dict_del (pymupdf .PDF_NAME ("CO" ))
1891+ acro .pdf_dict_put_array (pymupdf .PDF_NAME ("CO" ), 5 )
17221892
17231893 # Enrich AcroForm for copying to target
17241894 acro_graft = mupdf .pdf_graft_mapped_object (graftmap , acro )
17251895
17261896 # Insert AcroForm into target PDF
17271897 acro_tar = mupdf .pdf_add_object (tarpdf , acro_graft )
1728- tar_fields = mupdf .pdf_dict_get (acro_tar , pymupdf .PDF_NAME ("Fields" ))
1729- tar_co = mupdf .pdf_dict_get (acro_tar , pymupdf .PDF_NAME ("CO" ))
1898+ acro_fields = acro_tar .pdf_dict_get (pymupdf .PDF_NAME ("Fields" ))
1899+ tar_co = acro_tar .pdf_dict_get (pymupdf .PDF_NAME ("CO" ))
17301900
17311901 # get its xref and insert it into target catalog
1732- tar_xref = mupdf .pdf_to_num (acro_tar )
1902+ tar_xref = acro_tar .pdf_to_num ()
17331903 acro_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
17341904 root = mupdf .pdf_dict_get (mupdf .pdf_trailer (tarpdf ), pymupdf .PDF_NAME ("Root" ))
1735- mupdf .pdf_dict_put (root , pymupdf .PDF_NAME ("AcroForm" ), acro_tar_ind )
1905+ root .pdf_dict_put (pymupdf .PDF_NAME ("AcroForm" ), acro_tar_ind )
17361906
17371907 if from_page <= to_page :
17381908 src_range = range (from_page , to_page + 1 )
17391909 else :
17401910 src_range = range (from_page , to_page - 1 , - 1 )
17411911
1742- for i in range (len (src_range )):
1743- # read first page that was copied over
1744- tar_page = tar [start_at + i ]
1745-
1746- # convert it to a formal PDF page
1747- tar_page_pdf = mupdf .pdf_page_from_fz_page (tar_page )
1912+ parents = {} # information about widget parents
17481913
1749- # extract its annotations array
1750- tar_annots = mupdf .pdf_dict_get (tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ))
1751- if not mupdf .pdf_is_array (tar_annots ):
1752- tar_annots = mupdf .pdf_dict_put_array (
1753- tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ), 5
1754- )
1914+ # remove "P" owning page reference from all widgets of all source pages
1915+ for i in src_range :
1916+ src_page = src [src_range [i ]]
1917+ for xref in [
1918+ xref
1919+ for xref , wtype , _ in src_page .annot_xrefs ()
1920+ if wtype == pymupdf .PDF_ANNOT_WIDGET # pylint: disable=no-member
1921+ ]:
1922+ w_obj = mupdf .pdf_load_object (srcpdf , xref )
1923+ w_obj .pdf_dict_del (pymupdf .PDF_NAME ("P" ))
1924+
1925+ # get the widget's parent structure
1926+ parent_xref , old_kids = kids_xrefs (w_obj )
1927+ if parent_xref :
1928+ parents [parent_xref ] = {
1929+ "new_xref" : 0 ,
1930+ "old_kids" : old_kids ,
1931+ "new_kids" : [],
1932+ }
1933+ # Copy over Parent widgets first - they are not page-dependent
1934+ for xref in parents .keys (): # pylint: disable=consider-using-dict-items
1935+ parent = mupdf .pdf_load_object (srcpdf , xref )
1936+ parent_graft = mupdf .pdf_graft_mapped_object (graftmap , parent )
1937+ parent_tar = mupdf .pdf_add_object (tarpdf , parent_graft )
1938+ kids_xrefs_new = get_kids (parent_tar , [])
1939+ parent_xref_new = parent_tar .pdf_to_num ()
1940+ parent_ind = mupdf .pdf_new_indirect (tarpdf , parent_xref_new , 0 )
1941+ acro_fields .pdf_array_push (parent_ind )
1942+ parents [xref ]["new_xref" ] = parent_xref_new
1943+ parents [xref ]["new_kids" ] = kids_xrefs_new
1944+
1945+ for i in src_range :
1946+ # read first copied over page in target
1947+ tar_page = tar [start_at + i ]
17551948
17561949 # read the original page in the source PDF
17571950 src_page = src [src_range [i ]]
@@ -1762,44 +1955,48 @@ def get_acroform(doc):
17621955 for xref , wtype , _ in src_page .annot_xrefs ()
17631956 if wtype == pymupdf .PDF_ANNOT_WIDGET # pylint: disable=no-member
17641957 ]
1958+ if not w_xrefs : # no widgets on this source page
1959+ continue
17651960
1766- # Remove page references from widgets to prevent duplicate copies
1767- # of the page in the target.
1768- for xref in w_xrefs :
1769- w_obj = mupdf .pdf_load_object (srcpdf , xref )
1770- mupdf .pdf_dict_del (w_obj , pymupdf .PDF_NAME ("P" ))
1961+ # convert to formal PDF page
1962+ tar_page_pdf = mupdf .pdf_page_from_fz_page (tar_page )
1963+
1964+ # extract annotations array
1965+ tar_annots = mupdf .pdf_dict_get (tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ))
1966+ if not mupdf .pdf_is_array (tar_annots ):
1967+ tar_annots = mupdf .pdf_dict_put_array (
1968+ tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ), 5
1969+ )
17711970
17721971 for xref in w_xrefs :
17731972 w_obj = mupdf .pdf_load_object (srcpdf , xref )
17741973
1775- # check if field is a member of inter-field validations
1776- temp = mupdf .pdf_dict_getp (w_obj , "AA/C" )
1777- if mupdf .pdf_is_dict (temp ):
1778- is_aac = True
1779- else :
1780- is_aac = False
1781-
1782- # recursively complete the widget object with all referenced objects
1783- w_obj_graft = mupdf .pdf_graft_mapped_object (graftmap , w_obj )
1784-
1785- # add the completed widget object to the target PDF
1786- w_obj_tar = mupdf .pdf_add_object (tarpdf , w_obj_graft )
1787-
1788- # extract its generated target xref number
1789- tar_xref = mupdf .pdf_to_num (w_obj_tar )
1974+ # check if field takes part in inter-field validations
1975+ is_aac = mupdf .pdf_is_dict (mupdf .pdf_dict_getp (w_obj , "AA/C" ))
17901976
1791- # create an indirect object from it
1792- w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1793-
1794- # insert this xref reference into the page,
1795- mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
1977+ # check if parent of widget already in target
1978+ parent_xref = mupdf .pdf_to_num (
1979+ w_obj .pdf_dict_get (pymupdf .PDF_NAME ("Parent" ))
1980+ )
1981+ if parent_xref == 0 : # parent not in target yet
1982+ w_obj_graft = mupdf .pdf_graft_mapped_object (graftmap , w_obj )
1983+ w_obj_tar = mupdf .pdf_add_object (tarpdf , w_obj_graft )
1984+ tar_xref = w_obj_tar .pdf_to_num ()
1985+ w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1986+ mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
1987+ mupdf .pdf_array_push (acro_fields , w_obj_tar_ind )
1988+ else :
1989+ parent = parents [parent_xref ]
1990+ idx = parent ["old_kids" ].index (xref ) # search for xref in parent
1991+ tar_xref = parent ["new_kids" ][idx ]
1992+ w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1993+ mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
17961994
1797- # and also into "AcroForm/Fields",
1798- mupdf .pdf_array_push (tar_fields , w_obj_tar_ind )
1799- # and also into "AcroForm/CO" if a computation field.
1995+ # Into "AcroForm/CO" if a computation field.
18001996 if is_aac :
18011997 mupdf .pdf_array_push (tar_co , w_obj_tar_ind )
18021998
1999+ deduplicate_names (tarpdf , acro_fields , join_duplicates = join_duplicates )
18032000
18042001def do_links (
18052002 doc1 : pymupdf .Document ,
0 commit comments