diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 87fe2261e..828e0bab5 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -13,6 +13,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [abyesilyurt](https://github.com/abyesilyurt) * [ArkieCoder](https://github.com/ArkieCoder) +* [Beers, PJ](https://github.com/PJBrs) * [Clauss, Christian](https://github.com/cclauss) * [DL6ER](https://github.com/DL6ER) * [Duy, Phan Thanh](https://github.com/zuypt) diff --git a/docs/user/forms.md b/docs/user/forms.md index e622b4138..c2517fefe 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -30,6 +30,7 @@ writer.update_page_form_field_values( writer.pages[0], {"fieldname": "some filled in text"}, auto_regenerate=False, + flatten=False, ) with open("filled-out.pdf", "wb") as output_stream: @@ -41,6 +42,12 @@ parameter is `True` by default for legacy compatibility, but this flags the PDF processor to recompute the field's rendering, and may trigger a "save changes" dialog for users who open the generated PDF. +If you want to flatten your form, that is, keep all form field contents while +removing the form fields themselves, you can set `flatten=True` to convert form +field contents to regular PDF content, and then use +`writer.remove_annotations(subtypes="/Widget")` to remove all form fields. This +will result in a flattened PDF.
+ ## Some notes about form fields and annotations PDF forms have a dual-nature approach to the fields: diff --git a/pypdf/_font.py b/pypdf/_font.py index 06f78ea77..5f8d98d2f 100644 --- a/pypdf/_font.py +++ b/pypdf/_font.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from pypdf.generic import DictionaryObject @@ -29,10 +28,16 @@ class FontDescriptor: character_widths: dict[str, int] = field(default_factory=dict) @classmethod - def from_font_resource(cls, pdf_font_dict: DictionaryObject) -> "Optional[FontDescriptor]": + def from_font_resource(cls, pdf_font_dict: DictionaryObject) -> "FontDescriptor": from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS # noqa: PLC0415 # Prioritize information from the PDF font dictionary - font_name = pdf_font_dict.get("/BaseFont", "Unknown") - if font_name[1:] in CORE_FONT_METRICS: - return CORE_FONT_METRICS.get(font_name[1:]) + font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/") + if font_name in CORE_FONT_METRICS: + return CORE_FONT_METRICS[font_name] return cls(name=font_name) + + def text_width(self, text: str) -> float: + """Sum of character widths specified in PDF font for the supplied text.""" + return sum( + [self.character_widths.get(char, self.character_widths.get("default", 0)) for char in text], 0.0 + ) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 43e9c1657..72b7889da 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -1,7 +1,9 @@ import re from typing import Any, Optional, Union, cast -from .._cmap import _default_fonts_space_width, build_char_map_from_dict +from .._cmap import build_char_map_from_dict +from .._codecs.core_fontmetrics import CORE_FONT_METRICS +from .._font import FontDescriptor from .._utils import logger_warning from ..constants import AnnotationDictionaryAttributes, FieldDictionaryAttributes from ..generic import ( @@ -25,16 +27,120 @@ class 
TextStreamAppearance(DecodedStreamObject): like font, font size, color, multiline text, and text selection highlighting. """ + def _scale_text( + self, + font_descriptor: FontDescriptor, + font_size: float, + field_width: float, + field_height: float, + txt: str, + is_multiline: bool, + min_font_size: float = 4.0, # Minimum font size to attempt + font_size_step: float = 0.2 # How much to decrease font size by each step + ) -> tuple[list[tuple[float, str]], float]: + """ + Takes a piece of text and scales it to field_width or field_height, given font_name + and font_size. For multiline fields, adds newlines to wrap the text. + + Args: + font_descriptor: A FontDescriptor for the font to be used. + font_size: The font size in points. + field_width: The width of the field in which to fit the text. + field_height: The height of the field in which to fit the text. + txt: The text to fit with the field. + is_multiline: Whether to scale and wrap the text, or only to scale. + min_font_size: The minimum font size at which to scale the text. + font_size_step: The amount by which to decrement font size per step while scaling. + + Returns: + The text in the form of list of tuples, each tuple containing the length of a line + and its contents, and the font_size for these lines and lengths. + """ + # Single line: + if not is_multiline: + test_width = font_descriptor.text_width(txt) * font_size / 1000 + if test_width > field_width or font_size > field_height: + new_font_size = font_size - font_size_step + if new_font_size >= min_font_size: + # Text overflows height; Retry with smaller font size. + return self._scale_text( + font_descriptor, + round(new_font_size, 1), + field_width, + field_height, + txt, + is_multiline, + min_font_size, + font_size_step + ) + # Font size lower than set minimum font size, give up. 
+ return [(test_width, txt)], font_size + return [(test_width, txt)], font_size + # Multiline: + orig_txt = txt + paragraphs = re.sub(r"\n", "\r", txt).split("\r") + wrapped_lines = [] + current_line_words: list[str] = [] + current_line_width: float = 0 + space_width = font_descriptor.text_width(" ") * font_size / 1000 + for paragraph in paragraphs: + if not paragraph.strip(): + wrapped_lines.append((0.0, "")) + continue + words = paragraph.split(" ") + for i, word in enumerate(words): + word_width = font_descriptor.text_width(word) * font_size / 1000 + test_width = current_line_width + word_width + (space_width if i else 0) + if test_width > field_width and current_line_words: + wrapped_lines.append((current_line_width, " ".join(current_line_words))) + current_line_words = [word] + current_line_width = word_width + elif not current_line_words and word_width > field_width: + wrapped_lines.append((word_width, word)) + current_line_words = [] + current_line_width = 0 + else: + if current_line_words: + current_line_width += space_width + current_line_words.append(word) + current_line_width += word_width + if current_line_words: + wrapped_lines.append((current_line_width, " ".join(current_line_words))) + current_line_words = [] + current_line_width = 0 + # Estimate total height. + # Assumed line spacing of 1.4 + estimated_total_height = font_size + (len(wrapped_lines) - 1) * 1.4 * font_size + if estimated_total_height > field_height: + new_font_size = font_size - font_size_step + if new_font_size >= min_font_size: + # Text overflows height; Retry with smaller font size. + return self._scale_text( + font_descriptor, + round(new_font_size, 1), + field_width, + field_height, + orig_txt, + is_multiline, + min_font_size, + font_size_step + ) + # Font size lower than set minimum font size, give up. 
+ return (wrapped_lines, font_size) + return (wrapped_lines, font_size) + def _generate_appearance_stream_data( self, text: str = "", selection: Optional[list[str]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_descriptor: FontDescriptor = CORE_FONT_METRICS["Helvetica"], font_glyph_byte_map: Optional[dict[str, bytes]] = None, font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", - is_multiline: bool = False + is_multiline: bool = False, + alignment: int = 0 ) -> bytes: """ Generates the raw bytes of the PDF appearance stream for a text field. @@ -56,6 +162,7 @@ def _generate_appearance_stream_data( font_color: The color to apply to the font, represented as a PDF graphics state string (e.g., "0 g" for black). is_multiline: A boolean indicating if the text field is multiline. + alignment: Left-aligned (0), centered (1) or right-aligned (2) text. Returns: A byte string containing the PDF content stream data. @@ -67,10 +174,27 @@ def _generate_appearance_stream_data( # If font_size is 0, apply the logic for multiline or large-as-possible font if font_size == 0: + if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems + is_multiline = False # with matching "selection" with "line" later on. 
if is_multiline: font_size = DEFAULT_FONT_SIZE_IN_MULTILINE else: font_size = rectangle.height - 2 + lines, font_size = self._scale_text( + font_descriptor, + font_size, + rectangle.width - 3, # One point margin left and right, and an additional point because the first + # offset takes one extra point (see below, "desired_abs_x_start") + rectangle.height - 3, # One point margin for top and bottom, one point extra for the first line + # (see y_offset) + text, + is_multiline, + ) + else: + lines = [( + font_descriptor.text_width(line) * font_size / 1000, + line + ) for line in text.replace("\n", "\r").split("\r")] # Set the vertical offset y_offset = rectangle.height - 1 - font_size @@ -80,19 +204,42 @@ def _generate_appearance_stream_data( f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " f"re\nW\nBT\n{default_appearance}\n" ).encode() + current_x_pos: float = 0 # Initial virtual position within the text object. - for line_number, line in enumerate(text.replace("\n", "\r").split("\r")): + for line_number, (line_width, line) in enumerate(lines): if selection and line in selection: # Might be improved, but cannot find how to get fill working => replaced with lined box ap_stream += ( f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n" f"0.5 0.5 0.5 rg s\n{default_appearance}\n" ).encode() + + # Calculate the desired absolute starting X for the current line + desired_abs_x_start: float = 0 + if alignment == 2: # Right aligned + desired_abs_x_start = rectangle.width - 2 - line_width + elif alignment == 1: # Centered + desired_abs_x_start = (rectangle.width - line_width) / 2 + else: # Left aligned; default + desired_abs_x_start = 2 + # Calculate x_rel_offset: how much to move from the current_x_pos + # to reach the desired_abs_x_start. 
+ x_rel_offset = desired_abs_x_start - current_x_pos + + # Y-offset: + y_rel_offset: float = 0 if line_number == 0: - ap_stream += f"2 {y_offset} Td\n".encode() + y_rel_offset = y_offset # Initial vertical position else: - # Td is a relative translation - ap_stream += f"0 {-font_size * 1.4} Td\n".encode() + y_rel_offset = - font_size * 1.4 # Move down by line height + + # Td is a relative translation (Tx and Ty). + # It updates the current text position. + ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode() + # Update current_x_pos based on the Td operation for the next iteration. + # This is the X position where the *current line* will start. + current_x_pos = desired_abs_x_start + encoded_line: list[bytes] = [ font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line ] @@ -112,7 +259,8 @@ def __init__( font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", - is_multiline: bool = False + is_multiline: bool = False, + alignment: int = 0 ) -> None: """ Initializes a TextStreamAppearance object. @@ -131,6 +279,7 @@ def __init__( font_size: The font size. If 0, it's auto-calculated. font_color: The font color string. is_multiline: A boolean indicating if the text field is multiline. + alignment: Left-aligned (0), centered (1) or right-aligned (2) text. 
""" super().__init__() @@ -138,36 +287,49 @@ def __init__( # If a font resource was added, get the font character map if font_resource: font_resource = cast(DictionaryObject, font_resource.get_object()) - _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_resource - ) - try: # remove width stored in -1 key - del font_map[-1] - except KeyError: - pass - font_glyph_byte_map: dict[str, bytes] - if isinstance(font_encoding, str): - font_glyph_byte_map = { - v: k.encode(font_encoding) for k, v in font_map.items() - } - else: - font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()} - font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - for key, value in font_map.items(): - font_glyph_byte_map[value] = font_encoding_rev.get(key, key) + font_descriptor = FontDescriptor.from_font_resource(font_resource) + else: + logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__) + font_name = "/Helv" + font_resource = DictionaryObject({ + NameObject("/Subtype"): NameObject("/Type1"), + NameObject("/Name"): NameObject("/Helv"), + NameObject("/Type"): NameObject("/Font"), + NameObject("/BaseFont"): NameObject("/Helvetica"), + NameObject("/Encoding"): NameObject("/WinAnsiEncoding") + }) + font_descriptor = CORE_FONT_METRICS["Helvetica"] + + # Get the font glyph data + _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( + 200, font_resource + ) + try: # remove width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_glyph_byte_map: dict[str, bytes] + if isinstance(font_encoding, str): + font_glyph_byte_map = { + v: k.encode(font_encoding) for k, v in font_map.items() + } else: - logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_glyph_byte_map = {} + font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for key, value in 
font_map.items(): + font_glyph_byte_map[value] = font_encoding_rev.get(key, key) ap_stream_data = self._generate_appearance_stream_data( text, selection, rectangle, + font_descriptor, font_glyph_byte_map, font_name, font_size, font_color, - is_multiline + is_multiline, + alignment ) self[NameObject("/Type")] = NameObject("/XObject") @@ -175,13 +337,12 @@ def __init__( self[NameObject("/BBox")] = RectangleObject(rectangle) self.set_data(ByteStringObject(ap_stream_data)) self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) - # Update Resources with font information if necessary - if font_resource is not None: - self[NameObject("/Resources")] = DictionaryObject({ - NameObject("/Font"): DictionaryObject({ - NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) - }) + # Update Resources with font information + self[NameObject("/Resources")] = DictionaryObject({ + NameObject("/Font"): DictionaryObject({ + NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) }) + }) @classmethod def from_text_annotation( @@ -260,8 +421,8 @@ def from_text_annotation( ).get_object(), ) document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object() - # _default_fonts_space_width keys is the list of Standard fonts - if font_name not in document_font_resources and font_name not in _default_fonts_space_width: + # CORE_FONT_METRICS is the dict with Standard font metrics + if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS: # ...or AcroForm dictionary document_resources = cast( dict[Any, Any], @@ -275,6 +436,7 @@ def from_text_annotation( # Retrieve field text, selected values and formatting information is_multiline = False field_flags = field.get(FieldDictionaryAttributes.Ff, 0) + alignment = field.get("/Q", 0) if field_flags & FieldDictionaryAttributes.FfBits.Multiline: is_multiline = True if ( @@ -301,7 +463,8 @@ def from_text_annotation( 
font_name, font_size, font_color, - is_multiline + is_multiline, + alignment ) if AnnotationDictionaryAttributes.AP in annotation: for key, value in ( diff --git a/tests/test_appearance_stream.py b/tests/test_appearance_stream.py new file mode 100644 index 000000000..36ebe6c13 --- /dev/null +++ b/tests/test_appearance_stream.py @@ -0,0 +1,62 @@ +"""Test the pypdf.generic._appearance_stream module.""" + +from pypdf.generic._appearance_stream import TextStreamAppearance + + +def test_scale_text(): + rectangle = (0, 0, 9.1, 55.4) + font_size = 10.1 + text = "Hello World" + is_multiline = False + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (str(font_size) + r" Tf").encode() in appearance_stream.get_data() + text = "This is a very very long sentence that probably will scale below the minimum font size" + font_size = 0.0 + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (b"4.0 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 160, 360) + font_size = 0.0 + text = """Welcome to pypdf +pypdf is a free and open source pure-python PDF library capable of splitting, merging, cropping, and +transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF +files. pypdf can retrieve text and metadata from PDFs as well. + +See pdfly for a CLI application that uses pypdf to interact with PDFs. 
+ """ + is_multiline = True + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (b"12 Tf") in appearance_stream.get_data() + assert b"pypdf is a free and open" in appearance_stream.get_data() + rectangle = (0, 0, 160, 160) + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (b"8.8 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 160, 12) + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + text = """Option A +Option B +Option C +Option D +""" + selection = "Option A" + assert (b"4.0 Tf") in appearance_stream.get_data() + text = "pneumonoultramicroscopicsilicovolcanoconiosis" + appearance_stream = TextStreamAppearance( + text, selection, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (b"7.2 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 10, 100) + text = "OneWord" + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, is_multiline=is_multiline + ) + assert (b"OneWord") in appearance_stream.get_data() diff --git a/tests/test_writer.py b/tests/test_writer.py index c76f76a44..be4c7d9c5 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2427,7 +2427,7 @@ def test_no_resource_for_14_std_fonts(caplog): writer.update_page_form_field_values( p, {a["/T"]: "Brooks"}, auto_regenerate=False ) - assert "Font dictionary for /Helvetica not found." in caplog.text + assert "Font dictionary for /Helvetica not found; defaulting to Helvetica." 
in caplog.text @pytest.mark.enable_socket @@ -2439,7 +2439,7 @@ def test_field_box_upside_down(): writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"}) assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == ( b"q\n/Tx BMC \nq\n1 1 105.29520000000001 10.835000000000036 re\n" - b"W\nBT\n/Arial 8.0 Tf 0 g\n2 2.8350000000000364 Td\n(0) Tj\nET\n" + b"W\nBT\n/Helv 8.0 Tf 0 g\n2 2.8350000000000364 Td\n(0) Tj\nET\n" b"Q\nEMC\nQ\n" ) box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"]