diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..a0a0ed8 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 6024e4a..ffbce06 100644 --- a/.gitignore +++ b/.gitignore @@ -126,3 +126,5 @@ dmypy.json *.pdf *.html chromedriver +requirements.txt +main.py \ No newline at end of file diff --git a/LICENSE b/LICENSE index b80561b..81f990d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Maksim +Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 354c6ff..3ac48bf 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,80 @@ -# python-selenium-chrome-html-to-pdf-converter +# pyhtml2pdf Simple python wrapper to convert HTML to PDF with 
headless Chrome via selenium. -## Installation -Clone repository, move to project root dir, install virtualenv, install dependencies: +## Install ``` -git clone https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter.git -cd python-selenium-chrome-html-to-pdf-converter -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt +pip install pyhtml2pdf ``` -Install chrome (chromium) browser. -Download chromedriver from http://chromedriver.chromium.org/ and put it to project root directory. +## Dependencies + + - [Selenium Chrome Webdriver](https://chromedriver.chromium.org/downloads) (If Chrome is installed on the machine you won't need to install the chrome driver) + - [Ghostscript](https://www.ghostscript.com/download.html) + +## Example + +### **Convert to PDF** + +**Use with website url** -## Demo ``` -cd examples -python converter.py https://google.com google.pdf +from pyhtml2pdf import converter + +converter.convert('https://pypi.org', 'sample.pdf') ``` -## Why use selenium? -TODO: Add description +**Use with html file from local machine** -## CSS recomendations +``` +import os +from pyhtml2pdf import converter -Basic configuration for single page: +path = os.path.abspath('index.html') +converter.convert(f'file:///{path}', 'sample.pdf') ``` -@page { - size: A4; - margin: 0mm; -} + +**Some JS objects may have animations or take some time to render. You can set a timeout (in seconds) to give those objects time to render.** + +``` +converter.convert(source, target, timeout=2) ``` -For printing double-sided documents use +**Compress the converted PDF** + +Some PDFs may be oversized, so there is a built-in PDF compression feature. 
+ +The compression power can be one of: + - 0: default + - 1: prepress + - 2: printer + - 3: ebook + - 4: screen + ``` -@page :left { - margin-left: 4cm; - margin-right: 2cm; -} - -@page :right { - margin-left: 4cm; - margin-right: 2cm; -} - -@page :first { - margin-top: 10cm /* Top margin on first page 10cm */ -} +converter.convert(source, target, compress=True, power=0) ``` -Control pagination with page-break-before, page-break-after, page-break-inside like +### **Pass Print Options** + +You can use print options mentioned [here](https://vanilla.aslushnikov.com/?Page.printToPDF) + ``` -h1 { page-break-before : right } -h2 { page-break-after : avoid } -table { page-break-inside : avoid } +converter.convert( f"file:///{path}", f"sample.pdf", print_options={"scale": 0.95} ) ``` -Control widows and оrphans like + +### **Compress PDF** + +**Use it to compress a PDF file from local machine** + ``` -@page { - orphans:4; - widows:2; -} +import os +from pyhtml2pdf import compressor + +compressor.compress('sample.pdf', 'compressed_sample.pdf') ``` -More descriptions see at https://www.tutorialspoint.com/css/css_paged_media.htm + +Inspired by the works from: + + - https://github.com/maxvst/python-selenium-chrome-html-to-pdf-converter.git + - https://github.com/theeko74/pdfc + diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..625a682 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,51 @@ +# Security Policy + +## Supported Versions + +We support security fixes for the latest released version and the `master` branch. + +| Version | Supported | +| ------- | --------- | +| Latest | ✅ | +| Older | ❌ | + +## Reporting a Vulnerability + +If you believe you’ve found a security vulnerability, **please do not open a public GitHub issue**. 
+ +Instead, report it privately using one of the following: + +### Preferred: GitHub Private Vulnerability Reporting +- Go to: **Security** → **Advisories** → **Report a vulnerability** +- Provide as much detail as possible (see “What to include” below). + +### Alternative: Email +- Email: **mklmfernando@gmail.com** + +## What to Include + +Please include: +- A clear description of the issue and potential impact +- Steps to reproduce (proof-of-concept if available) +- Affected versions/branches +- Any suggested fix or mitigation (if you have one) + +## Response Timeline + +We aim to: +- Acknowledge receipt within **3 business days** +- Provide a status update within **7 business days** +- Release a fix as soon as practical based on severity and complexity + +## Coordinated Disclosure + +We follow coordinated disclosure practices. Please allow reasonable time to investigate and remediate before any public disclosure. + +## Security Updates + +Security fixes may be released as: +- Patch releases +- Advisory notes (GitHub Security Advisory) +- Changelog entries (when appropriate) + +Thank you for helping keep this project and its users safe. 
diff --git a/examples/converter.py b/examples/converter.py deleted file mode 100644 index d02b407..0000000 --- a/examples/converter.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -sys.path.append('..') - -from sample.html_to_pdf_converter import get_pdf_from_html - -if len(sys.argv) != 3: - print ("usage: converter.py ") - exit() - -result = get_pdf_from_html(sys.argv[1], chromedriver='../chromedriver') -with open(sys.argv[2], 'wb') as file: - file.write(result) diff --git a/pyhtml2pdf/__init__.py b/pyhtml2pdf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyhtml2pdf/compressor.py b/pyhtml2pdf/compressor.py new file mode 100644 index 0000000..682cf21 --- /dev/null +++ b/pyhtml2pdf/compressor.py @@ -0,0 +1,131 @@ +import logging +import os +import platform +import subprocess +from pathlib import Path +from tempfile import NamedTemporaryFile, _TemporaryFileWrapper +from typing import Literal, Union + +from .utils import _pdf_has_suspicious_content + +MAX_BYTES = 25 * 1024 * 1024 + +logger = logging.getLogger(__name__) + + +def compress( + source: str | os.PathLike | _TemporaryFileWrapper, + target: str | os.PathLike, + power: int = 0, + ghostscript_command: Union[Literal["gs", "gswin64c", "gswin32c"], None] = None, + max_pdf_size: int = MAX_BYTES, + timeout: int = 10, + force_process: bool = False, +) -> None: + """ + + :param source: Source PDF file + :param target: Target location to save the compressed PDF + :param power: Power of the compression. Default value is 0. This can be + 0: default, + 1: prepress, + 2: printer, + 3: ebook, + 4: screen + :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted + to be inferred from the OS. + If the OS is not Windows, "gs" is used as executable name. + If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit + version, "gswin32c" is used. + :param max_pdf_size: Maximum allowed size for the PDF in bytes. 
Default is 25 MB. + :param timeout: Timeout in seconds + :param force_process: Whether to process even if suspicious content is found (Be extra careful with this setting). + """ + quality = {0: "/default", 1: "/prepress", 2: "/printer", 3: "/ebook", 4: "/screen"} + + if ghostscript_command is None: + if platform.system() == "Windows": + if platform.machine().endswith("64"): + ghostscript_command = "gswin64c" + else: + ghostscript_command = "gswin32c" + else: + ghostscript_command = "gs" + + if isinstance(source, _TemporaryFileWrapper): + source = source.name + + source = Path(source) + target = Path(target) + + if not source.is_file(): + raise FileNotFoundError("Source file does not exist") + + if source.suffix.lower() != ".pdf": + raise ValueError("Source file is not a PDF") + + issues = _pdf_has_suspicious_content(source, max_pdf_size) + + if issues: + logger.warning( + "Warning: The PDF file has been flagged for suspicious content.\n\n- %s\n\nProcessing has been skipped to avoid potential security risks.\n\n" + "If you believe this is an error, you can set force_process=True to override this behavior. Proceed with caution!\n", + "\n- ".join(issues), + ) + + if not force_process: + logger.error( + "PDF file flagged for suspicious content. Process aborted.\n\n" + ) + raise RuntimeError( + "PDF file flagged for suspicious content. Process aborted." + ) + + try: + subprocess.call( + [ + ghostscript_command, + "-dSAFER", + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS={}".format(quality[power]), + "-dNOPAUSE", + "-dQUIET", + "-dBATCH", + "-sOutputFile={}".format(target.as_posix()), + source.as_posix(), + ], + shell=platform.system() == "Windows", + timeout=timeout, + ) + except subprocess.TimeoutExpired: + logger.error( + "PDF processing took too long (DoS protection triggered). If you believe this is an error, try increasing the timeout parameter." 
+ ) + + raise TimeoutError + + +def _compress( + result: bytes, + target: str | os.PathLike, + power: int, + timeout: int, + ghostscript_command: Union[Literal["gs", "gswin64c", "gswin32c"], None] = None, +): + with NamedTemporaryFile( + suffix=".pdf", delete=platform.system() != "Windows" + ) as tmp_file: + tmp_file.write(result) + + # Ensure minimum timeout of 20 seconds for compression when call from converter.py + _timeout: int = max(timeout, 20) + + compress( + source=tmp_file, + target=target, + power=power, + ghostscript_command=ghostscript_command, + max_pdf_size=Path(tmp_file.name).stat().st_size + 1_000_000, + timeout=_timeout, + ) diff --git a/pyhtml2pdf/converter.py b/pyhtml2pdf/converter.py new file mode 100644 index 0000000..66a0841 --- /dev/null +++ b/pyhtml2pdf/converter.py @@ -0,0 +1,143 @@ +import base64 +import io +import json +from typing import Literal, TypedDict, Union + +from selenium import webdriver +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support.expected_conditions import staleness_of +from selenium.webdriver.support.ui import WebDriverWait +from webdriver_manager.chrome import ChromeDriverManager + +from .compressor import _compress + + +class PrintOptions(TypedDict): + landscape: bool + displayHeaderFooter: bool + printBackground: bool + scale: float + paperWidth: float + paperHeight: float + marginTop: float + marginBottom: float + marginLeft: float + marginRight: float + pageRanges: str + ignoreInvalidPageRanges: bool + preferCSSPageSize: bool + + +def convert( + source: Union[str, io.BytesIO], + target: Union[str, io.BytesIO], + timeout: int = 2, + compress: bool = False, + power: int = 0, + install_driver: bool = True, + print_options: PrintOptions = {}, + ghostscript_command: Union[Literal["gs", "gswin64c", "gswin32c"], None] = None, +): + 
""" + Convert a given html file or website into PDF + + :param str source: source html file or website link or html content or a BytesIO object + :param str | BytesIO target: target location to save the PDF, can be a path or a BytesIO object + :param int timeout: timeout in seconds. Default value is set to 2 seconds + :param bool compress: whether PDF is compressed or not. Default value is False + :param int power: power of the compression. Default value is 0. This can be 0: default, 1: prepress, 2: printer, 3: ebook, 4: screen + :param bool install_driver: whether or not to install using ChromeDriverManager. Default value is True + :param PrintOptions print_options: A dictionary containing options for the printing of the PDF, conforming to the types specified in the PrintOptions TypedDict. + :param ghostscript_command: The name of the ghostscript executable. If set to the default value None, is attempted + to be inferred from the OS. + If the OS is not Windows, "gs" is used as executable name. + If the OS is Windows, and it is a 64-bit version, "gswin64c" is used. If it is a 32-bit + version, "gswin32c" is used. 
+ """ + if print_options is None: + print_options = {} + + result = __get_pdf_from_html(source, timeout, install_driver, print_options) + + if compress: + _compress(result, target, power, timeout, ghostscript_command) + else: + if isinstance(target, io.BytesIO): + return target.write(result) + with open(target, "wb") as file: + file.write(result) + + +def __send_devtools(driver, cmd, params=None): + if params is None: + params = {} + resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id + if hasattr(driver.command_executor, "_client_config"): + remote_url = driver.command_executor._client_config.remote_server_addr + else: + # Old internal API + remote_url = driver.command_executor._url + url = remote_url + resource + body = json.dumps({"cmd": cmd, "params": params}) + response = driver.command_executor._request("POST", url, body) + + if not response: + raise Exception(response.get("value")) + + return response.get("value") + + +def __get_pdf_from_html( + source: Union[str, io.BytesIO], + timeout: int, + install_driver: bool, + print_options: dict, +) -> bytes: + webdriver_options = Options() + webdriver_prefs = {} + + webdriver_options.add_argument("--headless") + webdriver_options.add_argument("--disable-gpu") + webdriver_options.add_argument("--no-sandbox") + webdriver_options.add_argument("--disable-dev-shm-usage") + webdriver_options.experimental_options["prefs"] = webdriver_prefs + + webdriver_prefs["profile.default_content_settings"] = {"images": 2} + + if install_driver: + service = Service(ChromeDriverManager().install()) + driver = webdriver.Chrome(service=service, options=webdriver_options) + else: + driver = webdriver.Chrome(options=webdriver_options) + + # Detect the type of source and create data url if needed + if isinstance(source, io.BytesIO): + encoded_content = base64.b64encode(source.getvalue()).decode("utf-8") + path = f"data:text/html;base64,{encoded_content}" + if not source.startswith("http") and not 
source.startswith("file"): + encoded_content = base64.b64encode(source.encode("utf-8")).decode("utf-8") + path = f"data:text/html;base64,{encoded_content}" + else: + path = source + + driver.get(path) + + try: + WebDriverWait(driver, timeout).until( + staleness_of(driver.find_element(by=By.TAG_NAME, value="html")) + ) + except TimeoutException: + calculated_print_options = { + "landscape": False, + "displayHeaderFooter": False, + "printBackground": True, + "preferCSSPageSize": True, + } + calculated_print_options.update(print_options) + result = __send_devtools(driver, "Page.printToPDF", calculated_print_options) + return base64.b64decode(result["data"]) + finally: + driver.quit() diff --git a/pyhtml2pdf/utils.py b/pyhtml2pdf/utils.py new file mode 100644 index 0000000..6131546 --- /dev/null +++ b/pyhtml2pdf/utils.py @@ -0,0 +1,126 @@ +from pathlib import Path + +import pikepdf + + +def __has_javascript(pdf: pikepdf.Pdf) -> bool: + root = pdf.Root + + # OpenAction + if "/OpenAction" in root: + return True + + # JavaScript name tree + names = root.get("/Names") + if names and "/JavaScript" in names: + return True + + # Page-level actions + for page in pdf.pages: + if "/AA" in page.obj: + return True + + return False + + +# def __has_launch_action(pdf: pikepdf.Pdf) -> bool: +# def walk(obj): +# print(obj) + +# if isinstance(obj, pikepdf.Dictionary): +# if obj.get("/S") == "/Launch": +# return True + +# for _, v in obj.items(): +# if walk(v): +# return True + +# elif isinstance(obj, (list, tuple)): +# return any(walk(v) for v in obj) + +# return False + +# return walk(pdf.Root) + + +def __has_embedded_files(pdf: pikepdf.Pdf) -> bool: + root = pdf.Root + names = root.get("/Names") + if not names: + return False + + ef_tree = names.get("/EmbeddedFiles") + return ef_tree is not None + + +def __has_rich_media(pdf: pikepdf.Pdf) -> bool: + for page in pdf.pages: + annots = page.obj.get("/Annots", []) + for annot in annots: + if annot.get("/Subtype") == "/RichMedia": + 
return True + return False + + +def __has_xfa(pdf: pikepdf.Pdf) -> bool: + root = pdf.Root + acroform = root.get("/AcroForm") + if not acroform: + return False + return "/XFA" in acroform + + +def __has_file_size_exceeded(path: Path, max_pdf_size: int) -> bool: + size = path.stat().st_size + if size <= 0 or size > max_pdf_size: + return True + + return True + + +def __has_invalid_header(path: Path) -> bool: + with path.open("rb") as f: + if f.read(5) != b"%PDF-": + return True + + return False + + +def __has_no_pages(pdf: pikepdf.Pdf) -> bool: + return len(pdf.pages) <= 0 + + +def _pdf_has_suspicious_content(path: str | Path, max_pdf_size: int) -> list[str]: + path = Path(path) + + findings = [] + + if __has_file_size_exceeded(path, max_pdf_size): + findings.append( + "File size exceeded - adjust max_pdf_size parameter to allow larger files" + ) + + if __has_invalid_header(path): + findings.append("Invalid PDF header, could be corrupted or not a PDF") + return findings + + with pikepdf.open(path) as pdf: + if __has_no_pages(pdf): + findings.append("No pages in PDF") + + if __has_javascript(pdf): + findings.append("JavaScript detected") + + # if __has_launch_action(pdf): + # findings.append("Launch action detected") + + if __has_embedded_files(pdf): + findings.append("Embedded files detected") + + if __has_rich_media(pdf): + findings.append("Rich media detected") + + if __has_xfa(pdf): + findings.append("XFA forms detected") + + return findings diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4e978e0..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -selenium==3.141.0 - diff --git a/sample/html_to_pdf_converter.py b/sample/html_to_pdf_converter.py deleted file mode 100644 index c314f5e..0000000 --- a/sample/html_to_pdf_converter.py +++ /dev/null @@ -1,36 +0,0 @@ -import sys -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -import json, base64 - -def send_devtools(driver, cmd, 
import setuptools

# Read the README explicitly as UTF-8 so the build does not depend on the
# platform's default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="pyhtml2pdf",
    version="0.1.0",
    author="Kumara Fernando",
    author_email="mklmfernando@gmail.com",
    description="Simple python wrapper to convert HTML to PDF with headless Chrome via selenium.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/kumaF/pyhtml2pdf",
    packages=setuptools.find_packages(),
    # Runtime dependencies imported by the pyhtml2pdf modules.
    install_requires=[
        "selenium",
        "webdriver-manager",
        "pikepdf",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
)