From 3e867f11bc6b5de4a79ae73a34918e6eef18f6c4 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 30 May 2025 15:52:14 +0100 Subject: [PATCH 1/8] Added fpdf --- Aptfile | 1 - requirements.txt | 2 +- src/worker/email.py | 21 ++++++++++++---- tests/test_email.py | 58 ++++++++++++++++++++++----------------------- 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/Aptfile b/Aptfile index 68c58524..06a52a7d 100644 --- a/Aptfile +++ b/Aptfile @@ -1,3 +1,2 @@ libjpeg62 libc6 -https://mirrors.edge.kernel.org/ubuntu/pool/main/o/openssl1.1/libssl1.1_1.1.1f-1ubuntu2_amd64.deb diff --git a/requirements.txt b/requirements.txt index 98f436e1..5d3f0be4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ MarkupSafe==2.1.2 phonenumbers==8.13.6 pydantic[email]==1.9.2 python-multipart==0.0.6 -python-pdf==0.39 +fpdf2>=3.0.0 requests==2.28.2 starlette==0.14.2 sentry-sdk==1.16.0 diff --git a/src/worker/email.py b/src/worker/email.py index 432a6f70..f8a06432 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -14,7 +14,7 @@ from httpx import ConnectError, ReadTimeout from itertools import chain from pathlib import Path -from pydf import generate_pdf +from fpdf import FPDF from typing import List, Optional from src.ext import ApiError @@ -36,6 +36,20 @@ email_retrying = [5, 10, 60, 600, 1800, 3600, 12 * 3600] +def generate_pdf_from_html(html: str, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm') -> bytes: + pdf = FPDF(orientation='P', unit='mm', format=page_size.upper()) + pdf.add_page() + + left_margin = float(margin_left.replace('mm', '')) + right_margin = float(margin_right.replace('mm', '')) + + pdf.set_left_margin(left_margin) + pdf.set_right_margin(right_margin) + pdf.write_html(html) + + return pdf.output() + + def utcnow(): return datetime.utcnow().replace(tzinfo=timezone.utc) @@ -201,12 +215,11 @@ async def _render_email(self, context, headers) -> Optional[EmailInfo]: await 
self._store_email_failed(MessageStatus.render_failed, f'Error rendering email: {e}') async def _generate_base64_pdf(self, pdf_attachments): - kwargs = dict(page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm') for a in pdf_attachments: if a.html: try: - pdf_content = generate_pdf(a.html, **kwargs) - except RuntimeError as e: + pdf_content = generate_pdf_from_html(a.html, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm') + except Exception as e: main_logger.warning('error generating pdf, data: %s', e) else: yield dict(type='application/pdf', name=a.name, content=base64.b64encode(pdf_content).decode()) diff --git a/tests/test_email.py b/tests/test_email.py index 7b0de2bc..e0d86699 100644 --- a/tests/test_email.py +++ b/tests/test_email.py @@ -475,26 +475,26 @@ def test_invalid_mustache_body(send_email, sync_db: SyncDb): assert m['body'] == 'Error rendering email: unclosed tag at line 1' -# def test_send_with_pdf(send_email, tmpdir, sync_db: SyncDb): -# message_id = send_email( -# recipients=[ -# { -# 'address': 'foobar@testing.com', -# 'pdf_attachments': [ -# {'name': 'testing.pdf', 'html': '

testing

', 'id': 123}, -# {'name': 'different.pdf', 'html': '

different

'}, -# ], -# } -# ] -# ) -# assert len(tmpdir.listdir()) == 1 -# msg_file = tmpdir.join(f'{message_id}.txt').read() -# assert 'testing.pdf' in msg_file -# -# attachments = sync_db.fetchrow_b('select * from messages where :where', where=V('external_id') == message_id)[ -# 'attachments' -# ] -# assert set(attachments) == {'123::testing.pdf', '::different.pdf'} +def test_send_with_pdf(send_email, tmpdir, sync_db: SyncDb): + message_id = send_email( + recipients=[ + { + 'address': 'foobar@testing.com', + 'pdf_attachments': [ + {'name': 'testing.pdf', 'html': '

testing

', 'id': 123}, + {'name': 'different.pdf', 'html': '

different

'}, + ], + } + ] + ) + assert len(tmpdir.listdir()) == 1 + msg_file = tmpdir.join(f'{message_id}.txt').read() + assert 'testing.pdf' in msg_file + + attachments = sync_db.fetchrow_b('select * from messages where :where', where=V('external_id') == message_id)[ + 'attachments' + ] + assert set(attachments) == {'123::testing.pdf', '::different.pdf'} def test_send_with_other_attachment(send_email, tmpdir, sync_db: SyncDb): @@ -541,15 +541,15 @@ def test_send_with_other_attachment_pdf(send_email, tmpdir, sync_db: SyncDb): assert set(attachments) == {'::test_pdf.pdf', '::test_pdf_encoded.pdf'} -# def test_pdf_not_unicode(send_email, tmpdir, cli): -# message_id = send_email( -# recipients=[ -# {'address': 'foobar@testing.com', 'pdf_attachments': [{'name': 'testing.pdf', 'html': '

binary

'}]} -# ] -# ) -# assert len(tmpdir.listdir()) == 1 -# msg_file = tmpdir.join(f'{message_id}.txt').read() -# assert 'testing.pdf' in msg_file +def test_pdf_not_unicode(send_email, tmpdir, cli): + message_id = send_email( + recipients=[ + {'address': 'foobar@testing.com', 'pdf_attachments': [{'name': 'testing.pdf', 'html': '

binary

'}]} + ] + ) + assert len(tmpdir.listdir()) == 1 + msg_file = tmpdir.join(f'{message_id}.txt').read() + assert 'testing.pdf' in msg_file def test_pdf_empty(send_email, tmpdir, dummy_server): From 55b65ae0774b0386abce05f31ad285fbe6d9b831 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 30 May 2025 15:52:28 +0100 Subject: [PATCH 2/8] Added formatting --- src/worker/email.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/worker/email.py b/src/worker/email.py index f8a06432..e4511ed2 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -10,11 +10,11 @@ from concurrent.futures import TimeoutError from datetime import datetime, timezone from foxglove import glove +from fpdf import FPDF from httpcore import ReadTimeout as HttpReadTimeout from httpx import ConnectError, ReadTimeout from itertools import chain from pathlib import Path -from fpdf import FPDF from typing import List, Optional from src.ext import ApiError @@ -218,7 +218,9 @@ async def _generate_base64_pdf(self, pdf_attachments): for a in pdf_attachments: if a.html: try: - pdf_content = generate_pdf_from_html(a.html, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm') + pdf_content = generate_pdf_from_html( + a.html, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm' + ) except Exception as e: main_logger.warning('error generating pdf, data: %s', e) else: From 3e5f697adcd86f6c067a3a7127d282ad11a6c5ad Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 30 May 2025 16:12:21 +0100 Subject: [PATCH 3/8] Added requirements changes --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5d3f0be4..3e3fe152 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ MarkupSafe==2.1.2 phonenumbers==8.13.6 pydantic[email]==1.9.2 python-multipart==0.0.6 -fpdf2>=3.0.0 +fpdf2==2.8.3 requests==2.28.2 starlette==0.14.2 sentry-sdk==1.16.0 From 
d17e5d9d465ffae3c7c6df1fdc8d4d8d19c3ec52 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 2 Jun 2025 15:31:08 +0100 Subject: [PATCH 4/8] Removed fpdf2 and used Weasyprint instead --- requirements.txt | 2 +- src/worker/email.py | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3e3fe152..311c61c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ MarkupSafe==2.1.2 phonenumbers==8.13.6 pydantic[email]==1.9.2 python-multipart==0.0.6 -fpdf2==2.8.3 +weasyprint==65.1 requests==2.28.2 starlette==0.14.2 sentry-sdk==1.16.0 diff --git a/src/worker/email.py b/src/worker/email.py index e4511ed2..3d79b49b 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -10,12 +10,12 @@ from concurrent.futures import TimeoutError from datetime import datetime, timezone from foxglove import glove -from fpdf import FPDF from httpcore import ReadTimeout as HttpReadTimeout from httpx import ConnectError, ReadTimeout from itertools import chain from pathlib import Path from typing import List, Optional +from weasyprint import HTML from src.ext import ApiError from src.render import EmailInfo, MessageDef, render_email @@ -36,19 +36,23 @@ email_retrying = [5, 10, 60, 600, 1800, 3600, 12 * 3600] -def generate_pdf_from_html(html: str, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm') -> bytes: - pdf = FPDF(orientation='P', unit='mm', format=page_size.upper()) - pdf.add_page() - - left_margin = float(margin_left.replace('mm', '')) - right_margin = float(margin_right.replace('mm', '')) - - pdf.set_left_margin(left_margin) - pdf.set_right_margin(right_margin) - pdf.write_html(html) - - return pdf.output() +def generate_pdf_from_html(html: str, page_size: str = 'A4', zoom: str = '1.0', margin_left: str = '10mm', margin_right: str = '10mm') -> bytes: + from weasyprint import CSS + page_css = f""" + @page {{ + size: {page_size}; + margin-left: {margin_left}; + 
margin-right: {margin_right}; + }} + body {{ + zoom: {zoom}; + }} + """ + html_doc = HTML(string=html) + css_doc = CSS(string=page_css) + pdf_bytes = html_doc.write_pdf(stylesheets=[css_doc]) + return pdf_bytes def utcnow(): return datetime.utcnow().replace(tzinfo=timezone.utc) From 9e11c9b593bbfb2ac0272a7626fa47533bd5f0ba Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 2 Jun 2025 15:37:23 +0100 Subject: [PATCH 5/8] Test removing CSS --- src/worker/email.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/worker/email.py b/src/worker/email.py index 3d79b49b..8b9c83fc 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -39,19 +39,8 @@ def generate_pdf_from_html(html: str, page_size: str = 'A4', zoom: str = '1.0', margin_left: str = '10mm', margin_right: str = '10mm') -> bytes: from weasyprint import CSS - page_css = f""" - @page {{ - size: {page_size}; - margin-left: {margin_left}; - margin-right: {margin_right}; - }} - body {{ - zoom: {zoom}; - }} - """ html_doc = HTML(string=html) - css_doc = CSS(string=page_css) - pdf_bytes = html_doc.write_pdf(stylesheets=[css_doc]) + pdf_bytes = html_doc.write_pdf() return pdf_bytes def utcnow(): From bf5b6e815c819a73e2bc7234be2a1d6990ab626b Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 2 Jun 2025 15:50:02 +0100 Subject: [PATCH 6/8] Added CSS back --- src/worker/email.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/worker/email.py b/src/worker/email.py index 8b9c83fc..3b21195e 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -36,13 +36,27 @@ email_retrying = [5, 10, 60, 600, 1800, 3600, 12 * 3600] -def generate_pdf_from_html(html: str, page_size: str = 'A4', zoom: str = '1.0', margin_left: str = '10mm', margin_right: str = '10mm') -> bytes: +def generate_pdf_from_html( + html: str, page_size: str = 'A4', zoom: str = '1.25', margin_left: str = '8mm', margin_right: str = '8mm' +) -> bytes: from weasyprint import CSS + page_css 
= f""" + @page {{ + size: {page_size}; + margin-left: {margin_left}; + margin-right: {margin_right}; + }} + body {{ + zoom: {zoom}; + }} + """ html_doc = HTML(string=html) - pdf_bytes = html_doc.write_pdf() + css_doc = CSS(string=page_css) + pdf_bytes = html_doc.write_pdf(stylesheets=[css_doc]) return pdf_bytes + def utcnow(): return datetime.utcnow().replace(tzinfo=timezone.utc) From d72c4b4a1d324b48452e02f4ab4087c7b08bbe90 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 2 Jun 2025 17:13:49 +0100 Subject: [PATCH 7/8] Added some options to weasyprint --- src/worker/email.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/worker/email.py b/src/worker/email.py index 3b21195e..43e23dec 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -37,7 +37,7 @@ def generate_pdf_from_html( - html: str, page_size: str = 'A4', zoom: str = '1.25', margin_left: str = '8mm', margin_right: str = '8mm' + html: str, page_size: str = 'A4', zoom: str = '1.0', margin_left: str = '10mm', margin_right: str = '10mm' ) -> bytes: from weasyprint import CSS @@ -51,9 +51,14 @@ def generate_pdf_from_html( zoom: {zoom}; }} """ + html_doc = HTML(string=html) css_doc = CSS(string=page_css) - pdf_bytes = html_doc.write_pdf(stylesheets=[css_doc]) + + pdf_bytes = html_doc.write_pdf( + stylesheets=[css_doc], + presentational_hints=True, + ) return pdf_bytes From 8207a1bf128293a8f0a0f46f0fb0a932b0829b81 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 2 Jun 2025 17:26:13 +0100 Subject: [PATCH 8/8] Added logging --- src/worker/email.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/worker/email.py b/src/worker/email.py index 43e23dec..29e51267 100644 --- a/src/worker/email.py +++ b/src/worker/email.py @@ -36,8 +36,27 @@ email_retrying = [5, 10, 60, 600, 1800, 3600, 12 * 3600] +# Add a simple url_fetcher for debugging +def logging_url_fetcher(url, timeout=10, ssl_context=None): + from weasyprint.urls 
import default_url_fetcher + main_logger.info(f"WeasyPrint is attempting to fetch URL: {url}") + try: + result = default_url_fetcher(url, timeout=timeout, ssl_context=ssl_context) + main_logger.info(f"Successfully fetched {url}, content type: {result.get('mime_type')}") + return result + except Exception as e: + main_logger.error(f"Failed to fetch {url}: {e}", exc_info=True) + # Propagate the error to see it in PDF generation or logs + raise + + def generate_pdf_from_html( - html: str, page_size: str = 'A4', zoom: str = '1.0', margin_left: str = '10mm', margin_right: str = '10mm' + html: str, + page_size: str = 'A4', + zoom: str = '1.0', + margin_left: str = '10mm', + margin_right: str = '10mm', + base_url_for_html: Optional[str] = None ) -> bytes: from weasyprint import CSS @@ -52,7 +71,11 @@ def generate_pdf_from_html( }} """ - html_doc = HTML(string=html) + html_doc = HTML( + string=html, + base_url=base_url_for_html, + url_fetcher=logging_url_fetcher + ) css_doc = CSS(string=page_css) pdf_bytes = html_doc.write_pdf( @@ -230,11 +253,19 @@ async def _generate_base64_pdf(self, pdf_attachments): for a in pdf_attachments: if a.html: try: + # Assuming URLs in a.html (like {{ bootstrap_url }}) are absolute, + # base_url_for_html can remain None. + # If they could be relative, a proper base_url would be needed here. pdf_content = generate_pdf_from_html( - a.html, page_size='A4', zoom='1.25', margin_left='8mm', margin_right='8mm' + a.html, + page_size='A4', + zoom='1.25', + margin_left='8mm', + margin_right='8mm', + base_url_for_html=None ) except Exception as e: - main_logger.warning('error generating pdf, data: %s', e) + main_logger.warning('error generating pdf, data: %s', e, exc_info=True) else: yield dict(type='application/pdf', name=a.name, content=base64.b64encode(pdf_content).decode())