Skip to content

Commit 68ffb4c

Browse files
committed
adding benchmarks and pdf metadata
1 parent 077ceb2 commit 68ffb4c

File tree

5 files changed

+266
-6
lines changed

5 files changed

+266
-6
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@ build
1111
env
1212
.coverage
1313
.cache/
14+
benchmark/pdf_cache/
15+
benchmark/output/

benchmark/invoice.html

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Invoice INV-1</title>
6+
<link rel="stylesheet" href="https://secure.tutorcruncher.com/static/css/libraries.css">
7+
<link rel="stylesheet" href="https://secure.tutorcruncher.com/static/css/pdf_styles.css">
8+
</head>
9+
10+
<body>
11+
<!--The first page is always a summary cover sheet-->
12+
<page size="A4" class="container" style="margin-top: 30px">
13+
<div class="row">
14+
<div class="col-xs-5">
15+
<h1 style="margin-bottom: 5px">
16+
Summary
17+
</h1>
18+
<h3 style="margin-top: 5px">
19+
<small>Invoice INV-1, Page 1 of 2</small>
20+
<br>
21+
<small>Date: !TODAY!</small>
22+
</h3>
23+
</div>
24+
<div class="col-xs-7">
25+
<img class="logo" src="https://secure.tutorcruncher.com/static/tc-box-logo.png">
26+
</div>
27+
</div>
28+
<div class="row addresses">
29+
<div class="col-xs-5">
30+
<strong>Jane cli_a</strong><br>
31+
cli_a House, Any Street<br>
32+
cli_aville<br>
33+
United Kingdom<br>
34+
PO37 50DE<br>
35+
01264 730 666<br>
36+
37+
</div>
38+
<div class="col-xs-5 col-xs-offset-2 text-right">
39+
<strong>XX branch display name XX</strong><br>
40+
XX branch street XX<br>
41+
XX branch town XX<br>
42+
XX branch country XX<br>
43+
XX branch pc XX<br>
44+
-
45+
</div>
46+
</div>
47+
48+
49+
<div class="row" style="margin-top: 10px;">
50+
<div class="col-xs-12">
51+
<p style="text-align: justify;">
52+
<p>We would recommend that you check the hours on the attached invoices to make sure they correspond with the service you have been provided. If you think there is a discrepancy in the invoice please contact our accounts department before you settle the bill. We would ask you to settle the invoice within 7 days.</p>
53+
54+
</p>
55+
<strong>Summary of invoices payable for the period</strong>
56+
</div>
57+
</div>
58+
59+
<table class="table table-bordered margin-top">
60+
<tbody>
61+
<tr>
62+
<td>Work: total of contractors' invoices</td>
63+
<td class="text-right">£100.00</td>
64+
</tr>
65+
<tr>
66+
<td><strong>AMOUNT DUE FOR PAYMENT</strong></td>
67+
<td class="text-right"><strong>£100.00</strong></td>
68+
</tr>
69+
</tbody>
70+
</table>
71+
72+
<div class="row" style="margin-top: 5px">
73+
<p class="col-xs-12">
74+
This is a summary. For invoice breakdown please see individual PDFs also attached.
75+
</p>
76+
</div>
77+
<div class="row text-right" style="padding-top: 20px;">
78+
<div class="col-xs-12">
79+
<h4>Please quote reference INV-1 with your payment</h4>
80+
81+
<div class="small-para-gap">
82+
<p>Terms of payment - 10 days from invoice date.</p>
83+
84+
<p>Please pay using the link in the email sent with this invoice.</p>
85+
86+
</div>
87+
</div>
88+
</div>
89+
</page>
90+
91+
<!--After the cover sheet there's one page for contractor and potentially the agency-->
92+
<page size="A4" class="container">
93+
<div class="row">
94+
<div class="col-xs-6">
95+
<h1 style="margin-bottom: 5px">
96+
Invoice
97+
</h1>
98+
99+
<h3 style="margin-top: 5px">
100+
<small>Invoice INV-1, Page 2 of 2</small>
101+
<br>
102+
<small>Date: !TODAY!</small>
103+
</h3>
104+
</div>
105+
<div class="col-xs-6">
106+
<img class="logo" align="right" src="https://secure.tutorcruncher.com/static/tc-box-logo.png">
107+
</div>
108+
</div>
109+
110+
<div class="row addresses">
111+
<div class="col-xs-5">
112+
<strong>Jane cli_a</strong><br>
113+
cli_a House, Any Street<br>
114+
cli_aville<br>
115+
United Kingdom<br>
116+
PO37 50DE<br>
117+
01264 730 666
118+
</div>
119+
<div class="col-xs-5 col-xs-offset-2 text-right">
120+
<strong>Jane con_a</strong><br>
121+
con_a House, Any Street<br>
122+
con_aville<br>
123+
United Kingdom<br>
124+
PO37 50DE
125+
</div>
126+
</div>
127+
128+
<table class="table table-bordered">
129+
<thead>
130+
<tr>
131+
<th>
132+
Date
133+
</th>
134+
<th>
135+
Item Description
136+
</th>
137+
<th class="text-right">
138+
Units
139+
</th>
140+
<th class="text-right">
141+
Amount
142+
</th>
143+
</tr>
144+
</thead>
145+
<tbody>
146+
<tr>
147+
<td class="narrow">
148+
!DATE!<br>
149+
12:00
150+
</td>
151+
<td>
152+
XX appointment topic XX<br>
153+
Service #123<br> Appointment #123
154+
</td>
155+
<td class="text-right narrow">1 unit</td>
156+
<td class="text-right narrow">£100.00</td>
157+
</tr>
158+
</tbody>
159+
</table>
160+
161+
<div class="row text-right invoice-totals">
162+
163+
<div class="col-xs-2 col-xs-offset-8">
164+
<p>Total:</p>
165+
</div>
166+
167+
<div class="col-xs-2">
168+
<strong>
169+
<p>£100.00</p>
170+
</strong>
171+
</div>
172+
173+
</div>
174+
175+
<div class="row text-right info-block">
176+
<div class="col-xs-12 small-para-gap">
177+
<p>To ease administration XX branch display name XX collects fees on behalf of tutors. Please make a single payment to XX branch display name XX for the total amount shown on the statement page of this PDF and if paying by BACS or cheque please use the reference number supplied on this statement.</p>
178+
179+
</div>
180+
</div>
181+
</page>
182+
</body>
183+
</html>

benchmark/run.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""
Benchmark pydf: render the mock invoice to PDF repeatedly and report the mean time per run.

Each generated PDF is written to the ``output`` directory next to this script, wkhtmltopdf is
pointed at the ``pdf_cache`` directory via ``cache_dir``.
"""
from pathlib import Path
from time import perf_counter

from pydf import generate_pdf


THIS_DIR = Path(__file__).parent.resolve()
# the invoice declares charset UTF-8 and contains non-ASCII characters (e.g. the pound sign),
# so decode explicitly rather than relying on the locale's default encoding
html = (THIS_DIR / 'invoice.html').read_text(encoding='utf-8')
PDF_CACHE = THIS_DIR / 'pdf_cache'
# exist_ok avoids the check-then-create race of "if not exists(): mkdir()"
PDF_CACHE.mkdir(exist_ok=True)
OUT_DIR = THIS_DIR / 'output'
OUT_DIR.mkdir(exist_ok=True)


# perf_counter is monotonic and high resolution, unlike wall-clock time() which can jump
start = perf_counter()
count = 20
for i in range(count):
    pdf = generate_pdf(
        html,
        title='Benchmark',
        author='Samuel Colvin',
        subject='Mock Invoice',
        page_size='A4',
        zoom='1.25',
        margin_left='8mm',
        margin_right='8mm',
        cache_dir=PDF_CACHE,
    )
    print(f'{i:03}: {len(pdf)}')
    file = OUT_DIR / f'output_{i:03}.pdf'
    file.write_bytes(pdf)
# mean time per iteration, this includes printing and writing each file to disk
time_taken = (perf_counter() - start) / count
print(f'time taken: {time_taken:0.3f}')

pydf/wkhtmltopdf.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
import subprocess
23
from pathlib import Path
34
from tempfile import NamedTemporaryFile
@@ -21,7 +22,14 @@ def execute_wk(*args):
2122
return stdout, stderr, p.returncode
2223

2324

24-
def generate_pdf(source,
25+
def generate_pdf(source, *,
26+
title=None,
27+
author=None,
28+
subject=None,
29+
creator=None,
30+
producer=None,
31+
# from here on arguments are passed via the commandline to wkhtmltopdf
32+
cache_dir=None,
2533
quiet=True,
2634
grayscale=False,
2735
lowquality=False,
@@ -68,9 +76,10 @@ def generate_pdf(source,
6876
:param extra_kwargs: any exotic extra options for wkhtmltopdf
6977
:return: string representing pdf
7078
"""
71-
is_url = source.strip().startswith(('http', 'www'))
79+
is_url = source.lstrip().startswith(('http', 'www'))
7280

7381
py_args = dict(
82+
cache_dir=cache_dir,
7483
quiet=quiet,
7584
grayscale=grayscale,
7685
lowquality=lowquality,
@@ -96,18 +105,30 @@ def generate_pdf(source,
96105
else:
97106
cmd_args.extend([arg_name, str(value)])
98107

108+
fields = [
109+
('Title', title),
110+
('Author', author),
111+
('Subject', subject),
112+
('Creator', creator),
113+
('Producer', producer),
114+
]
115+
metadata = '\n'.join(f'/{name} ({value})' for name, value in fields if value)
116+
99117
def gen_pdf(src, cmd_args):
    """
    Run wkhtmltopdf on ``src`` and return the resulting pdf as bytes.

    :param src: source passed to wkhtmltopdf as its input argument
    :param cmd_args: list of command line arguments to precede the input and output arguments
    :return: bytes of the generated pdf, with the metadata block substituted if any was given
    :raises RuntimeError: if wkhtmltopdf returns non-zero and the output doesn't look like a pdf
    """
    with NamedTemporaryFile(suffix='.pdf', mode='rb') as pdf_file:
        # build a new list so the caller's cmd_args is not extended in place
        full_args = [*cmd_args, src, pdf_file.name]
        _, stderr, returncode = execute_wk(*full_args)
        pdf_file.seek(0)
        pdf_bytes = pdf_file.read()
        # it seems wkhtmltopdf's error codes can be false, we'll ignore them if we
        # seem to have generated a pdf
        if returncode != 0 and pdf_bytes[:4] != b'%PDF':
            raise RuntimeError('error running wkhtmltopdf, command: {!r}\n'
                               'response: "{}"'.format(full_args, stderr.strip()))

        if metadata:
            replacement = metadata.encode()
            # a callable replacement is inserted literally; passing the bytes directly would make
            # re.sub treat any backslash in the metadata as an escape or group reference
            pdf_bytes = re.sub(b'/Title.*\n.*\n/Producer.*', lambda _: replacement, pdf_bytes, count=1)
        return pdf_bytes
111132

112133
if is_url:
113134
return gen_pdf(source, cmd_args)
@@ -120,7 +141,7 @@ def gen_pdf(src, cmd_args):
120141

121142

122143
def _string_execute(*args):
    """
    Run wkhtmltopdf with ``args`` and return its decoded stdout without surrounding spaces
    or newlines.
    """
    stdout = execute_wk(*args)[0]
    text = stdout.decode()
    return text.strip(' \n')
124145

125146

126147
def get_version():

tests/test_main.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,25 @@ def test_generate_pdf_with_html():
88
assert pdf_content[:4] == b'%PDF'
99

1010

11+
def test_generate_pdf_with_html_meta_data():
    """
    Metadata keyword arguments passed to generate_pdf should appear near the start of the pdf.
    """
    meta = dict(
        title='title foobar',
        subject='the subject',
        author='Samuel Colvin',
        creator='this is the creator',
    )
    pdf_content = generate_pdf('<html><body>Is this thing on?</body></html>', **meta)
    assert pdf_content[:4] == b'%PDF'

    # only the first 300 decoded characters are inspected, undecodable bytes are dropped
    beginning = pdf_content.decode('utf8', 'ignore')[:300]
    print(beginning)
    expected = """
<<
/Title (title foobar)
/Author (Samuel Colvin)
/Subject (the subject)
/Creator (this is the creator)"""
    assert expected in beginning
28+
29+
1130
def test_generate_pdf_with_url():
1231
pdf_content = generate_pdf('http://google.com')
1332
assert pdf_content[:4] == b'%PDF'

0 commit comments

Comments
 (0)