Skip to content

Commit 94a15c5

Browse files
committed
adding async pdf generation using create_subprocess_exec
1 parent a26b687 commit 94a15c5

File tree

5 files changed

+193
-64
lines changed

5 files changed

+193
-64
lines changed

benchmark/run.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
import asyncio
12
from pathlib import Path
23
from time import time
34

4-
from pydf import generate_pdf
5+
from pydf import AsyncPydf, generate_pdf
56

67

78
THIS_DIR = Path(__file__).parent.resolve()
@@ -14,22 +15,59 @@
1415
Path.mkdir(OUT_DIR)
1516

1617

18+
def go_sync():
    """
    Generate the benchmark pdf repeatedly, one after another, using the
    synchronous generate_pdf and write each result into OUT_DIR.

    :return: number of pdfs generated
    """
    count = 20
    options = dict(
        title='Benchmark',
        author='Samuel Colvin',
        subject='Mock Invoice',
        page_size='A4',
        zoom='1.25',
        margin_left='8mm',
        margin_right='8mm',
        cache_dir=PDF_CACHE,
    )
    for index in range(count):
        pdf = generate_pdf(html, **options)
        print(f'{index:03}: {len(pdf)}')
        (OUT_DIR / f'output_{index:03}.pdf').write_bytes(pdf)
    return count
36+
37+
start = time()
38+
count = go_sync()
39+
time_taken = (time() - start) / count
40+
print(f'sync, time taken per pdf: {time_taken:0.3f}s')
41+
42+
async def go_async():
    """
    Generate the benchmark pdf repeatedly using AsyncPydf, scheduling all
    generations together with asyncio.gather, and write each result into
    OUT_DIR.

    :return: number of pdfs generated
    """
    count = 20
    apydf = AsyncPydf(max_processes=20)
    options = dict(
        title='Benchmark',
        author='Samuel Colvin',
        subject='Mock Invoice',
        page_size='A4',
        zoom='1.25',
        margin_left='8mm',
        margin_right='8mm',
        cache_dir=PDF_CACHE,
    )

    async def gen(i_):
        # generate one pdf and store it under its zero padded index
        pdf = await apydf.generate_pdf(html, **options)
        print(f'{i_:03}: {len(pdf)}')
        (OUT_DIR / f'output_{i_:03}.pdf').write_bytes(pdf)

    await asyncio.gather(*(gen(index) for index in range(count)))
    return count
67+
68+
1769
start = time()
18-
count = 20
19-
for i in range(count):
20-
pdf = generate_pdf(
21-
html,
22-
title='Benchmark',
23-
author='Samuel Colvin',
24-
subject='Mock Invoice',
25-
page_size='A4',
26-
zoom='1.25',
27-
margin_left='8mm',
28-
margin_right='8mm',
29-
cache_dir=PDF_CACHE,
30-
)
31-
print(f'{i:03}: {len(pdf)}')
32-
file = OUT_DIR / f'output_{i:03}.pdf'
33-
file.write_bytes(pdf)
70+
loop = asyncio.get_event_loop()
71+
count = loop.run_until_complete(go_async())
3472
time_taken = (time() - start) / count
35-
print(f'time taken: {time_taken:0.3f}')
73+
print(f'async time taken per pdf: {time_taken:0.3f}s')

pydf/wkhtmltopdf.py

Lines changed: 100 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,106 @@
1+
import asyncio
12
import re
23
import subprocess
4+
35
from pathlib import Path
46

57
from .version import VERSION
68

9+
__all__ = [
10+
'AsyncPydf',
11+
'generate_pdf',
12+
'get_version',
13+
'get_help',
14+
'get_extended_help',
15+
]
16+
717
THIS_DIR = Path(__file__).parent.resolve()
8-
WK_PATH = THIS_DIR / 'bin' / 'wkhtmltopdf'
18+
WK_PATH = str(THIS_DIR / 'bin' / 'wkhtmltopdf')
919

1020

11-
def execute_wk(*args, input=None):
21+
def _execute_wk(*args, input=None):
    """
    Run the wkhtmltopdf binary found at WK_PATH with the given arguments.

    :param args: command line arguments placed after the wkhtmltopdf path
    :param input: optional data handed to subprocess.run as the process's stdin
    :return: the subprocess.CompletedProcess, with stdout and stderr captured
    """
    command = (WK_PATH, *args)
    return subprocess.run(command, input=input, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2030

2131

22-
def generate_pdf(source, *,
32+
def _convert_args(py_args):
33+
cmd_args = []
34+
for name, value in py_args.items():
35+
if value in {None, False}:
36+
continue
37+
arg_name = '--' + name.replace('_', '-')
38+
if value is True:
39+
cmd_args.append(arg_name)
40+
else:
41+
cmd_args.extend([arg_name, str(value)])
42+
43+
# read from stdin and write to stdout
44+
cmd_args.extend(['-', '-'])
45+
return cmd_args
46+
47+
48+
def _set_meta_data(pdf_content, **kwargs):
49+
fields = [
50+
('Title', kwargs.get('title')),
51+
('Author', kwargs.get('author')),
52+
('Subject', kwargs.get('subject')),
53+
('Creator', kwargs.get('creator')),
54+
('Producer', kwargs.get('producer')),
55+
]
56+
metadata = '\n'.join(f'/{name} ({value})' for name, value in fields if value)
57+
if metadata:
58+
pdf_content = re.sub(b'/Title.*\n.*\n/Producer.*', metadata.encode(), pdf_content, count=1)
59+
return pdf_content
60+
61+
62+
class AsyncPydf:
    """
    Generate pdfs with wkhtmltopdf from asyncio code, limiting how many
    wkhtmltopdf processes run at the same time.
    """

    def __init__(self, *, max_processes=20, loop=None):
        """
        :param max_processes: maximum number of wkhtmltopdf processes running concurrently
        :param loop: optional event loop, only forwarded to asyncio when given
        """
        # asyncio removed the ``loop`` argument in python 3.10 (passing it, even
        # as None, raises TypeError), so only forward it when explicitly supplied
        self._loop_kwargs = {} if loop is None else {'loop': loop}
        self.semaphore = asyncio.Semaphore(value=max_processes, **self._loop_kwargs)
        self.loop = loop

    async def generate_pdf(self,
                           html,
                           title=None,
                           author=None,
                           subject=None,
                           creator=None,
                           producer=None,
                           **cmd_args):
        """
        Generate a pdf from an html string by running wkhtmltopdf in a subprocess.

        :param html: html string to generate pdf from
        :param title: title for the pdf metadata
        :param author: author for the pdf metadata
        :param subject: subject for the pdf metadata
        :param creator: creator for the pdf metadata
        :param producer: producer for the pdf metadata
        :param cmd_args: options converted to wkhtmltopdf command line arguments
        :return: bytes of the generated pdf
        :raises RuntimeError: if wkhtmltopdf exits with a non-zero code and the
            output does not start with ``%PDF``
        """
        command = [WK_PATH] + _convert_args(cmd_args)
        async with self.semaphore:
            p = await asyncio.create_subprocess_exec(
                *command,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                **self._loop_kwargs
            )
            # communicate() feeds stdin and drains stdout/stderr while waiting;
            # calling wait() before reading the pipes can deadlock once the
            # process produces more output than the pipe buffers hold
            pdf_content, stderr = await p.communicate(html.encode())

        # wkhtmltopdf's exit code is only treated as a failure when the output
        # does not look like a pdf
        if p.returncode != 0 and pdf_content[:4] != b'%PDF':
            raise RuntimeError('error running wkhtmltopdf, command: {!r}\n'
                               'response: "{}"'.format(command, stderr.strip()))

        return _set_meta_data(
            pdf_content,
            title=title,
            author=author,
            subject=subject,
            creator=creator,
            producer=producer,
        )
101+
102+
103+
def generate_pdf(html, *,
23104
title=None,
24105
author=None,
25106
subject=None,
@@ -55,7 +136,7 @@ def generate_pdf(source, *,
55136
Arguments which are True are passed with no value eg. just --quiet, False
56137
and None arguments are missed, everything else is passed with str(value).
57138
58-
:param source: html string to generate pdf from or url to get
139+
:param html: html string to generate pdf from
59140
:param grayscale: bool
60141
:param lowquality: bool
61142
:param margin_bottom: string eg. 10mm
@@ -71,7 +152,7 @@ def generate_pdf(source, *,
71152
:param extra_kwargs: any exotic extra options for wkhtmltopdf
72153
:return: string representing pdf
73154
"""
74-
if source.lstrip().startswith(('http', 'www')):
155+
if html.lstrip().startswith(('http', 'www')):
75156
raise ValueError('pdf generation from urls is not supported')
76157

77158
py_args = dict(
@@ -90,43 +171,29 @@ def generate_pdf(source, *,
90171
image_quality=image_quality,
91172
)
92173
py_args.update(extra_kwargs)
93-
cmd_args = []
94-
for name, value in py_args.items():
95-
if value in {None, False}:
96-
continue
97-
arg_name = '--' + name.replace('_', '-')
98-
if value is True:
99-
cmd_args.append(arg_name)
100-
else:
101-
cmd_args.extend([arg_name, str(value)])
102-
103-
# read from stdin and write to stdout
104-
cmd_args += ['-', '-']
174+
cmd_args = _convert_args(py_args)
105175

106-
p = execute_wk(*cmd_args, input=source.encode())
107-
pdf_bytes = p.stdout
176+
p = _execute_wk(*cmd_args, input=html.encode())
177+
pdf_content = p.stdout
108178

109179
# it seems wkhtmltopdf's error codes can be false, we'll ignore them if we
110180
# seem to have generated a pdf
111-
if p.returncode != 0 and pdf_bytes[:4] != b'%PDF':
181+
if p.returncode != 0 and pdf_content[:4] != b'%PDF':
112182
raise RuntimeError('error running wkhtmltopdf, command: {!r}\n'
113183
'response: "{}"'.format(cmd_args, p.stderr.strip()))
114184

115-
fields = [
116-
('Title', title),
117-
('Author', author),
118-
('Subject', subject),
119-
('Creator', creator),
120-
('Producer', producer),
121-
]
122-
metadata = '\n'.join(f'/{name} ({value})' for name, value in fields if value)
123-
if metadata:
124-
pdf_bytes = re.sub(b'/Title.*\n.*\n/Producer.*', metadata.encode(), pdf_bytes, count=1)
125-
return pdf_bytes
185+
return _set_meta_data(
186+
pdf_content,
187+
title=title,
188+
author=author,
189+
subject=subject,
190+
creator=creator,
191+
producer=producer,
192+
)
126193

127194

128195
def _string_execute(*args):
129-
return execute_wk(*args).stdout.decode().strip(' \n')
196+
return _execute_wk(*args).stdout.decode().strip(' \n')
130197

131198

132199
def get_version():

tests/test_async.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import asyncio
2+
3+
import pytest
4+
from pydf import AsyncPydf
5+
from .utils import pdf_text
6+
7+
8+
def test_async_pdf_gen():
    """
    A simple html document run through AsyncPydf yields a pdf containing its text.
    """
    apydf = AsyncPydf()
    loop = asyncio.get_event_loop()

    generation = apydf.generate_pdf('<html><body>Is this thing on?</body></html>')
    pdf_content = loop.run_until_complete(generation)
    assert pdf_content[:4] == b'%PDF'
    assert 'Is this thing on?\n\n\x0c' == pdf_text(pdf_content)
16+
17+
18+
def test_invalid_argument():
    """
    An option wkhtmltopdf does not understand results in a RuntimeError
    whose message includes the command that was run.
    """
    apydf = AsyncPydf()
    loop = asyncio.get_event_loop()
    with pytest.raises(RuntimeError) as exc_info:
        loop.run_until_complete(apydf.generate_pdf('hello', foobar='broken'))
    # check the raised exception itself, after the with block has exited;
    # str() of pytest's ExceptionInfo wrapper is not the exception message
    assert 'error running wkhtmltopdf, command' in str(exc_info.value)
Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,13 @@
1-
from io import BytesIO, StringIO
2-
3-
import pdfminer.layout
41
import pytest
5-
from pdfminer import high_level
62

73
from pydf import generate_pdf, get_extended_help, get_help, get_version
8-
9-
10-
def get_pdf_text(pdf_data: bytes) -> str:
11-
laparams = pdfminer.layout.LAParams()
12-
output = StringIO()
13-
high_level.extract_text_to_fp(BytesIO(pdf_data), output, laparams=laparams)
14-
return output.getvalue()
4+
from .utils import pdf_text
155

166

177
def test_generate_pdf_with_html():
188
pdf_content = generate_pdf('<html><body>Is this thing on?</body></html>')
199
assert pdf_content[:4] == b'%PDF'
20-
text = get_pdf_text(pdf_content)
10+
text = pdf_text(pdf_content)
2111
assert 'Is this thing on?\n\n\x0c' == text
2212

2313

@@ -90,7 +80,7 @@ def test_generate_url():
9080

9181
def test_bad_arguments():
9282
with pytest.raises(RuntimeError) as exc_info:
93-
generate_pdf('hellp', foobar='broken')
83+
generate_pdf('hello', foobar='broken')
9484
assert 'error running wkhtmltopdf, command' in str(exc_info)
9585

9686

tests/utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from io import BytesIO, StringIO
2+
3+
import pdfminer.layout
4+
from pdfminer import high_level
5+
6+
7+
def pdf_text(pdf_data: bytes) -> str:
    """
    Extract the text of a pdf using pdfminer with default layout parameters.

    :param pdf_data: bytes of the pdf
    :return: text pdfminer wrote for the document
    """
    text_buffer = StringIO()
    high_level.extract_text_to_fp(
        BytesIO(pdf_data),
        text_buffer,
        laparams=pdfminer.layout.LAParams(),
    )
    return text_buffer.getvalue()

0 commit comments

Comments
 (0)