Remove PDF generation from URLs (generate_pdf now raises RuntimeError for URL sources)

samuelcolvin · samuelcolvin · commit 4d4629e9b802 · 2017-05-23T20:47:00.000+01:00
diff --git a/pydf/wkhtmltopdf.py b/pydf/wkhtmltopdf.py
@@ -76,7 +76,8 @@ def generate_pdf(source, *,
     :param extra_kwargs: any exotic extra options for wkhtmltopdf
     :return: string representing pdf
     """
-    is_url = source.lstrip().startswith(('http', 'www'))
+    if source.lstrip().startswith(('http', 'www')):
+        raise RuntimeError('pdf generation from urls is not supported')
 
     py_args = dict(
         cache_dir=cache_dir,
@@ -114,9 +115,12 @@ def generate_pdf(source, *,
     ]
     metadata = '\n'.join(f'/{name} ({value})' for name, value in fields if value)
 
-    def gen_pdf(src, cmd_args):
+    with NamedTemporaryFile(suffix='.html', mode='wb') as html_file:
+        html_file.write(source.encode())
+        html_file.flush()
+        html_file.seek(0)
         with NamedTemporaryFile(suffix='.pdf', mode='rb') as pdf_file:
-            cmd_args += [src, pdf_file.name]
+            cmd_args += [html_file.name, pdf_file.name]
             _, stderr, returncode = execute_wk(*cmd_args)
             pdf_file.seek(0)
             pdf_bytes = pdf_file.read()
@@ -130,15 +134,6 @@ def gen_pdf(src, cmd_args):
                 pdf_bytes = re.sub(b'/Title.*\n.*\n/Producer.*', metadata.encode(), pdf_bytes, count=1)
             return pdf_bytes
 
-    if is_url:
-        return gen_pdf(source, cmd_args)
-
-    with NamedTemporaryFile(suffix='.html', mode='wb') as html_file:
-        html_file.write(source.encode())
-        html_file.flush()
-        html_file.seek(0)
-        return gen_pdf(html_file.name, cmd_args)
-
 
 def _string_execute(*args):
     return execute_wk(*args)[0].decode().strip(' \n')
diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -1,6 +1,7 @@
 coverage==4.4
 docutils==0.13.1
 flake8==3.3.0
+pdfminer.six==20170419
 pycodestyle==2.3.1
 pyflakes==1.5.0
 pytest==3.0.7
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -1,11 +1,24 @@
+from io import BytesIO, StringIO
+
 import pytest
+import pdfminer.layout
+from pdfminer import high_level
 
 from pydf import generate_pdf, get_extended_help, get_help, get_version
 
 
+def get_pdf_text(pdf_data: bytes) -> str:
+    laparams = pdfminer.layout.LAParams()
+    output = StringIO()
+    high_level.extract_text_to_fp(BytesIO(pdf_data), output, laparams=laparams)
+    return output.getvalue()
+
+
 def test_generate_pdf_with_html():
     pdf_content = generate_pdf('<html><body>Is this thing on?</body></html>')
     assert pdf_content[:4] == b'%PDF'
+    text = get_pdf_text(pdf_content)
+    assert 'Is this thing on?\n\n\x0c' == text
 
 
 def test_generate_pdf_with_html_meta_data():
@@ -27,11 +40,6 @@ def test_generate_pdf_with_html_meta_data():
 /Creator (this is the creator)""" in beginning
 
 
-def test_generate_pdf_with_url():
-    pdf_content = generate_pdf('http://google.com')
-    assert pdf_content[:4] == b'%PDF'
-
-
 def test_unicode():
     pdf_content = generate_pdf(u'<html><body>Schrödinger</body></html>')
     assert pdf_content[:4] == b'%PDF'