Skip to content

Commit 00203cb

Browse files
committed
Add export-data and import-data CLI commands for PDF archives
Flask CLI commands to export all PDFs as tar.gz and import them back, useful for migrating data between environments or creating backups.
1 parent 2383b69 commit 00203cb

File tree

2 files changed

+203
-0
lines changed

2 files changed

+203
-0
lines changed

src/app.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import os
33
import re
4+
import tarfile
45
import time
56
from collections import Counter, defaultdict
67
from datetime import datetime
@@ -118,6 +119,8 @@ class TokenCount(db.Model):
118119
db.session.commit()
119120

120121

122+
# Directory that holds the PDF files. The export-data and import-data CLI
# commands read this module-level name at call time, so tests can patch it
# to point at a temporary directory.
PDF_DIR = Path("/data/pdfs")
123+
121124
jurisdictions = ["Bund"] + [
122125
l[1] for l in sorted(report_info["abr"], key=lambda x: x[1])
123126
]
@@ -319,6 +322,50 @@ def generate_images(pattern="*", force=False):
319322
generate_page_images(pdf_path, i)
320323

321324

325+
@app.cli.command()
@click.argument("output_path")
def export_data(output_path):
    """Export all PDFs from ``PDF_DIR`` as a flat tar.gz archive.

    Every ``*.pdf`` file directly inside ``PDF_DIR`` is stored in the
    archive under its bare file name (no directory prefix), so the archive
    can be unpacked into any target directory by ``import-data``.

    Args:
        output_path: Path of the ``.tar.gz`` file to create. It is only
            created when at least one PDF is found.

    Prints an explanatory message and returns without writing anything when
    ``PDF_DIR`` does not exist or contains no PDF files.
    """
    pdf_dir = PDF_DIR
    if not pdf_dir.exists():
        print(f"Error: {pdf_dir} does not exist")
        return

    # Keep only real files: a directory that happens to be named "*.pdf"
    # would otherwise be added recursively by tar.add().
    pdfs = sorted(p for p in pdf_dir.glob("*.pdf") if p.is_file())
    if not pdfs:
        # Report the directory actually searched rather than a fixed path.
        print(f"No PDF files found in {pdf_dir}")
        return

    # dereference=True stores the content of symlinked PDFs instead of the
    # link itself, so the archive stays self-contained on another machine.
    with tarfile.open(output_path, "w:gz", dereference=True) as tar:
        for pdf_path in pdfs:
            tar.add(str(pdf_path), arcname=pdf_path.name)
            print(f" Added {pdf_path.name}")

    print(f"Exported {len(pdfs)} PDFs to {output_path}")
345+
346+
347+
@app.cli.command()
@click.argument("input_path")
def import_data(input_path):
    """Import PDFs from a tar.gz archive into ``PDF_DIR``.

    Only regular-file members whose name ends in ``.pdf`` are extracted.
    Any directory prefix stored in the archive is dropped, so every file
    lands directly inside ``PDF_DIR`` (which is created if missing).

    Args:
        input_path: Path of the ``.tar.gz`` archive to read.

    Prints an error and returns without touching ``PDF_DIR`` when the
    archive does not exist.
    """
    input_file = Path(input_path)
    if not input_file.exists():
        print(f"Error: {input_path} does not exist")
        return

    pdf_dir = PDF_DIR
    pdf_dir.mkdir(parents=True, exist_ok=True)

    # Use the "data" extraction filter where the running Python provides it
    # (3.12+, and security backports of older versions). It ignores
    # archive-supplied ownership and special mode bits and avoids the
    # DeprecationWarning for unfiltered extraction.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(input_path, "r:gz") as tar:
        # isfile() rejects symlinks, hard links, devices and directories
        # that merely carry a ".pdf" name; the archive may be untrusted.
        members = [
            m for m in tar.getmembers()
            if m.isfile() and m.name.endswith(".pdf")
        ]
        for member in members:
            member.name = Path(member.name).name  # strip any directory prefix
            tar.extract(member, path=str(pdf_dir), **extract_kwargs)
            print(f" Extracted {member.name}")

    print(f"Imported {len(members)} PDFs to {pdf_dir}")
367+
368+
322369
def get_index():
323370
res = []
324371
total = 0

tests/test_unit.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,5 +202,161 @@ def test_autocomplete_nonexistent_first_token(self):
202202
assert response.get_json() == []
203203

204204

205+
# ---------------------------------------------------------------------------
206+
# CLI command tests: export-data / import-data
207+
# ---------------------------------------------------------------------------
208+
209+
class TestExportData:
    """Test the flask export-data CLI command.

    Each test points ``app.PDF_DIR`` at a temporary directory and invokes the
    command through Flask's CLI test runner. The runner is created without
    ``mix_stderr`` because Click 8.2 removed that parameter; the command only
    writes to stdout, so ``result.output`` is the same either way.
    """

    def test_export_creates_tarball_with_pdfs(self, tmp_path):
        """Exporting should create a tar.gz containing all PDFs."""
        import app as app_module
        import tarfile

        pdf_dir = tmp_path / "pdfs"
        pdf_dir.mkdir()
        (pdf_dir / "report-a.pdf").write_bytes(b"%PDF-fake-a")
        (pdf_dir / "report-b.pdf").write_bytes(b"%PDF-fake-b")

        output_file = tmp_path / "export.tar.gz"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", pdf_dir):
            result = runner.invoke(args=["export-data", str(output_file)])

        assert result.exit_code == 0
        assert "Exported 2 PDFs" in result.output

        # Members are stored under their bare file names.
        with tarfile.open(str(output_file), "r:gz") as tar:
            names = sorted(tar.getnames())
        assert names == ["report-a.pdf", "report-b.pdf"]

    def test_export_no_pdfs_found(self, tmp_path):
        """Exporting when no PDFs exist should print a message."""
        import app as app_module

        pdf_dir = tmp_path / "pdfs"
        pdf_dir.mkdir()
        output_file = tmp_path / "export.tar.gz"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", pdf_dir):
            result = runner.invoke(args=["export-data", str(output_file)])

        assert result.exit_code == 0
        assert "No PDF files found" in result.output
        assert not output_file.exists()

    def test_export_missing_directory(self, tmp_path):
        """Exporting when pdf dir doesn't exist should print an error."""
        import app as app_module

        nonexistent = tmp_path / "nonexistent"
        output_file = tmp_path / "export.tar.gz"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", nonexistent):
            result = runner.invoke(args=["export-data", str(output_file)])

        assert result.exit_code == 0
        assert "does not exist" in result.output
        # The command must bail out before creating any archive.
        assert not output_file.exists()
264+
265+
266+
class TestImportData:
    """Test the flask import-data CLI command.

    Each test builds a small tar.gz archive under ``tmp_path``, points
    ``app.PDF_DIR`` at a temporary destination and invokes the command
    through Flask's CLI test runner. The runner is created without
    ``mix_stderr`` because Click 8.2 removed that parameter; the command only
    writes to stdout, so ``result.output`` is the same either way.
    """

    def test_import_extracts_pdfs(self, tmp_path):
        """Importing should extract PDFs from tarball."""
        import app as app_module
        import tarfile

        archive_path = tmp_path / "import.tar.gz"
        pdf_content_a = b"%PDF-fake-a"
        pdf_content_b = b"%PDF-fake-b"

        src_dir = tmp_path / "src"
        src_dir.mkdir()
        (src_dir / "report-a.pdf").write_bytes(pdf_content_a)
        (src_dir / "report-b.pdf").write_bytes(pdf_content_b)

        with tarfile.open(str(archive_path), "w:gz") as tar:
            tar.add(str(src_dir / "report-a.pdf"), arcname="report-a.pdf")
            tar.add(str(src_dir / "report-b.pdf"), arcname="report-b.pdf")

        # The destination does not exist yet; the command must create it.
        dest_dir = tmp_path / "data_pdfs"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", dest_dir):
            result = runner.invoke(args=["import-data", str(archive_path)])

        assert result.exit_code == 0
        assert "Imported 2 PDFs" in result.output
        assert (dest_dir / "report-a.pdf").read_bytes() == pdf_content_a
        assert (dest_dir / "report-b.pdf").read_bytes() == pdf_content_b

    def test_import_skips_non_pdf_files(self, tmp_path):
        """Importing should only extract .pdf files, ignoring others."""
        import app as app_module
        import tarfile

        archive_path = tmp_path / "import.tar.gz"
        src_dir = tmp_path / "src"
        src_dir.mkdir()
        (src_dir / "report.pdf").write_bytes(b"%PDF-fake")
        (src_dir / "readme.txt").write_bytes(b"not a pdf")

        with tarfile.open(str(archive_path), "w:gz") as tar:
            tar.add(str(src_dir / "report.pdf"), arcname="report.pdf")
            tar.add(str(src_dir / "readme.txt"), arcname="readme.txt")

        dest_dir = tmp_path / "data_pdfs"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", dest_dir):
            result = runner.invoke(args=["import-data", str(archive_path)])

        assert result.exit_code == 0
        assert "Imported 1 PDFs" in result.output
        assert (dest_dir / "report.pdf").exists()
        assert not (dest_dir / "readme.txt").exists()

    def test_import_strips_directory_prefix(self, tmp_path):
        """Members stored under a sub-path should land directly in the PDF dir."""
        import app as app_module
        import tarfile

        archive_path = tmp_path / "import.tar.gz"
        src_dir = tmp_path / "src"
        src_dir.mkdir()
        pdf_content = b"%PDF-nested"
        (src_dir / "report.pdf").write_bytes(pdf_content)

        with tarfile.open(str(archive_path), "w:gz") as tar:
            tar.add(str(src_dir / "report.pdf"), arcname="nested/dir/report.pdf")

        dest_dir = tmp_path / "data_pdfs"

        runner = app_module.app.test_cli_runner()
        with patch.object(app_module, "PDF_DIR", dest_dir):
            result = runner.invoke(args=["import-data", str(archive_path)])

        assert result.exit_code == 0
        assert "Imported 1 PDFs" in result.output
        assert (dest_dir / "report.pdf").read_bytes() == pdf_content
        assert not (dest_dir / "nested").exists()

    def test_import_missing_archive(self, tmp_path):
        """Importing a nonexistent file should print an error."""
        import app as app_module

        runner = app_module.app.test_cli_runner()
        result = runner.invoke(args=["import-data", str(tmp_path / "nonexistent.tar.gz")])

        assert result.exit_code == 0
        assert "does not exist" in result.output

    def test_export_then_import_roundtrip(self, tmp_path):
        """Export and then import should produce identical files."""
        import app as app_module

        src_pdf_dir = tmp_path / "src_pdfs"
        src_pdf_dir.mkdir()
        pdf_content = b"%PDF-roundtrip-test"
        (src_pdf_dir / "vsbericht-bund-2020.pdf").write_bytes(pdf_content)

        archive_path = tmp_path / "roundtrip.tar.gz"

        runner = app_module.app.test_cli_runner()

        # Export
        with patch.object(app_module, "PDF_DIR", src_pdf_dir):
            result = runner.invoke(args=["export-data", str(archive_path)])
        assert result.exit_code == 0

        # Import into a different directory
        dest_pdf_dir = tmp_path / "dest_pdfs"
        with patch.object(app_module, "PDF_DIR", dest_pdf_dir):
            result = runner.invoke(args=["import-data", str(archive_path)])
        assert result.exit_code == 0

        assert (dest_pdf_dir / "vsbericht-bund-2020.pdf").read_bytes() == pdf_content
359+
360+
205361
if __name__ == '__main__':
206362
pytest.main([__file__, '-v'])

0 commit comments

Comments
 (0)