Skip to content

Commit 36d7089

Browse files
authored
Merge pull request #176 from pharmaverse/write-html-pdf
2 parents 098a880 + 51a4b14 commit 36d7089

File tree

12 files changed

+485
-138
lines changed

12 files changed

+485
-138
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Changelog
22

3+
## rtflite 2.5.0
4+
5+
### New features
6+
7+
- Added `RTFDocument.write_html` and `RTFDocument.write_pdf` for exporting RTF
8+
documents to HTML and PDF via LibreOffice, matching the `write_docx`
9+
conversion workflow.
10+
11+
### Testing
12+
13+
- Added parameterized tests covering DOCX, HTML, and PDF exports, with a
14+
new `pdf` extra (`pypdf`) for PDF text extraction. Improved LibreOffice
15+
availability checks to skip integration tests when conversion is not working.
16+
317
## rtflite 2.4.0
418

519
### Breaking changes

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ To add rtflite as a dependency with DOCX support for projects using uv:
4444
uv add rtflite --extra docx
4545
```
4646

47-
For rtflite developers, sync optional dependencies with:
47+
For rtflite developers, sync all optional dependencies with:
4848

4949
```bash
50-
uv sync --extra docx
50+
uv sync --all-extras
5151
```
5252

5353
### Optional dependencies - LibreOffice

docs/changelog.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Changelog
22

3+
## rtflite 2.5.0
4+
5+
### New features
6+
7+
- Added `RTFDocument.write_html` and `RTFDocument.write_pdf` for exporting RTF
8+
documents to HTML and PDF via LibreOffice, matching the `write_docx`
9+
conversion workflow.
10+
11+
### Testing
12+
13+
- Added parameterized tests covering DOCX, HTML, and PDF exports, with a
14+
new `pdf` extra (`pypdf`) for PDF text extraction. Improved LibreOffice
15+
availability checks to skip integration tests when conversion is not working.
16+
317
## rtflite 2.4.0
418

519
### Breaking changes

docs/index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ To add rtflite as a dependency with DOCX support for projects using uv:
4444
uv add rtflite --extra docx
4545
```
4646

47-
For rtflite developers, sync optional dependencies with:
47+
For rtflite developers, sync all optional dependencies with:
4848

4949
```bash
50-
uv sync --extra docx
50+
uv sync --all-extras
5151
```
5252

5353
### Optional dependencies - LibreOffice

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ classifiers = [
3737

3838
[project.optional-dependencies]
3939
docx = ["python-docx>=1.0.0"]
40+
pdf = ["pypdf>=5.0.0"]
4041

4142
[project.urls]
4243
Homepage = "https://pharmaverse.github.io/rtflite/"
@@ -132,5 +133,5 @@ select = [
132133

133134
[tool.mypy]
134135
[[tool.mypy.overrides]]
135-
module = ["docx", "docx.*"]
136+
module = ["docx", "docx.*", "pypdf", "pypdf.*"]
136137
ignore_missing_imports = true

src/rtflite/encode.py

Lines changed: 136 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,12 +497,147 @@ def write_docx(
497497
rtf_path.write_text(rtf_code, encoding="utf-8")
498498

499499
with tempfile.TemporaryDirectory() as convert_tmpdir:
500-
docx_path = converter.convert(
500+
converted = converter.convert(
501501
input_files=rtf_path,
502502
output_dir=Path(convert_tmpdir),
503503
format="docx",
504504
overwrite=True,
505505
)
506+
if not isinstance(converted, Path):
507+
raise TypeError(
508+
"LibreOffice conversion returned an unexpected output for a "
509+
"single input file; expected `Path`, got object of type "
510+
f"{type(converted)!r} with value {converted!r}."
511+
)
512+
docx_path = converted
506513
shutil.move(str(docx_path), target_path)
507514

508515
print(target_path)
516+
517+
def write_html(
    self,
    file_path: str | Path,
    *,
    converter: LibreOfficeConverter | None = None,
) -> None:
    """Write the document as an HTML file.

    Writes the document to a temporary RTF file first, and then converts
    it to HTML with LibreOffice. Temporary directories are used for
    all intermediate files to avoid placing artifacts alongside the
    requested output path.

    Args:
        file_path: Destination path for the HTML file.
            Accepts string or Path input. Can be absolute or relative.
            Directories are created if they do not already exist.
        converter: Optional LibreOffice converter instance.
            Pass a configured instance (for example with a custom
            `executable_path`) to control how LibreOffice is invoked and to
            avoid re-initializing and re-verifying the executable path across
            multiple conversions. Note that each call to ``convert()`` still
            starts a new LibreOffice process in headless mode; the process is
            not kept alive between conversions.

    Raises:
        TypeError: If the converter does not return a single `Path` for the
            single input file.

    Examples:
        ```python
        doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
        doc.write_html("output/report.html")
        ```

    Note:
        LibreOffice may create a companion directory (for example
        `report.html_files`) for embedded resources. When present, it is moved
        alongside the requested output path. A companion directory of the
        same name left over from a previous export is replaced, so the
        resources always match the freshly written HTML file.
    """
    target_path = Path(file_path).expanduser()
    target_path.parent.mkdir(parents=True, exist_ok=True)

    if converter is None:
        converter = LibreOfficeConverter()
    with tempfile.TemporaryDirectory() as tmpdir:
        rtf_path = Path(tmpdir) / f"{target_path.stem}.rtf"
        rtf_code = self.rtf_encode()
        rtf_path.write_text(rtf_code, encoding="utf-8")

        with tempfile.TemporaryDirectory() as convert_tmpdir:
            converted = converter.convert(
                input_files=rtf_path,
                output_dir=Path(convert_tmpdir),
                format="html",
                overwrite=True,
            )
            if not isinstance(converted, Path):
                raise TypeError(
                    "LibreOffice conversion returned an unexpected output for a "
                    "single input file; expected `Path`, got object of type "
                    f"{type(converted)!r} with value {converted!r}."
                )
            html_path = converted
            resources_dir = html_path.with_name(f"{html_path.name}_files")
            shutil.move(str(html_path), target_path)
            if resources_dir.is_dir():
                resources_target = target_path.parent / resources_dir.name
                # shutil.move() places the source *inside* a destination that
                # is an existing directory, which would nest the new resources
                # under the stale ones. Drop the old directory first so the
                # move behaves like an overwrite.
                if resources_target.is_dir() and not resources_target.is_symlink():
                    shutil.rmtree(resources_target)
                shutil.move(str(resources_dir), resources_target)

    print(target_path)
585+
586+
def write_pdf(
    self,
    file_path: str | Path,
    *,
    converter: LibreOfficeConverter | None = None,
) -> None:
    """Export the document to PDF through LibreOffice.

    The document is first encoded as RTF into a temporary directory and
    then converted to PDF into a second temporary directory. Only the
    finished PDF is moved to the requested location, so no intermediate
    files are left next to the output.

    Args:
        file_path: Where to write the PDF. May be a string or a Path,
            absolute or relative. Missing parent directories are created.
        converter: LibreOffice converter to use. When omitted, a new
            `LibreOfficeConverter` is created. Supplying a configured
            instance (for example with a custom `executable_path`) controls
            how LibreOffice is invoked and avoids re-initializing and
            re-verifying the executable path for every export. Each call to
            ``convert()`` still starts a new headless LibreOffice process;
            nothing is kept alive between conversions.

    Raises:
        TypeError: If the converter returns something other than a single
            `Path` for the single input file.

    Examples:
        ```python
        doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
        doc.write_pdf("output/report.pdf")
        ```
    """
    destination = Path(file_path).expanduser()
    destination.parent.mkdir(parents=True, exist_ok=True)

    if converter is None:
        converter = LibreOfficeConverter()

    with tempfile.TemporaryDirectory() as staging_dir:
        # Name the RTF after the destination so the converted file carries
        # the same stem.
        rtf_source = Path(staging_dir) / f"{destination.stem}.rtf"
        rtf_source.write_text(self.rtf_encode(), encoding="utf-8")

        with tempfile.TemporaryDirectory() as output_dir:
            result = converter.convert(
                input_files=rtf_source,
                output_dir=Path(output_dir),
                format="pdf",
                overwrite=True,
            )
            if isinstance(result, Path):
                # Move before the temporary directory is cleaned up.
                shutil.move(str(result), destination)
            else:
                raise TypeError(
                    "LibreOffice conversion returned an unexpected output for a "
                    "single input file; expected `Path`, got object of type "
                    f"{type(result)!r} with value {result!r}."
                )

    print(destination)

tests/conftest.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""Shared optional dependency checks for pytest."""
22

3+
import tempfile
4+
from pathlib import Path
5+
36
import pytest
47

58
from rtflite.convert import LibreOfficeConverter
@@ -15,19 +18,51 @@ def has_python_docx() -> bool:
1518
return True
1619

1720

21+
def has_pypdf() -> bool:
    """Report whether the optional `pypdf` package can be imported."""
    try:
        import pypdf  # noqa: F401
    except ImportError:
        available = False
    else:
        available = True
    return available
28+
29+
1830
def has_libreoffice() -> bool:
    """Check that LibreOffice is present and can actually convert a file.

    Returns:
        True only when a `LibreOfficeConverter` can be constructed and a
        minimal RTF-to-PDF smoke conversion completes without raising.
    """
    try:
        lo_converter = LibreOfficeConverter()
    except (FileNotFoundError, RuntimeError):
        return False

    # A valid version report does not guarantee that headless conversion
    # works (sandboxing, for example, can break it), so run a tiny real
    # conversion before declaring LibreOffice usable.
    try:
        with tempfile.TemporaryDirectory() as scratch:
            workdir = Path(scratch)
            sample_rtf = workdir / "rtflite_smoke.rtf"
            sample_rtf.write_text(
                r"{\rtf1\ansi\deff0\fs24 rtflite}",
                encoding="utf-8",
            )
            lo_converter.convert(
                input_files=sample_rtf,
                output_dir=workdir,
                format="pdf",
                overwrite=True,
            )
    except Exception:
        # Any failure here only means the integration tests get skipped.
        return False
    else:
        return True
56+
2657

2758
PYTHON_DOCX_INSTALLED = has_python_docx()
2859
PYTHON_DOCX_REASON = "python-docx is required for DOCX assembly tests"
60+
PYPDF_INSTALLED = has_pypdf()
61+
PYPDF_REASON = "pypdf is required for PDF content extraction tests"
2962
LIBREOFFICE_INSTALLED = has_libreoffice()
30-
LIBREOFFICE_REASON = f"LibreOffice (>= {MIN_VERSION}) not found on system"
63+
LIBREOFFICE_REASON = (
64+
f"LibreOffice (>= {MIN_VERSION}) not found on system or cannot convert files"
65+
)
3166

3267
skip_if_no_python_docx = pytest.mark.skipif(
3368
not PYTHON_DOCX_INSTALLED,
@@ -43,3 +78,8 @@ def has_libreoffice() -> bool:
4378
not (LIBREOFFICE_INSTALLED and PYTHON_DOCX_INSTALLED),
4479
reason=(f"LibreOffice (>= {MIN_VERSION}) and python-docx are required"),
4580
)
81+
82+
# Skip marker for tests that need both a working LibreOffice conversion and
# pypdf for reading the resulting PDF.
skip_if_no_libreoffice_and_pypdf = pytest.mark.skipif(
    not LIBREOFFICE_INSTALLED or not PYPDF_INSTALLED,
    reason=f"LibreOffice (>= {MIN_VERSION}) and pypdf are required",
)

tests/test_write_docx.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

0 commit comments

Comments
 (0)