Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## rtflite 2.5.0

### New features

- Added `RTFDocument.write_html` and `RTFDocument.write_pdf` for exporting RTF
documents to HTML and PDF via LibreOffice, matching the `write_docx`
conversion workflow.

### Testing

- Added parameterized tests covering DOCX, HTML, and PDF exports, with a
new `pdf` extra (`pypdf`) for PDF text extraction. Improved LibreOffice
availability checks to skip integration tests when conversion is not working.

## rtflite 2.4.0

### Breaking changes
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ To add rtflite as a dependency with DOCX support for projects using uv:
uv add rtflite --extra docx
```

For rtflite developers, sync optional dependencies with:
For rtflite developers, sync all optional dependencies with:

```bash
uv sync --extra docx
uv sync --all-extras
```

### Optional dependencies - LibreOffice
Expand Down
14 changes: 14 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## rtflite 2.5.0

### New features

- Added `RTFDocument.write_html` and `RTFDocument.write_pdf` for exporting RTF
documents to HTML and PDF via LibreOffice, matching the `write_docx`
conversion workflow.

### Testing

- Added parameterized tests covering DOCX, HTML, and PDF exports, with a
new `pdf` extra (`pypdf`) for PDF text extraction. Improved LibreOffice
availability checks to skip integration tests when conversion is not working.

## rtflite 2.4.0

### Breaking changes
Expand Down
4 changes: 2 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ To add rtflite as a dependency with DOCX support for projects using uv:
uv add rtflite --extra docx
```

For rtflite developers, sync optional dependencies with:
For rtflite developers, sync all optional dependencies with:

```bash
uv sync --extra docx
uv sync --all-extras
```

### Optional dependencies - LibreOffice
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ classifiers = [

[project.optional-dependencies]
docx = ["python-docx>=1.0.0"]
pdf = ["pypdf>=5.0.0"]

[project.urls]
Homepage = "https://pharmaverse.github.io/rtflite/"
Expand Down Expand Up @@ -132,5 +133,5 @@ select = [

[tool.mypy]
[[tool.mypy.overrides]]
module = ["docx", "docx.*"]
module = ["docx", "docx.*", "pypdf", "pypdf.*"]
ignore_missing_imports = true
137 changes: 136 additions & 1 deletion src/rtflite/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,12 +497,147 @@ def write_docx(
rtf_path.write_text(rtf_code, encoding="utf-8")

with tempfile.TemporaryDirectory() as convert_tmpdir:
docx_path = converter.convert(
converted = converter.convert(
input_files=rtf_path,
output_dir=Path(convert_tmpdir),
format="docx",
overwrite=True,
)
if not isinstance(converted, Path):
raise TypeError(
"LibreOffice conversion returned an unexpected output for a "
"single input file; expected `Path`, got object of type "
f"{type(converted)!r} with value {converted!r}."
)
docx_path = converted
shutil.move(str(docx_path), target_path)

print(target_path)

def write_html(
    self,
    file_path: str | Path,
    *,
    converter: LibreOfficeConverter | None = None,
) -> None:
    """Write the document as an HTML file.

    Writes the document to a temporary RTF file first, and then converts
    it to HTML with LibreOffice. Temporary directories are used for
    all intermediate files to avoid placing artifacts alongside the
    requested output path. The final output path is printed to stdout.

    Args:
        file_path: Destination path for the HTML file.
            Accepts string or Path input. Can be absolute or relative.
            Directories are created if they do not already exist.
        converter: Optional LibreOffice converter instance.
            Pass a configured instance (for example with a custom
            `executable_path`) to control how LibreOffice is invoked and to
            avoid re-initializing and re-verifying the executable path across
            multiple conversions. Note that each call to ``convert()`` still
            starts a new LibreOffice process in headless mode; the process is
            not kept alive between conversions.

    Raises:
        TypeError: If the converter does not return a single `Path` for the
            single input file.

    Examples:
        ```python
        doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
        doc.write_html("output/report.html")
        ```

    Note:
        LibreOffice may create a companion directory (for example
        `report.html_files`) for embedded resources. When present, it is moved
        alongside the requested output path, replacing any directory of the
        same name left behind by a previous export.
    """
    target_path = Path(file_path).expanduser()
    target_path.parent.mkdir(parents=True, exist_ok=True)

    if converter is None:
        converter = LibreOfficeConverter()
    with tempfile.TemporaryDirectory() as tmpdir:
        # Name the intermediate RTF after the target so the converted file
        # (and its resources directory) inherit the requested stem.
        rtf_path = Path(tmpdir) / f"{target_path.stem}.rtf"
        rtf_code = self.rtf_encode()
        rtf_path.write_text(rtf_code, encoding="utf-8")

        with tempfile.TemporaryDirectory() as convert_tmpdir:
            converted = converter.convert(
                input_files=rtf_path,
                output_dir=Path(convert_tmpdir),
                format="html",
                overwrite=True,
            )
            if not isinstance(converted, Path):
                raise TypeError(
                    "LibreOffice conversion returned an unexpected output for a "
                    "single input file; expected `Path`, got object of type "
                    f"{type(converted)!r} with value {converted!r}."
                )
            html_path = converted
            resources_dir = html_path.with_name(f"{html_path.name}_files")
            shutil.move(str(html_path), target_path)
            if resources_dir.is_dir():
                resources_target = target_path.parent / resources_dir.name
                # `shutil.move` places the source *inside* an existing
                # destination directory, so a repeated export would nest
                # `<name>_files/<name>_files`. Clear the stale entry first.
                if resources_target.is_symlink() or resources_target.is_file():
                    resources_target.unlink()
                elif resources_target.is_dir():
                    shutil.rmtree(resources_target)
                shutil.move(str(resources_dir), resources_target)

    print(target_path)

def write_pdf(
    self,
    file_path: str | Path,
    *,
    converter: LibreOfficeConverter | None = None,
) -> None:
    """Write the document as a PDF file.

    Writes the document to a temporary RTF file first, and then converts
    it to PDF with LibreOffice. Temporary directories are used for
    all intermediate files to avoid placing artifacts alongside the
    requested output path. The final output path is printed to stdout.

    Args:
        file_path: Destination path for the PDF file.
            Accepts string or Path input. Can be absolute or relative.
            Directories are created if they do not already exist.
        converter: Optional LibreOffice converter instance.
            Pass a configured instance (for example with a custom
            `executable_path`) to control how LibreOffice is invoked and to
            avoid re-initializing and re-verifying the executable path across
            multiple conversions. Note that each call to ``convert()`` still
            starts a new LibreOffice process in headless mode; the process is
            not kept alive between conversions.

    Raises:
        TypeError: If the converter does not return a single `Path` for the
            single input file.

    Examples:
        ```python
        doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
        doc.write_pdf("output/report.pdf")
        ```
    """
    target_path = Path(file_path).expanduser()
    target_path.parent.mkdir(parents=True, exist_ok=True)

    if converter is None:
        converter = LibreOfficeConverter()
    with tempfile.TemporaryDirectory() as tmpdir:
        # Name the intermediate RTF after the target so the converted file
        # inherits the requested stem.
        rtf_path = Path(tmpdir) / f"{target_path.stem}.rtf"
        rtf_code = self.rtf_encode()
        rtf_path.write_text(rtf_code, encoding="utf-8")

        with tempfile.TemporaryDirectory() as convert_tmpdir:
            converted = converter.convert(
                input_files=rtf_path,
                output_dir=Path(convert_tmpdir),
                format="pdf",
                overwrite=True,
            )
            # A single input file must yield exactly one output path.
            if not isinstance(converted, Path):
                raise TypeError(
                    "LibreOffice conversion returned an unexpected output for a "
                    "single input file; expected `Path`, got object of type "
                    f"{type(converted)!r} with value {converted!r}."
                )
            pdf_path = converted
            # Move out of the temporary directory before it is cleaned up.
            shutil.move(str(pdf_path), target_path)

    print(target_path)
48 changes: 44 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Shared optional dependency checks for pytest."""

import tempfile
from pathlib import Path

import pytest

from rtflite.convert import LibreOfficeConverter
Expand All @@ -15,19 +18,51 @@ def has_python_docx() -> bool:
return True


def has_pypdf() -> bool:
    """Return True when pypdf is installed.

    The check attempts the import itself, so it reports False whenever
    importing `pypdf` raises `ImportError`.
    """
    try:
        import pypdf  # noqa: F401
    except ImportError:
        return False
    return True


def has_libreoffice() -> bool:
    """Return True when LibreOffice is available and can convert documents.

    Two checks are performed: constructing a `LibreOfficeConverter`, and a
    smoke conversion of a tiny RTF document to PDF in a temporary directory.
    Any failure in either step yields False.
    """
    try:
        converter = LibreOfficeConverter()
    except (FileNotFoundError, RuntimeError):
        return False

    # LibreOffice can report a valid version while still failing in headless
    # conversion mode (for example due to sandboxing).
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            rtf_path = tmp_path / "rtflite_smoke.rtf"
            rtf_path.write_text(
                r"{\rtf1\ansi\deff0\fs24 rtflite}",
                encoding="utf-8",
            )
            converter.convert(
                input_files=rtf_path,
                output_dir=tmp_path,
                format="pdf",
                overwrite=True,
            )
    except Exception:
        # Deliberately broad: this is a best-effort availability probe, and
        # any conversion failure means integration tests should be skipped.
        return False
    return True


# Availability flags are computed once, when this module is imported, and are
# paired with the skip reasons shown by pytest.
PYTHON_DOCX_INSTALLED = has_python_docx()
PYTHON_DOCX_REASON = "python-docx is required for DOCX assembly tests"
PYPDF_INSTALLED = has_pypdf()
PYPDF_REASON = "pypdf is required for PDF content extraction tests"
LIBREOFFICE_INSTALLED = has_libreoffice()
LIBREOFFICE_REASON = (
    f"LibreOffice (>= {MIN_VERSION}) not found on system or cannot convert files"
)

skip_if_no_python_docx = pytest.mark.skipif(
not PYTHON_DOCX_INSTALLED,
Expand All @@ -43,3 +78,8 @@ def has_libreoffice() -> bool:
not (LIBREOFFICE_INSTALLED and PYTHON_DOCX_INSTALLED),
reason=(f"LibreOffice (>= {MIN_VERSION}) and python-docx are required"),
)

# Skip marker for tests that need both LibreOffice and pypdf.
skip_if_no_libreoffice_and_pypdf = pytest.mark.skipif(
    not LIBREOFFICE_INSTALLED or not PYPDF_INSTALLED,
    reason=f"LibreOffice (>= {MIN_VERSION}) and pypdf are required",
)
56 changes: 0 additions & 56 deletions tests/test_write_docx.py

This file was deleted.

Loading