def split_pdf(
    self,
    input_file: FileInput,
    page_ranges: Optional[List[Dict[str, int]]] = None,
    output_paths: Optional[List[str]] = None,
) -> List[bytes]:
    """Split a PDF into multiple documents by page ranges.

    Each entry of ``page_ranges`` is sent to the ``/build`` endpoint as its own
    request and produces one output document. All arguments are validated
    before the first request, so an invalid call never leaves a partial set
    of output files behind.

    Args:
        input_file: Input PDF file.
        page_ranges: Required list of 1 to 50 page range dictionaries.
            Each dict can contain:
            - 'start': Starting page index (0-based, inclusive). 0 = first page.
            - 'end': Ending page index (0-based, exclusive).
              For example: {"start": 0, "end": 2} extracts pages 0-1
              (first two pages).
            - If 'end' is omitted, extracts from 'start' to end of document.
            Each dict is forwarded unchanged as the ``pages`` value of the part.
        output_paths: Optional list of paths to save output files.
            Must match length of page_ranges if provided.

    Returns:
        One ``bytes`` object per range, in the order of ``page_ranges``, or an
        empty list when ``output_paths`` is provided (each part is then written
        to the path at the same position instead).

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_ranges is missing or empty, holds more than 50
            entries, holds an entry that is not a dictionary, or its length
            differs from that of output_paths.

    Examples:
        # Split by custom ranges
        parts = client.split_pdf(
            "document.pdf",
            page_ranges=[
                {"start": 0, "end": 5},
                {"start": 5, "end": 10},
                {"start": 10},
            ]
        )

        # Save to specific files
        client.split_pdf(
            "document.pdf",
            page_ranges=[{"start": 0, "end": 2}, {"start": 2}],
            output_paths=["part1.pdf", "part2.pdf"]
        )
    """
    # Every range costs one API call, so the fan-out is capped.
    max_ranges = 50

    # Validate everything up front: a bad argument must be reported before any
    # upload happens or any output file is written.
    if not page_ranges:
        raise ValueError("page_ranges is required - must provide at least one range")
    if len(page_ranges) > max_ranges:
        raise ValueError(f"Maximum {max_ranges} page ranges allowed per split operation")
    if output_paths and len(output_paths) != len(page_ranges):
        raise ValueError("output_paths length must match page_ranges length")
    for i, page_range in enumerate(page_ranges):
        if not isinstance(page_range, dict):
            raise ValueError(f"Page range {i} must be a dictionary")

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    results: List[bytes] = []

    for i, page_range in enumerate(page_ranges):
        # NOTE(review): the input is prepared again for every range. If
        # input_file is a stream that can only be read once, later requests may
        # upload nothing - confirm how prepare_file_for_upload treats streams.
        file_field, file_data = prepare_file_for_upload(input_file, "file")

        # One part that selects just this range; no further actions.
        instructions = {"parts": [{"file": "file", "pages": page_range}], "actions": []}

        # Type checking: at runtime, self is NutrientClient which has _http_client
        result = self._http_client.post(  # type: ignore[attr-defined]
            "/build",
            files={file_field: file_data},
            json_data=instructions,
        )

        if output_paths:
            # Lengths were checked above, so position i always exists.
            save_file_output(result, output_paths[i])
        else:
            results.append(result)

    # Stays empty when every part was written to disk.
    return results
def duplicate_pdf_pages(
    self,
    input_file: FileInput,
    page_indexes: List[int],
    output_path: Optional[str] = None,
) -> Optional[bytes]:
    """Build a new PDF from a chosen sequence of pages of the input.

    The output contains one part per entry of ``page_indexes``, in the order
    given, so listing an index several times repeats that page and listing
    indexes out of order reorders the document.

    Args:
        input_file: Input PDF file.
        page_indexes: Page indexes to include (0-based). 0 = first page.
            An index may appear more than once. Negative indexes are passed
            through to the API (-1 for last page).
            For example: [0, 0, 1] yields first page, first page, second page.
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_indexes is empty.

    Examples:
        # First page twice, then the second page
        result = client.duplicate_pdf_pages("document.pdf", page_indexes=[0, 0, 1])

        # Reorder and save to a file
        client.duplicate_pdf_pages(
            "document.pdf",
            page_indexes=[0, 2, 1],
            output_path="reordered.pdf"
        )
    """
    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    if not page_indexes:
        raise ValueError("page_indexes cannot be empty")

    upload_field, upload_payload = prepare_file_for_upload(input_file, "file")

    # One single-page part per requested index.
    # NOTE(review): a negative index is sent with end == start, while a
    # non-negative index is sent with end == start + 1. Only one of these
    # conventions can match the API's treatment of 'end' - confirm against the
    # Build API reference.
    parts = [
        {
            "file": "file",
            "pages": {"start": index, "end": index if index < 0 else index + 1},
        }
        for index in page_indexes
    ]

    # Type checking: at runtime, self is NutrientClient which has _http_client
    result = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files={upload_field: upload_payload},
        json_data={"parts": parts, "actions": []},
    )

    if not output_path:
        return result  # type: ignore[no-any-return]

    save_file_output(result, output_path)
    return None
+ + Examples: + # Delete first and last pages (Note: negative indexes not supported) + result = client.delete_pdf_pages( + "document.pdf", + page_indexes=[0, 2] # Delete pages 1 and 3 + ) + + # Delete specific pages (2nd and 4th pages) + result = client.delete_pdf_pages( + "document.pdf", + page_indexes=[1, 3] # 0-based indexing + ) + + # Save to specific file + client.delete_pdf_pages( + "document.pdf", + page_indexes=[2, 4, 5], + output_path="pages_deleted.pdf" + ) + """ + from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output + + # Validate inputs + if not page_indexes: + raise ValueError("page_indexes cannot be empty") + + # Check for negative indexes + if any(idx < 0 for idx in page_indexes): + negative_indexes = [idx for idx in page_indexes if idx < 0] + raise ValueError( + f"Negative page indexes not yet supported for deletion: {negative_indexes}" + ) + + # Prepare file for upload + file_field, file_data = prepare_file_for_upload(input_file, "file") + files = {file_field: file_data} + + # Sort page indexes to handle ranges efficiently + sorted_indexes = sorted(set(page_indexes)) # Remove duplicates and sort + + # Build parts for pages to keep (excluding the ones to delete) + parts = [] + + # Start from page 0 + current_page = 0 + + for delete_index in sorted_indexes: + # Add range from current_page to delete_index (exclusive) + if current_page < delete_index: + parts.append( + {"file": "file", "pages": {"start": current_page, "end": delete_index}} + ) + + # Skip the deleted page + current_page = delete_index + 1 + + # Add remaining pages from current_page to end + if current_page >= 0: # Always add remaining pages + parts.append({"file": "file", "pages": {"start": current_page}}) + + # If no parts (edge case), raise error + if not parts: + raise ValueError("No valid pages to keep after deletion") + + # Build instructions for deletion (keeping non-deleted pages) + instructions = {"parts": parts, "actions": []} + + # Make API request + 
def add_page(
    self,
    input_file: FileInput,
    insert_index: int,
    page_count: int = 1,
    page_size: str = "A4",
    orientation: str = "portrait",
    output_path: Optional[str] = None,
) -> Optional[bytes]:
    """Insert blank pages into a PDF document.

    The request is assembled from parts: the original document (whole, or
    split at ``insert_index``) and one "new page" part that describes the
    blank pages.

    Args:
        input_file: Input PDF file.
        insert_index: Position to insert pages (0-based insertion index).
            0 = insert before first page (at beginning)
            1 = insert before second page (after first page)
            -1 = insert after last page (at end)
        page_count: Number of blank pages to add, from 1 to 100 (default: 1).
        page_size: Page size for new pages. Common values: "A4", "Letter",
            "Legal", "A3", "A5" (default: "A4"). Passed to the API unchecked.
        orientation: Page orientation. Either "portrait" or "landscape"
            (default: "portrait"). Passed to the API unchecked.
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_count is less than 1 or greater than 100, or if
            insert_index is a negative number other than -1.

    Examples:
        # Add a single blank page at the beginning
        result = client.add_page("document.pdf", insert_index=0)

        # Add three landscape Letter pages at the end
        result = client.add_page(
            "document.pdf",
            insert_index=-1,
            page_count=3,
            page_size="Letter",
            orientation="landscape"
        )
    """
    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    if page_count < 1:
        raise ValueError("page_count must be at least 1")
    if page_count > 100:
        raise ValueError("page_count cannot exceed 100 pages")
    if insert_index < -1:
        raise ValueError("insert_index must be -1 (for end) or a non-negative insertion index")

    upload_field, upload_payload = prepare_file_for_upload(input_file, "file")

    # Part that describes the blank pages to create.
    blank_pages: Dict[str, Any] = {
        "page": "new",
        "pageCount": page_count,
        "layout": {"size": page_size, "orientation": orientation},
    }

    parts: List[Dict[str, Any]]
    if insert_index == -1:
        # Append: whole original document, then the blank pages.
        parts = [{"file": "file"}, blank_pages]
    elif insert_index == 0:
        # Prepend: blank pages, then the whole original document.
        parts = [blank_pages, {"file": "file"}]
    else:
        # Middle: original up to the insertion point, blank pages, remainder.
        parts = [
            {"file": "file", "pages": {"start": 0, "end": insert_index}},
            blank_pages,
            {"file": "file", "pages": {"start": insert_index}},
        ]

    # Type checking: at runtime, self is NutrientClient which has _http_client
    result = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files={upload_field: upload_payload},
        json_data={"parts": parts, "actions": []},
    )

    if not output_path:
        return result  # type: ignore[no-any-return]

    save_file_output(result, output_path)
    return None
+ + Examples: + # Set labels for different page ranges + client.set_page_label( + "document.pdf", + labels=[ + {"pages": {"start": 0, "end": 3}, "label": "Introduction"}, + {"pages": {"start": 3, "end": 10}, "label": "Chapter 1"}, + {"pages": {"start": 10}, "label": "Appendix"} + ], + output_path="labeled_document.pdf" + ) + + # Set label for single page + client.set_page_label( + "document.pdf", + labels=[{"pages": {"start": 0, "end": 1}, "label": "Cover Page"}] + ) + """ + from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output + + # Validate inputs + if not labels: + raise ValueError("labels list cannot be empty") + + # Normalize labels to ensure proper format + normalized_labels = [] + for i, label_config in enumerate(labels): + if not isinstance(label_config, dict): + raise ValueError(f"Label configuration {i} must be a dictionary") + + if "pages" not in label_config: + raise ValueError(f"Label configuration {i} missing required 'pages' key") + + if "label" not in label_config: + raise ValueError(f"Label configuration {i} missing required 'label' key") + + pages = label_config["pages"] + if not isinstance(pages, dict) or "start" not in pages: + raise ValueError(f"Label configuration {i} 'pages' must be a dict with 'start' key") + + # Normalize pages to ensure 'end' is present + normalized_pages = {"start": pages["start"]} + if "end" in pages: + normalized_pages["end"] = pages["end"] + else: + # If no end is specified, use -1 to indicate "to end of document" + normalized_pages["end"] = -1 + + normalized_labels.append({"pages": normalized_pages, "label": label_config["label"]}) + + # Prepare file for upload + file_field, file_data = prepare_file_for_upload(input_file, "file") + files = {file_field: file_data} + + # Build instructions with page labels in output configuration + instructions = { + "parts": [{"file": "file"}], + "actions": [], + "output": {"labels": normalized_labels}, + } + + # Make API request + # Type checking: at runtime, 
def assert_is_pdf(file_path_or_bytes: Union[str, bytes]) -> None:
    """Assert that a file on disk or an in-memory buffer starts like a PDF.

    Only the first eight bytes are inspected; they must begin with the
    ``%PDF-`` marker.

    Args:
        file_path_or_bytes: Path of the file to check (``str``) or the raw
            content to check (``bytes``).

    Raises:
        AssertionError: If the content does not start with ``%PDF-``.
        ValueError: If the argument is neither ``str`` nor ``bytes``.
    """
    if isinstance(file_path_or_bytes, bytes):
        content = file_path_or_bytes[:8]
    elif isinstance(file_path_or_bytes, str):
        # A string is always treated as a path; read just the header bytes.
        with open(file_path_or_bytes, "rb") as pdf_file:
            content = pdf_file.read(8)
    else:
        raise ValueError("Input must be file path string or bytes")

    # Check PDF magic number
    assert content.startswith(b"%PDF-"), (
        f"File does not start with PDF magic number, got: {content!r}"
    )
convert_to_pdf method with PDF input (should pass through).""" + result = client.convert_to_pdf(sample_pdf_path) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + # Tests for flatten_annotations + def test_flatten_annotations_integration(self, client, sample_pdf_path): + """Test flatten_annotations method with live API.""" + result = client.flatten_annotations(sample_pdf_path) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_flatten_annotations_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test flatten_annotations method saving to output file.""" + output_path = str(tmp_path / "flattened.pdf") + + result = client.flatten_annotations(sample_pdf_path, output_path=output_path) + + assert result is None + assert (tmp_path / "flattened.pdf").exists() + assert_is_pdf(output_path) + + # Tests for rotate_pages + def test_rotate_pages_integration(self, client, sample_pdf_path): + """Test rotate_pages method with live API.""" + result = client.rotate_pages(sample_pdf_path, degrees=90) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_rotate_pages_specific_pages(self, client, sample_pdf_path): + """Test rotate_pages method with specific page indexes.""" + result = client.rotate_pages(sample_pdf_path, degrees=180, page_indexes=[0]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_rotate_pages_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test rotate_pages method saving to output file.""" + output_path = str(tmp_path / "rotated.pdf") + + result = client.rotate_pages(sample_pdf_path, degrees=270, output_path=output_path) + + assert result is None + assert (tmp_path / "rotated.pdf").exists() + assert_is_pdf(output_path) + + # Tests for ocr_pdf + def test_ocr_pdf_integration(self, client, sample_pdf_path): + """Test ocr_pdf method with live API.""" + result = 
client.ocr_pdf(sample_pdf_path, language="english") + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_ocr_pdf_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test ocr_pdf method saving to output file.""" + output_path = str(tmp_path / "ocr.pdf") + + result = client.ocr_pdf(sample_pdf_path, language="english", output_path=output_path) + + assert result is None + assert (tmp_path / "ocr.pdf").exists() + assert_is_pdf(output_path) + + # Tests for watermark_pdf + def test_watermark_pdf_text_integration(self, client, sample_pdf_path): + """Test watermark_pdf method with text watermark.""" + result = client.watermark_pdf( + sample_pdf_path, text="DRAFT", width=200, height=100, opacity=0.5 + ) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_watermark_pdf_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test watermark_pdf method saving to output file.""" + output_path = str(tmp_path / "watermarked.pdf") + + result = client.watermark_pdf( + sample_pdf_path, + text="CONFIDENTIAL", + width=150, + height=75, + position="top-right", + output_path=output_path, + ) + + assert result is None + assert (tmp_path / "watermarked.pdf").exists() + assert_is_pdf(output_path) + + # Tests for apply_redactions + def test_apply_redactions_integration(self, client, sample_pdf_path): + """Test apply_redactions method with live API.""" + result = client.apply_redactions(sample_pdf_path) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_apply_redactions_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test apply_redactions method saving to output file.""" + output_path = str(tmp_path / "redacted.pdf") + + result = client.apply_redactions(sample_pdf_path, output_path=output_path) + + assert result is None + assert (tmp_path / "redacted.pdf").exists() + assert_is_pdf(output_path) + + # Tests for merge_pdfs 
+ def test_merge_pdfs_integration(self, client, sample_pdf_path, tmp_path): + """Test merge_pdfs method with live API.""" + # Create a second PDF by copying the sample + second_pdf_path = str(tmp_path / "second.pdf") + import shutil + + shutil.copy2(sample_pdf_path, second_pdf_path) + + result = client.merge_pdfs([sample_pdf_path, second_pdf_path]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_merge_pdfs_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test merge_pdfs method saving to output file.""" + # Create a second PDF by copying the sample + second_pdf_path = str(tmp_path / "second.pdf") + output_path = str(tmp_path / "merged.pdf") + import shutil + + shutil.copy2(sample_pdf_path, second_pdf_path) + + result = client.merge_pdfs([sample_pdf_path, second_pdf_path], output_path=output_path) + + assert result is None + assert (tmp_path / "merged.pdf").exists() + assert_is_pdf(output_path) + + def test_merge_pdfs_error_single_file(self, client, sample_pdf_path): + """Test merge_pdfs method with single file raises error.""" + with pytest.raises(ValueError, match="At least 2 files required"): + client.merge_pdfs([sample_pdf_path]) + + # Tests for split_pdf + def test_split_pdf_integration(self, client, sample_pdf_path, tmp_path): + """Test split_pdf method with live API.""" + # Test splitting PDF into two parts - sample PDF should have multiple pages + page_ranges = [ + {"start": 0, "end": 1}, # First page + {"start": 1}, # Remaining pages + ] + + # Test getting bytes back + result = client.split_pdf(sample_pdf_path, page_ranges=page_ranges) + + assert isinstance(result, list) + assert len(result) == 2 # Should return exactly 2 parts + assert all(isinstance(pdf_bytes, bytes) for pdf_bytes in result) + assert all(len(pdf_bytes) > 0 for pdf_bytes in result) + + # Verify both results are valid PDFs + for pdf_bytes in result: + assert_is_pdf(pdf_bytes) + + def test_split_pdf_with_output_files(self, 
client, sample_pdf_path, tmp_path): + """Test split_pdf method saving to output files.""" + output_paths = [str(tmp_path / "page1.pdf"), str(tmp_path / "remaining.pdf")] + + page_ranges = [ + {"start": 0, "end": 1}, # First page + {"start": 1}, # Remaining pages + ] + + # Test saving to files + result = client.split_pdf( + sample_pdf_path, page_ranges=page_ranges, output_paths=output_paths + ) + + # Should return empty list when saving to files + assert result == [] + + # Check that output files were created + assert (tmp_path / "page1.pdf").exists() + assert (tmp_path / "page1.pdf").stat().st_size > 0 + assert_is_pdf(str(tmp_path / "page1.pdf")) + + # Second file should exist since sample PDF has multiple pages + assert (tmp_path / "remaining.pdf").exists() + assert (tmp_path / "remaining.pdf").stat().st_size > 0 + assert_is_pdf(str(tmp_path / "remaining.pdf")) + + def test_split_pdf_no_ranges_error(self, client, sample_pdf_path): + """Test split_pdf with no ranges raises error.""" + # Test that page_ranges is required + with pytest.raises(ValueError, match="page_ranges is required"): + client.split_pdf(sample_pdf_path) + + def test_split_pdf_output_paths_length_mismatch_error(self, client, sample_pdf_path): + """Test split_pdf method with mismatched output_paths and page_ranges lengths.""" + page_ranges = [{"start": 0, "end": 1}, {"start": 1}] + output_paths = ["page1.pdf"] # Only one path for two ranges + + with pytest.raises(ValueError, match="output_paths length must match page_ranges length"): + client.split_pdf(sample_pdf_path, page_ranges=page_ranges, output_paths=output_paths) + + def test_split_pdf_too_many_ranges_error(self, client, sample_pdf_path): + """Test split_pdf method with too many ranges raises error.""" + # Create 51 ranges (exceeds the 50 limit) + page_ranges = [{"start": i, "end": i + 1} for i in range(51)] + + with pytest.raises(ValueError, match="Maximum 50 page ranges allowed"): + client.split_pdf(sample_pdf_path, page_ranges=page_ranges) 
+ + # Tests for duplicate_pdf_pages + def test_duplicate_pdf_pages_basic(self, client, sample_pdf_path): + """Test duplicate_pdf_pages method with basic duplication.""" + # Test duplicating first page twice + result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[0, 0]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_duplicate_pdf_pages_reorder(self, client, sample_pdf_path): + """Test duplicate_pdf_pages method with page reordering.""" + # Test reordering pages (assumes sample PDF has at least 2 pages) + result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[1, 0]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_duplicate_pdf_pages_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test duplicate_pdf_pages method saving to output file.""" + output_path = str(tmp_path / "duplicated.pdf") + + # Test duplicating and saving to file + result = client.duplicate_pdf_pages( + sample_pdf_path, page_indexes=[0, 0, 1], output_path=output_path + ) + + # Should return None when saving to file + assert result is None + + # Check that output file was created + assert (tmp_path / "duplicated.pdf").exists() + assert (tmp_path / "duplicated.pdf").stat().st_size > 0 + assert_is_pdf(output_path) + + def test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path): + """Test duplicate_pdf_pages method with negative indexes.""" + # Test using negative indexes (last page) + result = client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[-1, 0, -1]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): + """Test duplicate_pdf_pages method with empty page_indexes raises error.""" + with pytest.raises(ValueError, match="page_indexes cannot be empty"): + client.duplicate_pdf_pages(sample_pdf_path, page_indexes=[]) + + # 
Tests for delete_pdf_pages + def test_delete_pdf_pages_basic(self, client, sample_pdf_path): + """Test delete_pdf_pages method with basic page deletion.""" + # Test deleting first page (assuming sample PDF has at least 2 pages) + result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_delete_pdf_pages_multiple(self, client, sample_pdf_path): + """Test delete_pdf_pages method with multiple page deletion.""" + # Test deleting multiple pages + result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0, 2]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_delete_pdf_pages_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test delete_pdf_pages method saving to output file.""" + output_path = str(tmp_path / "pages_deleted.pdf") + + # Test deleting pages and saving to file + result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[1], output_path=output_path) + + # Should return None when saving to file + assert result is None + + # Check that output file was created + assert (tmp_path / "pages_deleted.pdf").exists() + assert (tmp_path / "pages_deleted.pdf").stat().st_size > 0 + assert_is_pdf(output_path) + + def test_delete_pdf_pages_negative_indexes_error(self, client, sample_pdf_path): + """Test delete_pdf_pages method with negative indexes raises error.""" + # Currently negative indexes are not supported for deletion + with pytest.raises(ValueError, match="Negative page indexes not yet supported"): + client.delete_pdf_pages(sample_pdf_path, page_indexes=[-1]) + + def test_delete_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): + """Test delete_pdf_pages method with empty page_indexes raises error.""" + with pytest.raises(ValueError, match="page_indexes cannot be empty"): + client.delete_pdf_pages(sample_pdf_path, page_indexes=[]) + + def 
test_delete_pdf_pages_duplicate_indexes(self, client, sample_pdf_path): + """Test delete_pdf_pages method with duplicate page indexes.""" + # Test that duplicate indexes are handled correctly (should remove duplicates) + result = client.delete_pdf_pages(sample_pdf_path, page_indexes=[0, 0, 1]) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + # Tests for add_page + def test_add_page_at_beginning(self, client, sample_pdf_path): + """Test add_page method inserting at the beginning.""" + # Test inserting at beginning (insert_index=0) + result = client.add_page(sample_pdf_path, insert_index=0) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_multiple_pages(self, client, sample_pdf_path): + """Test add_page method with multiple pages.""" + # Test adding multiple blank pages before second page + result = client.add_page(sample_pdf_path, insert_index=1, page_count=3) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_at_end(self, client, sample_pdf_path): + """Test add_page method inserting at the end.""" + # Test inserting at end using -1 + result = client.add_page(sample_pdf_path, insert_index=-1, page_count=2) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_before_specific_page(self, client, sample_pdf_path): + """Test add_page method inserting before a specific page.""" + # Test inserting before page 3 (insert_index=2) + result = client.add_page(sample_pdf_path, insert_index=2, page_count=1) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_custom_size_orientation(self, client, sample_pdf_path): + """Test add_page method with custom page size and orientation.""" + # Test adding Letter-sized landscape pages at beginning + result = client.add_page( + sample_pdf_path, + insert_index=0, + 
page_size="Letter", + orientation="landscape", + page_count=2, + ) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_with_output_file(self, client, sample_pdf_path, tmp_path): + """Test add_page method saving to output file.""" + output_path = str(tmp_path / "with_blank_pages.pdf") + + # Test adding pages and saving to file + result = client.add_page( + sample_pdf_path, insert_index=1, page_count=2, output_path=output_path + ) + + # Should return None when saving to file + assert result is None + + # Check that output file was created + assert (tmp_path / "with_blank_pages.pdf").exists() + assert (tmp_path / "with_blank_pages.pdf").stat().st_size > 0 + assert_is_pdf(output_path) + + def test_add_page_different_page_sizes(self, client, sample_pdf_path): + """Test add_page method with different page sizes.""" + # Test various page sizes + page_sizes = ["A4", "Letter", "Legal", "A3", "A5"] + + for page_size in page_sizes: + result = client.add_page(sample_pdf_path, insert_index=0, page_size=page_size) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_add_page_invalid_page_count_error(self, client, sample_pdf_path): + """Test add_page method with invalid page_count raises error.""" + # Test zero page count + with pytest.raises(ValueError, match="page_count must be at least 1"): + client.add_page(sample_pdf_path, insert_index=0, page_count=0) + + # Test negative page count + with pytest.raises(ValueError, match="page_count must be at least 1"): + client.add_page(sample_pdf_path, insert_index=0, page_count=-1) + + # Test excessive page count + with pytest.raises(ValueError, match="page_count cannot exceed 100"): + client.add_page(sample_pdf_path, insert_index=0, page_count=101) + + def test_add_page_invalid_position_error(self, client, sample_pdf_path): + """Test add_page method with invalid insert_index raises error.""" + # Test invalid negative position (anything 
below -1) + with pytest.raises(ValueError, match="insert_index must be -1"): + client.add_page(sample_pdf_path, insert_index=-2, page_count=1) + + with pytest.raises(ValueError, match="insert_index must be -1"): + client.add_page(sample_pdf_path, insert_index=-5, page_count=1) + + # Tests for set_page_label + def test_set_page_label_integration(self, client, sample_pdf_path, tmp_path): + """Test set_page_label method with live API.""" + labels = [{"pages": {"start": 0, "end": 1}, "label": "Cover"}] + + output_path = str(tmp_path / "labeled.pdf") + + # Try to set page labels + result = client.set_page_label(sample_pdf_path, labels, output_path=output_path) + + # If successful, verify output + assert result is None # Should return None when output_path provided + assert (tmp_path / "labeled.pdf").exists() + assert_is_pdf(output_path) + + def test_set_page_label_return_bytes(self, client, sample_pdf_path): + """Test set_page_label method returning bytes.""" + labels = [{"pages": {"start": 0, "end": 1}, "label": "i"}] + + # Test getting bytes back + result = client.set_page_label(sample_pdf_path, labels) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_set_page_label_multiple_ranges(self, client, sample_pdf_path): + """Test set_page_label method with multiple page ranges.""" + labels = [ + {"pages": {"start": 0, "end": 1}, "label": "i"}, + {"pages": {"start": 1, "end": 2}, "label": "intro"}, + {"pages": {"start": 2, "end": 3}, "label": "final"}, + ] + + result = client.set_page_label(sample_pdf_path, labels) + + assert isinstance(result, bytes) + assert len(result) > 0 + assert_is_pdf(result) + + def test_set_page_label_single_page(self, client, sample_pdf_path): + """Test set_page_label method with single page label.""" + labels = [{"pages": {"start": 0, "end": 1}, "label": "Cover Page"}] + + result = client.set_page_label(sample_pdf_path, labels) + + assert isinstance(result, bytes) + assert len(result) > 0 + 
assert_is_pdf(result) + + def test_set_page_label_empty_labels_error(self, client, sample_pdf_path): + """Test set_page_label method with empty labels raises error.""" + with pytest.raises(ValueError, match="labels list cannot be empty"): + client.set_page_label(sample_pdf_path, labels=[]) + + def test_set_page_label_invalid_label_config_error(self, client, sample_pdf_path): + """Test set_page_label method with invalid label configuration raises error.""" + # Missing 'pages' key + with pytest.raises(ValueError, match="missing required 'pages' key"): + client.set_page_label(sample_pdf_path, labels=[{"label": "test"}]) + + # Missing 'label' key + with pytest.raises(ValueError, match="missing required 'label' key"): + client.set_page_label(sample_pdf_path, labels=[{"pages": {"start": 0}}]) + + # Invalid pages format + with pytest.raises(ValueError, match="'pages' must be a dict with 'start' key"): + client.set_page_label(sample_pdf_path, labels=[{"pages": "invalid", "label": "test"}]) diff --git a/tests/integration/test_live_api.py b/tests/integration/test_live_api.py index af72552..cc9457b 100644 --- a/tests/integration/test_live_api.py +++ b/tests/integration/test_live_api.py @@ -3,6 +3,8 @@ These tests require a valid API key configured in integration_config.py. """ +from typing import Optional, Union + import pytest from nutrient_dws import NutrientClient @@ -10,15 +12,36 @@ try: from . import integration_config # type: ignore[attr-defined] - API_KEY = integration_config.API_KEY - BASE_URL = getattr(integration_config, "BASE_URL", None) - TIMEOUT = getattr(integration_config, "TIMEOUT", 60) + API_KEY: Optional[str] = integration_config.API_KEY + BASE_URL: Optional[str] = getattr(integration_config, "BASE_URL", None) + TIMEOUT: int = getattr(integration_config, "TIMEOUT", 60) except ImportError: API_KEY = None BASE_URL = None TIMEOUT = 60 +def assert_is_pdf(file_path_or_bytes: Union[str, bytes]) -> None: + """Assert that a file or bytes is a valid PDF. 
+ + Args: + file_path_or_bytes: Path to file or bytes content to check. + """ + if isinstance(file_path_or_bytes, (str, bytes)): + if isinstance(file_path_or_bytes, str): + with open(file_path_or_bytes, "rb") as f: + content = f.read(8) + else: + content = file_path_or_bytes[:8] + + # Check PDF magic number + assert content.startswith(b"%PDF-"), ( + f"File does not start with PDF magic number, got: {content!r}" + ) + else: + raise ValueError("Input must be file path string or bytes") + + @pytest.mark.skipif(not API_KEY, reason="No API key configured in integration_config.py") class TestLiveAPI: """Integration tests against live API."""