diff --git a/.gitignore b/.gitignore index 097ef70..1cfc656 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,5 @@ Desktop.ini fusion_result.json kernel_meta/ + +hf_model diff --git a/demo_page.py b/demo_page.py index 4c2d457..926de8e 100644 --- a/demo_page.py +++ b/demo_page.py @@ -6,6 +6,7 @@ import argparse import glob import os +import sys import cv2 import torch @@ -111,7 +112,7 @@ def chat(self, prompt, image): return results -def process_document(document_path, model, save_dir, max_batch_size=None): +def process_document(document_path, model, save_dir, max_batch_size=None, processed_images_dir=None): """Parse documents with two stages - Handles both images and PDFs""" file_ext = os.path.splitext(document_path)[1].lower() @@ -133,7 +134,8 @@ def process_document(document_path, model, save_dir, max_batch_size=None): # Process this page (don't save individual page results) json_path, recognition_results = process_single_image( - pil_image, model, save_dir, page_name, max_batch_size, save_individual=False + pil_image, model, save_dir, page_name, max_batch_size, save_individual=False, + processed_images_dir=processed_images_dir ) # Add page information to results @@ -155,7 +157,7 @@ def process_document(document_path, model, save_dir, max_batch_size=None): return process_single_image(pil_image, model, save_dir, base_name, max_batch_size) -def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True): +def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True, processed_images_dir=None): """Process a single image (either from file or converted from PDF page) Args: @@ -173,7 +175,25 @@ def process_single_image(image, model, save_dir, image_name, max_batch_size=None layout_output = model.chat("Parse the reading order of this document.", image) # Stage 2: Element-level content parsing - padded_image, dims = prepare_image(image) + + # Extract PDF name and page number for organized image saving + pdf_name = None + page_number = None + + # Check if this is a PDF page (format: "pdfname_page_001") + if "_page_" in image_name: + parts = image_name.split("_page_") + if len(parts) == 2: + pdf_name = parts[0] + try: + page_number = int(parts[1]) + except ValueError: + page_number = None + else: + # For single images, use the image name as pdf_name + pdf_name = image_name + + padded_image, dims = prepare_image(image, pdf_name=pdf_name, page_number=page_number, processed_images_dir=processed_images_dir) recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name) # Save outputs only if requested (skip for PDF pages) @@ -215,7 +235,8 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size, "label": label, "text": f"![Figure](figures/{figure_filename})", "figure_path": f"figures/{figure_filename}", - "bbox": [orig_x1, orig_y1, orig_x2, orig_y2], + "bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates + "padded_bbox": [x1, y1, x2, y2], # Padded image coordinates "reading_order": reading_order, }) else: @@ -223,7 +244,8 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size, element_info = { "crop": pil_crop, "label": label, - "bbox": [orig_x1, orig_y1, orig_x2, orig_y2], + "bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates + "padded_bbox": [x1, y1, x2, y2], # Padded image coordinates "reading_order": reading_order, } @@ -291,6 +313,7 @@ def process_element_batch(elements, model, prompt, max_batch_size=None): results.append({ "label": elem["label"], "bbox": elem["bbox"], + "padded_bbox": elem["padded_bbox"], # Padded coordinates "text": result.strip(), "reading_order": elem["reading_order"], }) @@ -314,8 +337,31 @@ def main(): default=16, help="Maximum number of document elements to parse in a single batch (default: 16)", ) + parser.add_argument( + "--processed_images_dir", + type=str, + default=None, + help="Directory to save processed images (default: from config or './processed_images_by_dolphin')", + ) args = parser.parse_args() + # Determine processed_images_dir with fallback logic + processed_images_dir = args.processed_images_dir + if processed_images_dir is None: + # Try to get from config file + try: + # Add parent directory to path to access config + parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, parent_dir) + from config.config import PROCESSED_IMAGES_DIR + processed_images_dir = PROCESSED_IMAGES_DIR + except (ImportError, AttributeError): + processed_images_dir = None + + # If not in config, use hardcoded default + if processed_images_dir is None: + processed_images_dir = "./processed_images_by_dolphin" + # Load Model model = DOLPHIN(args.model_path) @@ -358,6 +404,7 @@ def main(): model=model, save_dir=save_dir, max_batch_size=args.max_batch_size, + processed_images_dir=processed_images_dir ) print(f"Processing completed. Results saved to {save_dir}") @@ -368,4 +415,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py index 07e39e1..d2a6472 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -7,6 +7,8 @@ import json import os import re +import time +import uuid from dataclasses import dataclass from typing import List, Tuple @@ -297,7 +299,7 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100] -def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]: +def prepare_image(image, pdf_name=None, page_number=None, processed_images_dir=None) -> Tuple[np.ndarray, ImageDimensions]: """Load and prepare image with padding while maintaining aspect ratio Args: @@ -321,6 +323,38 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]: # Apply padding padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0)) + # Save the processed padded image with organized filename + try: + if processed_images_dir is None: + processed_images_dir = "./processed_images" + # Create PDF-specific subdirectory if pdf_name is provided + if pdf_name: + pdf_dir = os.path.join(processed_images_dir, pdf_name) + os.makedirs(pdf_dir, exist_ok=True) + + # Generate organized filename + if page_number is not None: + unique_filename = f"page-{page_number}.png" + else: + unique_filename = f"{pdf_name}.png" + + processed_image_path = os.path.join(pdf_dir, unique_filename) + else: + # Fallback to original naming scheme for non-PDF files + os.makedirs(processed_images_dir, exist_ok=True) + timestamp = int(time.time() * 1000) # milliseconds since epoch + unique_id = str(uuid.uuid4())[:8] # first 8 characters of UUID + unique_filename = f"processed_{timestamp}_{unique_id}.png" + processed_image_path = os.path.join(processed_images_dir, unique_filename) + + cv2.imwrite(processed_image_path, padded_image) + print(f"✓ Saved processed padded image: {unique_filename}") + + except Exception as save_error: + # Don't let saving errors affect the main functionality + print(f"Warning: Could not save processed image: {str(save_error)}") + + padded_h, padded_w = padded_image.shape[:2] dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h) @@ -603,4 +637,4 @@ def assign_colors_to_elements(num_elements): color_idx = i % len(palette) colors.append(palette[color_idx]) - return colors + return colors \ No newline at end of file