#!/usr/bin/env bash

# Usage: ./process-pdf-parallel-through-api.sh filename.pdf

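# Abort on errors, unset variables, and failures in any pipeline stage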
set -eu -o pipefail

if [ $# -ne 1 ]; then
  echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
  echo
  echo "Usage: $0 <pdf_filename>"
  echo "Please provide a PDF filename as the first argument."
  echo
  echo "Optionally, set the following env vars:"
  echo
  echo "* STRATEGY (default hi_res)"
  echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
  echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
  echo
  echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
  exit 1
fi

ALLOWED_STRATEGIES=("hi_res" "fast" "auto")

# Validate STRATEGY environment variable if it's set. The quoted, space-padded
# match requires an exact strategy name, so e.g. "res" is rejected.
if [ -n "${STRATEGY:-}" ] && [[ ! " ${ALLOWED_STRATEGIES[*]} " =~ " ${STRATEGY} " ]]; then
  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
  exit 1
fi

# Apply the documented default so later expansions are safe under set -u
STRATEGY="${STRATEGY:-hi_res}"

# Check if UNST_API_KEY is set (the default expansion keeps set -u from
# aborting before we can print a helpful error)
if [ -z "${UNST_API_KEY:-}" ]; then
  echo "Error: UNST_API_KEY is not set or is empty" >&2
  exit 1
fi

PDF_FILE="$1"
DEFAULT_SPLIT_SIZE=10
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
PDF_NAME=$(basename "$PDF_FILE" .pdf)
DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
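# The checksum in the directory names keys splits and outputs to the file's
# contents, so rerunning the script reuses work that already succeeded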
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Check if PDF parts directory exists
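# If not, generate the splits. The companion split-pdf.sh script is assumed
# to honor the same PDF_SPLITS_DIR and PDF_SPLIT_PAGE_SIZE env vars, so that
# the directory layout expected here matches its output.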
if [ ! -d "$PDF_DIR" ]; then
  "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
fi

# Create output directory if it does not exist
mkdir -p "$PDF_OUTPUT_DIR"

# Function to process a single PDF part file
process_file_part() {
  local file="$1"
  local STARTING_PAGE_NUMBER="$2"
  local OUTPUT_JSON="$3"

  if [ -f "$OUTPUT_JSON" ]; then
    echo "Skipping processing for $OUTPUT_JSON as it already exists."
    return
  fi

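  # Send this split to the hosted Unstructured partition endpoint; on success
  # the response body is a JSON array of element objects for these pages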
  curl -q -X POST https://api.unstructuredapp.io/general/v0/general \
    -H "unstructured-api-key: $UNST_API_KEY" \
    -H 'accept: application/json' \
    -H 'Content-Type: multipart/form-data' \
    -F strategy="$STRATEGY" \
    -F 'skip_infer_table_types="[]"' \
    -F starting_page_number="$STARTING_PAGE_NUMBER" \
    -F files=@"$file;filename=$PDF_FILE" \
    -o "$OUTPUT_JSON"

  # Verify the output is a JSON array of element objects; anything else
  # (an API error object, an HTML error page) is deleted so a rerun retries it
  if ! jq -e 'type=="array" and all(.[]; type=="object")' "$OUTPUT_JSON" >/dev/null; then
    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
    cat "$OUTPUT_JSON"
    rm "$OUTPUT_JSON"
  else
    echo "Valid JSON output created: $OUTPUT_JSON"
  fi
}

# Function to process a batch of files
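# Each file in the batch is dispatched as a background job, and `wait` blocks
# until the whole batch finishes before the next batch starts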
process_batch() {
  for file in "$@"; do
    local START_PAGE
    START_PAGE=$(echo "$file" | sed -n 's/.*_pages_\([0-9]*\)_to_[0-9]*\.pdf/\1/p')
    local END_PAGE
    END_PAGE=$(echo "$file" | sed -n 's/.*_pages_[0-9]*_to_\([0-9]*\)\.pdf/\1/p')
    local OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
  done
  wait
}

# Read PDF parts into an array
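# (find's output order doesn't matter here: parts are processed independently
# and the combined output is sorted into page order at the end)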
mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)

# Process PDF parts in batches of 30, by default
batch_size=${BATCH_SIZE:-30}
for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
  process_batch "${pdf_parts[@]:i:batch_size}"
done

# Determine the output filename based on whether any parts failed. Failures
# are detected by comparing the number of per-part JSON outputs against the
# number of PDF splits: invalid outputs are deleted above, and the background
# subshells cannot set a flag in this parent shell.
output_count=$(find "$PDF_OUTPUT_DIR" -name "${PDF_NAME}_pages_*.json" | wc -l)
if [ "$output_count" -ne "${#pdf_parts[@]}" ]; then
  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
  echo "WARNING: not all JSON parts were successfully processed. You may rerun this"
  echo "script to attempt reprocessing of the failed parts."
else
  combined_output_filename="${PDF_NAME}_combined.json"
fi

# Combine per-part JSON outputs in page order. The pattern matches only the
# per-part files, so a combined file left over from a previous run is ignored.
find "$PDF_OUTPUT_DIR" -name "${PDF_NAME}_pages_*.json" -print0 | sort -zV | xargs -0 jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"
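
# Optional follow-up (example): count elements by type in the combined output:
#   jq 'group_by(.type) | map({(.[0].type): length}) | add' "$PDF_OUTPUT_DIR/$combined_output_filename"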

echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"