feat: utility script to process large PDFs through the API by script (#3591)

cragwolfe · web-flow · commit e9690b2738d0 · 2024-09-10T11:40:35.000-07:00
Adds the bash script `process-pdf-parallel-through-api.sh`, which splits
a PDF into smaller parts (splits) that are processed through the API
concurrently. The script is re-entrant: if any of the splits fail to
process, one can attempt reprocessing just those split(s) by rerunning
the script.

Note: requires the `qpdf` command line utility.

The below command line output shows the scenario where just one split
had to be reprocessed through the API to create the final
`layout-parser-paper_combined.json` output.

```
$ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \
  ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf
>   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-pars\
er-paper_pages_1_to_6.json as it already exists.
Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists.
Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json
Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json
```

Bonus change to `unstructured-get-json.sh` to point to the standard
hosted Serverless API, but allow using the Free API with --freemium.
diff --git a/scripts/user/process-pdf-parallel-through-api.sh b/scripts/user/process-pdf-parallel-through-api.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+
+# Usage: ./process-pdf-parallel-through-api.sh filename.pdf
+
+set -eu -o pipefail
+
+if [ $# -ne 1 ]; then
+  echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
+  echo
+  echo "Usage: $0 <pdf_filename>"
+  echo "Please provide a PDF filename as the first argument."
+  echo
+  echo "Optionally, set the following env vars: "
+  echo
+  echo "* STRATEGY (default hi_res)"
+  echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
+  echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
+  echo
+  echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
+  exit 1
+fi
+
+ALLOWED_STRATEGIES=("hi_res" "fast" "auto")
+
+# Validate STRATEGY environment variable if it's set
+if [ -n "${STRATEGY:-}" ] && [[ ! " ${ALLOWED_STRATEGIES[*]} " =~ ${STRATEGY} ]]; then
+  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
+  exit 1
+fi
+
+# Check if UNST_API_KEY is set
+if [ -z "${UNST_API_KEY}" ]; then
+  echo "Error: UNST_API_KEY is not set or is empty" >&2
+  exit 1
+fi
+
+PDF_FILE="$1"
+DEFAULT_SPLIT_SIZE=10
+SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
+PDF_NAME=$(basename "$PDF_FILE" .pdf)
+DEFAULT_DIR="$HOME/tmp/pdf-splits"
+PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
+MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
+PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
+PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Check if PDF parts directory exists
+if [ ! -d "$PDF_DIR" ]; then
+  "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
+fi
+
+# Create output directory if it does not exist
+mkdir -p "$PDF_OUTPUT_DIR"
+
+incomplete=0 # Flag to track incomplete processing
+
+# Function to process a single PDF part file
+process_file_part() {
+  local file="$1"
+  local STARTING_PAGE_NUMBER="$2"
+  local OUTPUT_JSON="$3"
+
+  if [ -f "$OUTPUT_JSON" ]; then
+    echo "Skipping processing for $OUTPUT_JSON as it already exists."
+    return
+  fi
+
+  curl -q -X POST https://api.unstructuredapp.io/general/v0/general \
+    -H "unstructured-api-key: $UNST_API_KEY" \
+    -H 'accept: application/json' \
+    -H 'Content-Type: multipart/form-data' \
+    -F strategy="${STRATEGY:-hi_res}" \
+    -F 'skip_infer_table_types="[]"' \
+    -F starting_page_number="$STARTING_PAGE_NUMBER" \
+    -F files=@"$file;filename=$PDF_FILE" \
+    -o "$OUTPUT_JSON"
+
+  # Verify JSON content
+  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$OUTPUT_JSON" >/dev/null; then
+    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
+    cat "$OUTPUT_JSON"
+    rm "$OUTPUT_JSON"
+    incomplete=1
+  else
+    echo "Valid JSON output created: $OUTPUT_JSON"
+  fi
+}
+
+# Function to process a batch of files
+process_batch() {
+  for file in "$@"; do
+    local START_PAGE
+    START_PAGE=$(echo "$file" | sed -n 's/.*_pages_\([0-9]*\)_to_[0-9]*.pdf/\1/p')
+    local END_PAGE=
+    END_PAGE=$(echo "$file" | sed -n 's/.*_pages_[0-9]*_to_\([0-9]*\).pdf/\1/p')
+    local OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
+    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
+  done
+  wait
+}
+
+# Read PDF parts into an array
+mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)
+
+# Process PDF parts in batches of 30, by default
+batch_size=${BATCH_SIZE:-30}
+for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
+  process_batch "${pdf_parts[@]:i:batch_size}"
+done
+
+# Determine the output filename based on whether processing was incomplete
+if [ "$incomplete" -eq 1 ]; then
+  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
+  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
+  echo "to attempt reprocessing those (failed to process) parts."
+else
+  combined_output_filename="${PDF_NAME}_combined.json"
+fi
+
+# Combine JSON outputs in numerical order
+find "$PDF_OUTPUT_DIR" -name '*.json' -print0 | sort -zV | xargs -0 jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"
+
+echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"
diff --git a/scripts/user/split-pdf.sh b/scripts/user/split-pdf.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+# Usage: ./split_pdf.sh filename.pdf
+
+set -e
+
+PDF_FILE="$1"
+DEFAULT_SPLIT_SIZE=5
+SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
+
+# Validate that SPLIT_SIZE is an integer
+if ! [[ "$SPLIT_SIZE" =~ ^[0-9]+$ ]]; then
+  echo "Error: PDF_SPLIT_PAGE_SIZE must be an integer."
+  exit 1
+fi
+
+DEFAULT_DIR="$HOME/tmp/pdf-splits"
+PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
+PDF_NAME=$(basename "$PDF_FILE" .pdf)
+MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
+PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
+
+# Create directory if it does not exist
+mkdir -p "$PDF_DIR"
+
+# Total number of pages
+TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")
+
+# Split PDF into $SPLIT_SIZE-page chunks
+START_PAGE=1
+while [ "$START_PAGE" -le "$TOTAL_PAGES" ]; do
+  END_PAGE=$((START_PAGE + SPLIT_SIZE - 1))
+  if [ "$END_PAGE" -gt "$TOTAL_PAGES" ]; then
+    END_PAGE=$TOTAL_PAGES
+  fi
+  OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"
+  qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE"
+  echo "Created $OUTPUT_FILE"
+  START_PAGE=$((END_PAGE + 1))
+done
+
+echo "All parts have been saved to $PDF_DIR"
diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh
@@ -12,6 +12,7 @@ USAGE_MESSAGE="Usage: $0 [options] <file>"'
 
 Options:
   --api-key KEY   Specify the API key for authentication. Set the env var $UNST_API_KEY to skip providing this option.
+  --freemium      Use the free API rather than the paid API
   --hi-res        hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
   --fast          fast strategy: No OCR, just extract embedded text
   --ocr-only      ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
@@ -22,9 +23,13 @@ Options:
   --s3            Write the resulting output to s3 (like a pastebin)
   --help          Display this help and exit.
 
+
 Arguments:
   <file>          File to send to the API.
 
+If running against an API instance other than the hosted Unstructured paid API (or --freemium),
+set the environment variable UNST_API_ENDPOINT.
+
 The script requires a <file>, the document to post to the Unstructured API.
 The .json result is written to ~/tmp/unst-outputs/ -- this path is echoed and copied to your clipboard.
 '
@@ -35,7 +40,6 @@ if [ "$#" -eq 0 ]; then
 fi
 
 API_KEY=${UNST_API_KEY:-""}
-API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructured.io/general/v0/general"}
 TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads"
 TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs"
 # only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/
@@ -62,6 +66,7 @@ STRATEGY=""
 VERBOSE=false
 TRACE=false
 COORDINATES=false
+FREEMIUM=false
 TABLES=true
 S3=""
 
@@ -99,6 +104,10 @@ while [[ "$#" -gt 0 ]]; do
     COORDINATES=true
     shift
     ;;
+  --freemium)
+    FREEMIUM=true
+    shift
+    ;;
   --api-key)
     if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
       API_KEY=$2
@@ -139,6 +148,12 @@ else
   INPUT_FILEPATH=${INPUT}
 fi
 
+if $FREEMIUM; then
+  API_ENDPOINT="https://api.unstructured.io/general/v0/general"
+else
+  API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructuredapp.io/general/v0/general"}
+fi
+
 if $HI_RES; then
   if $VERBOSE; then echo "Sending API request with hi_res strategy"; fi
   STRATEGY="-hi-res"