Skip to content

Commit e9690b2

Browse files
authored
feat: utility script to process large PDFs through the API by script (#3591)
Adds the bash script `process-pdf-parallel-through-api.sh`, which splits a PDF into smaller parts (splits) that are processed through the API concurrently. The script is re-entrant: if any of the splits fail to process, one can attempt reprocessing those split(s) by rerunning the script. Note: requires the `qpdf` command line utility. The command line output below shows the scenario where just one split had to be reprocessed through the API to create the final `layout-parser-paper_combined.json` output.
```
$ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \
    ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf
> % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_1_to_6.json as it already exists.
Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists.
Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json
Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json
```
Bonus change to `unstructured-get-json.sh` to point to the standard hosted Serverless API, but allow using the Free API with --freemium.
1 parent 71208ca commit e9690b2

File tree

3 files changed

+183
-1
lines changed

3 files changed

+183
-1
lines changed
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#!/usr/bin/env bash

# Processes a single PDF through the Unstructured API by breaking it into
# smaller page ranges ("splits") that are sent to the API concurrently.
#
# Usage: ./process-pdf-parallel-through-api.sh filename.pdf
#
# Environment:
#   UNST_API_KEY         (required) API key sent with every request
#   STRATEGY             (default hi_res) one of: hi_res, fast, auto
#   BATCH_SIZE           (default 30) number of splits processed in parallel
#   PDF_SPLIT_PAGE_SIZE  (default 10) number of pages per split
#   PDF_SPLITS_DIR       (default $HOME/tmp/pdf-splits) working directory

set -eu -o pipefail

if [ $# -ne 1 ]; then
  echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
  echo
  echo "Usage: $0 <pdf_filename>"
  echo "Please provide a PDF filename as the first argument."
  echo
  echo "Optionally, set the following env vars: "
  echo
  echo "* STRATEGY (default hi_res)"
  echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
  echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
  echo
  echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
  exit 1
fi

ALLOWED_STRATEGIES=("hi_res" "fast" "auto")

# Apply the documented default up front; every later expansion of $STRATEGY
# (including the output directory name) would otherwise abort under `set -u`.
STRATEGY="${STRATEGY:-hi_res}"

# Validate STRATEGY by exact comparison. A regex/substring test against the
# joined list would also accept fragments such as "res" or "a".
strategy_is_allowed=false
for allowed_strategy in "${ALLOWED_STRATEGIES[@]}"; do
  if [[ "$STRATEGY" == "$allowed_strategy" ]]; then
    strategy_is_allowed=true
    break
  fi
done
if ! $strategy_is_allowed; then
  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
  exit 1
fi

# Check if UNST_API_KEY is set; the `:-` keeps `set -u` from aborting before
# the readable error message can be printed.
if [ -z "${UNST_API_KEY:-}" ]; then
  echo "Error: UNST_API_KEY is not set or is empty" >&2
  exit 1
fi

PDF_FILE="$1"
if [ ! -f "$PDF_FILE" ]; then
  echo "Error: PDF file not found: $PDF_FILE" >&2
  exit 1
fi

DEFAULT_SPLIT_SIZE=10
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
PDF_NAME=$(basename "$PDF_FILE" .pdf)
DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
# The checksum keys the working directories to the file's content, so a
# different PDF with the same name never reuses stale splits.
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Check if PDF parts directory exists
if [ ! -d "$PDF_DIR" ]; then
  # Hand the resolved settings to the splitter explicitly: it has its own
  # default split size, and relying on it would put the splits in a
  # directory other than $PDF_DIR.
  PDF_SPLIT_PAGE_SIZE="$SPLIT_SIZE" PDF_SPLITS_DIR="$PDF_SPLITS_DIR" \
    "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
fi

# Create output directory if it does not exist
mkdir -p "$PDF_OUTPUT_DIR"

incomplete=0 # Flag to track incomplete processing
58+
59+
#######################################
# Sends one PDF split to the Unstructured API and validates the response.
# Globals:
#   UNST_API_KEY (read), STRATEGY (read, default hi_res), PDF_FILE (read),
#   UNST_API_ENDPOINT (read, optional override of the API URL),
#   incomplete (set to 1 on failure)
# Arguments:
#   $1 - path of the PDF split to upload
#   $2 - page number, in the original PDF, of the split's first page
#   $3 - path the JSON response is saved to
# Returns:
#   0 if $3 already exists or a valid response was saved there;
#   1 otherwise, in which case no file is left at $3.
#######################################
process_file_part() {
  local file="$1"
  local STARTING_PAGE_NUMBER="$2"
  local OUTPUT_JSON="$3"
  local api_endpoint="${UNST_API_ENDPOINT:-https://api.unstructuredapp.io/general/v0/general}"
  # The response is downloaded under a temporary name and renamed only after
  # validation, so an interrupted or failed request can never leave a file
  # that a later run would skip as "already exists".
  local partial_json="${OUTPUT_JSON}.partial"

  if [ -f "$OUTPUT_JSON" ]; then
    echo "Skipping processing for $OUTPUT_JSON as it already exists."
    return 0
  fi

  if ! curl -q -X POST "$api_endpoint" \
    -H "unstructured-api-key: $UNST_API_KEY" \
    -H 'accept: application/json' \
    -H 'Content-Type: multipart/form-data' \
    -F strategy="${STRATEGY:-hi_res}" \
    -F 'skip_infer_table_types="[]"' \
    -F starting_page_number="$STARTING_PAGE_NUMBER" \
    -F files=@"$file;filename=$PDF_FILE" \
    -o "$partial_json"; then
    echo "Request failed for $file" >&2
    rm -f -- "$partial_json"
    incomplete=1
    return 1
  fi

  # Verify JSON content: a successful response is an array of objects;
  # anything else (e.g. an error object) makes `jq -e` exit non-zero.
  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$partial_json" >/dev/null; then
    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
    if [ -f "$partial_json" ]; then
      cat -- "$partial_json"
    fi
    rm -f -- "$partial_json"
    incomplete=1
    return 1
  fi

  mv -- "$partial_json" "$OUTPUT_JSON"
  echo "Valid JSON output created: $OUTPUT_JSON"
}

#######################################
# Processes a batch of PDF splits concurrently and waits for all of them.
# Globals:
#   PDF_OUTPUT_DIR (read), PDF_NAME (read),
#   incomplete (set to 1 if any split fails)
# Arguments:
#   Paths of split files named *_pages_<start>_to_<end>.pdf
#######################################
process_batch() {
  local file pid
  local START_PAGE END_PAGE OUTPUT_JSON
  local -r split_name_re='_pages_([0-9]+)_to_([0-9]+)\.pdf$'
  local -a pids=()

  for file in "$@"; do
    if [[ ! "$file" =~ $split_name_re ]]; then
      echo "Skipping $file: name does not match *_pages_<start>_to_<end>.pdf" >&2
      incomplete=1
      continue
    fi
    START_PAGE="${BASH_REMATCH[1]}"
    END_PAGE="${BASH_REMATCH[2]}"
    OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
    pids+=("$!")
  done

  # Each background job runs in a subshell, so an assignment to `incomplete`
  # made there is invisible here. Collect every job's exit status instead and
  # record failures in this shell.
  for pid in ${pids[@]+"${pids[@]}"}; do
    wait "$pid" || incomplete=1
  done
}
103+
104+
# Read PDF parts into an array
mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)

# Without any splits there is nothing to send, and combining would only
# produce a meaningless `null` document.
if [ "${#pdf_parts[@]}" -eq 0 ]; then
  echo "Error: no PDF splits found in $PDF_DIR" >&2
  exit 1
fi

# Process PDF parts in batches of 30, by default
batch_size=${BATCH_SIZE:-30}
# The loop below advances by batch_size; zero or a non-number would make it
# spin forever or fail inside the arithmetic expression.
if ! [[ "$batch_size" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: BATCH_SIZE must be a positive integer" >&2
  exit 1
fi
for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
  process_batch "${pdf_parts[@]:i:batch_size}"
done

# Determine the output filename based on whether processing was incomplete
if [ "$incomplete" -eq 1 ]; then
  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
  echo "to attempt reprocessing those (failed to process) parts."
else
  combined_output_filename="${PDF_NAME}_combined.json"
fi

# Combine JSON outputs in numerical order. Only the per-split files are
# selected: the output directory also holds combined files written by earlier
# runs, and folding those in again would duplicate their elements.
find "$PDF_OUTPUT_DIR" -name '*_pages_*_to_*.json' ! -name '*_combined.json' -print0 \
  | sort -zV \
  | xargs -0 jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"

echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"

scripts/user/split-pdf.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env bash

# Splits a PDF into consecutive chunks of PDF_SPLIT_PAGE_SIZE pages with qpdf.
#
# Usage: ./split-pdf.sh filename.pdf
#
# Environment:
#   PDF_SPLIT_PAGE_SIZE  (default 5) number of pages per split
#   PDF_SPLITS_DIR       (default $HOME/tmp/pdf-splits) parent of the output
#
# Output files are written as
#   $PDF_SPLITS_DIR/<name>-<md5>_split-<size>/<name>_pages_<start>_to_<end>.pdf

set -e

if [ $# -lt 1 ]; then
  echo "Usage: $0 <pdf_filename>" >&2
  exit 1
fi

PDF_FILE="$1"
DEFAULT_SPLIT_SIZE=5
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}

if [ ! -f "$PDF_FILE" ]; then
  echo "Error: PDF file not found: $PDF_FILE" >&2
  exit 1
fi

# Validate that SPLIT_SIZE is a positive integer. Zero would yield an empty
# page range, and a leading zero would be read as octal by the arithmetic
# below.
if ! [[ "$SPLIT_SIZE" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: PDF_SPLIT_PAGE_SIZE must be an integer." >&2
  exit 1
fi

if ! command -v qpdf >/dev/null; then
  echo "Error: the qpdf command line utility is required." >&2
  exit 1
fi

DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
PDF_NAME=$(basename "$PDF_FILE" .pdf)
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"

# Create directory if it does not exist, remembering whether this run made it.
created_pdf_dir=false
if [ ! -d "$PDF_DIR" ]; then
  mkdir -p "$PDF_DIR"
  created_pdf_dir=true
fi

# If splitting fails part-way, remove the directory this run created so that
# its mere existence is never mistaken for a complete set of splits.
cleanup_on_failure() {
  local status=$?
  if [ "$status" -ne 0 ] && $created_pdf_dir; then
    echo "Splitting failed; removing incomplete directory $PDF_DIR" >&2
    rm -rf -- "${PDF_DIR:?}"
  fi
}
trap cleanup_on_failure EXIT

# Total number of pages
TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")
if ! [[ "$TOTAL_PAGES" =~ ^[0-9]+$ ]]; then
  echo "Error: could not determine the page count of $PDF_FILE" >&2
  exit 1
fi

# Split PDF into $SPLIT_SIZE-page chunks
START_PAGE=1
while [ "$START_PAGE" -le "$TOTAL_PAGES" ]; do
  END_PAGE=$((START_PAGE + SPLIT_SIZE - 1))
  if [ "$END_PAGE" -gt "$TOTAL_PAGES" ]; then
    END_PAGE=$TOTAL_PAGES
  fi
  OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"

  qpdf_status=0
  qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE" || qpdf_status=$?
  # qpdf exits with 3 when it wrote the output but emitted warnings; only
  # other non-zero statuses are real failures.
  if [ "$qpdf_status" -ne 0 ] && [ "$qpdf_status" -ne 3 ]; then
    echo "Error: qpdf failed (exit $qpdf_status) for pages $START_PAGE-$END_PAGE" >&2
    exit "$qpdf_status"
  fi

  echo "Created $OUTPUT_FILE"
  START_PAGE=$((END_PAGE + 1))
done

echo "All parts have been saved to $PDF_DIR"

scripts/user/unstructured-get-json.sh

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ USAGE_MESSAGE="Usage: $0 [options] <file>"'
1212
1313
Options:
1414
--api-key KEY Specify the API key for authentication. Set the env var $UNST_API_KEY to skip providing this option.
15+
--freemium Use the free API rather than the paid API
1516
--hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
1617
--fast fast strategy: No OCR, just extract embedded text
1718
--ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
@@ -22,9 +23,13 @@ Options:
2223
--s3 Write the resulting output to s3 (like a pastebin)
2324
--help Display this help and exit.
2425
26+
2527
Arguments:
2628
<file> File to send to the API.
2729
30+
If running against an API instance other than hosted Unstructured paid API (or --freemium),
31+
set the environment variable UNST_API_ENDPOINT.
32+
2833
The script requires a <file>, the document to post to the Unstructured API.
2934
The .json result is written to ~/tmp/unst-outputs/ -- this path is echoed and copied to your clipboard.
3035
'
@@ -35,7 +40,6 @@ if [ "$#" -eq 0 ]; then
3540
fi
3641

3742
API_KEY=${UNST_API_KEY:-""}
38-
API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructured.io/general/v0/general"}
3943
TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads"
4044
TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs"
4145
# only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/
@@ -62,6 +66,7 @@ STRATEGY=""
6266
VERBOSE=false
6367
TRACE=false
6468
COORDINATES=false
69+
FREEMIUM=false
6570
TABLES=true
6671
S3=""
6772

@@ -99,6 +104,10 @@ while [[ "$#" -gt 0 ]]; do
99104
COORDINATES=true
100105
shift
101106
;;
107+
--freemium)
108+
FREEMIUM=true
109+
shift
110+
;;
102111
--api-key)
103112
if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
104113
API_KEY=$2
@@ -139,6 +148,12 @@ else
139148
INPUT_FILEPATH=${INPUT}
140149
fi
141150

151+
if $FREEMIUM; then
152+
API_ENDPOINT="https://api.unstructured.io/general/v0/general"
153+
else
154+
API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructuredapp.io/general/v0/general"}
155+
fi
156+
142157
if $HI_RES; then
143158
if $VERBOSE; then echo "Sending API request with hi_res strategy"; fi
144159
STRATEGY="-hi-res"

0 commit comments

Comments
 (0)