|
#!/usr/bin/env bash
#
# Ingest test: process a local document and write the resulting embeddings to
# a Pinecone index, then check how many vectors arrived.
#
# Environment:
#   PINECONE_API_KEY       required; the test is skipped (exit 0) when empty
#   PINECONE_INDEX         optional; defaults to "ingest-test-<random suffix>"
#   PINECONE_HOST_POSTFIX  optional; used to build the index host name
#   PINECONE_PROJECT_ID    optional; used to build the index host name
#   MAX_PROCESSES          optional; defaults to the CPU count python3 reports

set -e

# DEST_PATH is the directory holding this script, SCRIPT_DIR is its parent;
# everything below runs from the directory above SCRIPT_DIR.
DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "${DEST_PATH}")
cd "${SCRIPT_DIR}/.." || exit 1

OUTPUT_FOLDER_NAME=s3-pinecone-dest
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${SCRIPT_DIR}/workdir/${OUTPUT_FOLDER_NAME}"

# Fall back to the CPU count when MAX_PROCESSES is unset or empty; the
# fallback is stored back into MAX_PROCESSES as well.
if [[ -z "${MAX_PROCESSES}" ]]; then
  MAX_PROCESSES=$(python3 -c "import os; print(os.cpu_count())")
fi
max_processes=${MAX_PROCESSES}

# Without credentials there is nothing to test against, so skip successfully.
[[ -n "${PINECONE_API_KEY}" ]] || {
  echo "Skipping Pinecone ingest test because PINECONE_API_KEY env var is not set."
  exit 0
}

RANDOM_SUFFIX=$(((RANDOM % 100000) + 1))

# Apply defaults for any of these that the environment left unset or empty.
: "${PINECONE_INDEX:=ingest-test-${RANDOM_SUFFIX}}"
: "${PINECONE_HOST_POSTFIX:=4627-b74a}"
: "${PINECONE_PROJECT_ID:=art8iaj}"

# cleanup.sh is expected to provide cleanup_dir, which the exit handler calls.
# shellcheck disable=SC1091
. "${SCRIPT_DIR}/cleanup.sh"
#######################################
# Exit handler: delete the remote test index if it exists, then clean up the
# local work and output directories.
# Globals:
#   PINECONE_INDEX, PINECONE_API_KEY, WORK_DIR, OUTPUT_DIR (read)
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout; a warning on stderr if the delete call fails.
#######################################
cleanup() {
  # Local so the handler cannot clobber the script-level response_code.
  local response_code

  # Get response code to check if index exists. The script runs under
  # `set -e`, so a curl failure here (e.g. network down) would otherwise abort
  # the handler before the local directories are cleaned up.
  response_code=$(curl \
    -s -o /dev/null \
    -w "%{http_code}" \
    --request GET \
    --url "https://api.pinecone.io/indexes/$PINECONE_INDEX" \
    --header 'accept: application/json' \
    --header "Api-Key: $PINECONE_API_KEY") || true

  # Cleanup (delete) index if it exists
  if [ "$response_code" == "200" ]; then
    echo ""
    echo "deleting index $PINECONE_INDEX"
    # Best effort: a failed delete is reported but must not stop the local
    # cleanup below.
    curl --request DELETE \
      "https://api.pinecone.io/indexes/$PINECONE_INDEX" \
      --header "Api-Key: $PINECONE_API_KEY" \
      --header 'content-type: application/json' ||
      echo "warning: delete request for index $PINECONE_INDEX failed" >&2
  else
    echo "There was an error during index deletion for index $PINECONE_INDEX, with response code: $response_code. It might be that index $PINECONE_INDEX does not exist, so there is nothing to delete."
  fi

  # Local file cleanup. cleanup_dir is defined outside this function
  # (presumably by the sourced cleanup.sh -- confirm).
  cleanup_dir "$WORK_DIR"
  cleanup_dir "$OUTPUT_DIR"
}
| 55 | + |
# Run the cleanup handler on every exit path from here on, including the
# failure exits below.
trap cleanup EXIT

echo "Creating index $PINECONE_INDEX"

# Request body for the create-index call: a 384-dimension cosine index on
# serverless AWS us-east-1, named after PINECONE_INDEX.
create_index_payload='
{
  "name": "'"$PINECONE_INDEX"'",
  "dimension": 384,
  "metric": "cosine",
  "spec": {
    "serverless": {
      "cloud": "aws",
      "region": "us-east-1"
    }
  }
}
'

# Only the HTTP status is kept; the response body is discarded.
response_code=$(curl \
  --silent --output /dev/null \
  --write-out "%{http_code}" \
  --request POST \
  --url "https://api.pinecone.io/indexes" \
  --header "accept: application/json" \
  --header "content-type: application/json" \
  --header "Api-Key: $PINECONE_API_KEY" \
  --data "$create_index_payload")

# Anything that is not a status below 400 counts as a failed creation.
if ! [ "$response_code" -lt 400 ]; then
  echo "Index creation failure: $response_code"
  exit 1
fi
echo "Index creation success: $response_code"
| 87 | + |
# Ingest one local document: partition it with the fast strategy, chunk by
# title, embed with the huggingface provider, and write the result to the
# Pinecone index in batches of 80.
ingest_args=(
  local
  --num-processes "$max_processes"
  --output-dir "$OUTPUT_DIR"
  --strategy fast
  --verbose
  --reprocess
  --input-path example-docs/book-war-and-peace-1225p.txt
  --work-dir "$WORK_DIR"
  --chunking-strategy by_title
  --chunk-combine-text-under-n-chars 150
  --chunk-new-after-n-chars 1500
  --chunk-max-characters 2500
  --chunk-multipage-sections
  --embedding-provider "huggingface"
  pinecone
  --api-key "$PINECONE_API_KEY"
  --index-name "$PINECONE_INDEX"
  --batch-size 80
)

PYTHONPATH=. ./unstructured_ingest/main.py "${ingest_args[@]}"
| 104 | + |
# Verify that the ingest wrote the expected number of vectors.
#
# It can take some time for the index to catch up with the content that was
# written, so poll describe_index_stats: up to $max_attempts tries, sleeping
# $sleep_amount seconds before each one (3 x 8s = 24s worst case), until the
# remote count matches $EXPECTED.
EXPECTED=1825
num_of_vectors_remote=0
attempt=1
max_attempts=3
sleep_amount=8
stats_url="https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.aped-$PINECONE_HOST_POSTFIX.pinecone.io/describe_index_stats"

# Keep polling while the count is still short of the target, not merely while
# it is zero, so a partially caught-up index gets the remaining attempts.
while [ "$num_of_vectors_remote" -ne "$EXPECTED" ] && [ "$attempt" -le "$max_attempts" ]; do
  echo "attempt $attempt: sleeping $sleep_amount seconds to let index finish catching up after writes"
  sleep "$sleep_amount"

  # A failed request or unparsable body must not abort the script under
  # `set -e`; treat it as "no count yet" and let the next attempt retry.
  raw_vector_count=$(curl --request POST \
    -s \
    --url "$stats_url" \
    --header "accept: application/json" \
    --header "content-type: application/json" \
    --header "Api-Key: $PINECONE_API_KEY" | jq -r '.totalVectorCount') || raw_vector_count=""

  # jq prints "null" when the field is missing and nothing for an empty body.
  # Feeding such a value to the integer tests below would make them exit with
  # status 2, which `while`/`if` read as false -- the final check would then be
  # skipped and the test would pass without verifying anything. Map every
  # non-integer reading to 0 instead.
  case "$raw_vector_count" in
    '' | *[!0-9]*) num_of_vectors_remote=0 ;;
    *) num_of_vectors_remote=$raw_vector_count ;;
  esac

  echo "vector count in Pinecone: $num_of_vectors_remote"
  attempt=$((attempt + 1))
done

if [ "$num_of_vectors_remote" -ne "$EXPECTED" ]; then
  echo "Number of vectors in Pinecone are $num_of_vectors_remote when the expected number is $EXPECTED. Test failed."
  exit 1
fi
0 commit comments