Skip to content

Commit df43877

Browse files
authored
feat/validate read/write permissions in fsspec prechecks (#133)
* write small empty file to destination to check connection
* Add dest check to validate lack of access
* Make HEAD request to check read permissions by indexer
* Don't care about size when filtering for valid files
* fix CI
* bump version of kdbai client used in CI
1 parent 6b0c494 commit df43877

File tree

11 files changed

+110
-8
lines changed

11 files changed

+110
-8
lines changed

.github/actions/base-cache/action.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ runs:
2929
shell: bash
3030
run: |
3131
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
32-
python${{ inputs.python-version }} -m venv .venv
32+
if [ ! -d ".venv" ]; then
33+
python${{ inputs.python-version }} -m venv .venv
34+
fi
3335
source .venv/bin/activate
3436
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
3537
python -m ensurepip --upgrade

.github/workflows/ingest-test-fixtures-update-pr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
# actions/checkout MUST come before auth
2929
- uses: 'actions/checkout@v4'
3030
- name: Set up Python ${{ env.PYTHON_VERSION }}
31-
uses: actions/setup-python@v4
31+
uses: actions/setup-python@v5
3232
with:
3333
python-version: ${{ env.PYTHON_VERSION }}
3434
- name: Get full Python version

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
steps:
1515
- uses: actions/checkout@v4
1616
- name: Set up Python ${{ env.PYTHON_VERSION }}
17-
uses: actions/setup-python@v4
17+
uses: actions/setup-python@v5
1818
with:
1919
python-version: ${{ env.PYTHON_VERSION }}
2020
- name: Build artifact

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.0.18
2+
3+
### Enhancements
4+
5+
* **Better destination precheck for blob storage** Write an empty file to the destination location when running fsspec-based precheck
6+
17
## 0.0.17
28

39
### Fixes

requirements/connectors/kdbai.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ charset-normalizer==3.3.2
1010
# via requests
1111
idna==3.10
1212
# via requests
13-
kdbai-client==1.2.4
13+
kdbai-client==1.3.0
1414
# via -r kdbai.in
1515
numpy==1.26.4
1616
# via

test_e2e/dest/s3_no_access.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env bash
# Negative end-to-end test for the S3 destination connector.
# Runs the ingest CLI (local source -> s3 destination) against a per-run unique
# S3 prefix and passes only if the captured stderr contains
# "Error: Precheck failed"; otherwise it prints the captured text and exits 1.
# NOTE(review): presumably the test environment has no write access to the
# "no_access" prefix, which is what makes the precheck fail — confirm.
2+
3+
# Resolve the directory holding this script and its parent, then run from the
# directory one level above that parent so the relative paths below resolve.
DEST_PATH=$(dirname "$(realpath "$0")")
4+
SCRIPT_DIR=$(dirname "$DEST_PATH")
5+
cd "$SCRIPT_DIR"/.. || exit 1
6+
OUTPUT_FOLDER_NAME=s3-dest
7+
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
8+
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
9+
# Use MAX_PROCESSES when set; otherwise default to (and assign) the CPU count
# reported by python3.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
10+
# uuidgen makes the destination prefix unique for every run.
DESTINATION_S3="s3://utic-ingest-test-fixtures/destination/no_access/$(uuidgen)/"
11+
# NOTE(review): CI is defaulted here but not referenced again in this script;
# possibly read by the sourced cleanup.sh — confirm.
CI=${CI:-"false"}
12+
13+
# shellcheck disable=SC1091
14+
source "$SCRIPT_DIR"/cleanup.sh
15+
# EXIT-trap handler: hands the work directory to cleanup_dir.
# NOTE(review): cleanup_dir is presumably defined in cleanup.sh — confirm.
function cleanup() {
16+
cleanup_dir "$WORK_DIR"
17+
}
18+
trap cleanup EXIT
19+
# Make sure errexit is off: the ingest command below is expected to fail, and
# the script must continue to the stderr check afterwards.
20+
set +e
21+
22+
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
23+
24+
# Capture the stderr in a variable to check against
# How the redirections work: the outer group duplicates stdout onto fd 3
# (3>&1). Inside the command substitution, 2>&1 points stderr at the capture,
# >&3 sends stdout back to the original stdout, and 3>&- closes the spare fd.
# Net effect: $err holds stderr only, while stdout is still printed normally.
25+
{ err=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
26+
local \
27+
--num-processes "$max_processes" \
28+
--strategy fast \
29+
--verbose \
30+
--reprocess \
31+
--input-path example-docs/pdf/fake-memo.pdf \
32+
--work-dir "$WORK_DIR" \
33+
s3 \
34+
--remote-url "$DESTINATION_S3" 2>&1 >&3 3>&-); } 3>&1
35+
# The test passes only when the expected precheck failure message was emitted.
36+
if [[ "$err" == *"Error: Precheck failed"* ]]; then
37+
echo "passed"
38+
else
39+
echo "error didn't occur with expected text: $err"
40+
exit 1
41+
fi

test_e2e/src/s3_no_access.sh

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env bash
# Negative end-to-end test for the S3 source connector.
# Runs the ingest CLI with the s3 source in --anonymous mode against a fixed
# bucket prefix and passes only if the captured stderr contains
# "Error: Precheck failed"; otherwise it prints the captured text and exits 1.
# NOTE(review): presumably anonymous callers lack read access to this prefix,
# which is what makes the precheck fail — confirm.
2+
3+
# Abort on any failure during setup; errexit is switched off again before the
# ingest command, which is expected to fail.
set -e
4+
5+
# Resolve the directory holding this script and its parent, then run from the
# directory one level above that parent so the relative paths below resolve.
SRC_PATH=$(dirname "$(realpath "$0")")
6+
SCRIPT_DIR=$(dirname "$SRC_PATH")
7+
cd "$SCRIPT_DIR"/.. || exit 1
8+
OUTPUT_FOLDER_NAME=s3
9+
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
10+
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
11+
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
12+
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
13+
# Use MAX_PROCESSES when set; otherwise default to (and assign) the CPU count
# reported by python3.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
14+
15+
# shellcheck disable=SC1091
16+
source "$SCRIPT_DIR"/cleanup.sh
17+
# EXIT-trap handler: hands the output and work directories to cleanup_dir.
# DOWNLOAD_DIR is not passed to cleanup_dir here.
# NOTE(review): cleanup_dir is presumably defined in cleanup.sh — confirm.
# shellcheck disable=SC2317
18+
function cleanup() {
19+
cleanup_dir "$OUTPUT_DIR"
20+
cleanup_dir "$WORK_DIR"
21+
}
22+
trap cleanup EXIT
23+
# Turn errexit off: the ingest command below is expected to fail, and the
# script must continue to the stderr check afterwards.
24+
set +e
25+
26+
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
27+
28+
# Capture the stderr in a variable to check against
# How the redirections work: the outer group duplicates stdout onto fd 3
# (3>&1). Inside the command substitution, 2>&1 points stderr at the capture,
# >&3 sends stdout back to the original stdout, and 3>&- closes the spare fd.
# Net effect: $err holds stderr only, while stdout is still printed normally.
29+
{ err=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
30+
s3 \
31+
--num-processes "$max_processes" \
32+
--download-dir "$DOWNLOAD_DIR" \
33+
--reprocess \
34+
--output-dir "$OUTPUT_DIR" \
35+
--verbose \
36+
--remote-url s3://utic-ingest-test-fixtures/destination/ \
37+
--anonymous \
38+
--work-dir "$WORK_DIR" 2>&1 >&3 3>&-); } 3>&1
39+
# The test passes only when the expected precheck failure message was emitted.
40+
if [[ "$err" == *"Error: Precheck failed"* ]]; then
41+
echo "passed"
42+
else
43+
echo "error didn't occur with expected text: $err"
44+
exit 1
45+
fi

test_e2e/test-dest.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ all_tests=(
3333
'pinecone.sh'
3434
'qdrant.sh'
3535
's3.sh'
36+
's3_no_access.sh'
3637
'sharepoint-embed-cog-index.sh'
3738
'sqlite.sh'
3839
'vectara.sh'

test_e2e/test-src.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ all_tests=(
2020
's3.sh'
2121
's3-minio.sh'
2222
's3-filter.sh'
23+
's3_no_access.sh'
2324
'astradb.sh'
2425
'azure.sh'
2526
'biomed-api.sh'

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.17" # pragma: no cover
1+
__version__ = "0.0.18" # pragma: no cover

0 commit comments

Comments
 (0)