Skip to content

Commit ab5f92d

Browse files
authored
Fix(ingest): Deprecate --s3-url in favor of --remote-url (#616)
* deprecation s3-url * changelog and version * download dir not now
1 parent 7942bc9 commit ab5f92d

File tree

6 files changed: +9 additions, -22 deletions (lines changed)

6 files changed: +9 additions, -22 deletions (lines changed)

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.6.7-dev7
1+
## 0.6.7-dev8
22

33
### Enhancements
44

5+
* Deprecate `--s3-url` in favor of `--remote-url` in CLI
56
* Refactor out non-connector-specific config variables
67
* Add `file_directory` to metadata
78
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.

Ingest.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ The unstructured library includes a CLI to batch ingest documents from (soon to
66
various) sources, storing structured outputs locally on the filesystem.
77

88
For example, the following command processes all the documents in S3 in the
9-
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
9+
`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
1010

1111
unstructured-ingest \
12-
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
12+
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
1313
--s3-anonymous \
1414
--structured-output-dir s3-small-batch-output \
1515
--num-processes 2
@@ -30,7 +30,7 @@ When testing from a local checkout rather than a pip-installed version of `unstr
3030
just execute `unstructured/ingest/main.py`, e.g.:
3131

3232
PYTHONPATH=. ./unstructured/ingest/main.py \
33-
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
33+
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
3434
--s3-anonymous \
3535
--structured-output-dir s3-small-batch-output \
3636
--num-processes 2

examples/ingest/s3-small-batch/ingest.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
12+
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
1313
--s3-anonymous \
1414
--structured-output-dir s3-small-batch-output \
1515
--num-processes 2

test_unstructured_ingest/test-ingest-s3.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ fi
1313

1414
PYTHONPATH=. ./unstructured/ingest/main.py \
1515
--metadata-exclude filename,file_directory \
16-
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
16+
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
1717
--s3-anonymous \
1818
--structured-output-dir s3-small-batch-output \
1919
--preserve-downloads \

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.7-dev7" # pragma: no cover
1+
__version__ = "0.6.7-dev8" # pragma: no cover

unstructured/ingest/main.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,6 @@ def run(self):
172172
help="Remote fsspec URL formatted as `protocol://dir/path`, it can contain both "
173173
"a directory or a single file. Supported protocols are: `s3`, `s3a`, `abfs`, and `az`.",
174174
)
175-
@click.option(
176-
"--s3-url",
177-
default=None,
178-
help="Prefix of s3 objects (files) to download. E.g. s3://bucket1/path/. This value may "
179-
"also be a single file. To be deprecated in favor of --remote-url.",
180-
)
181175
@click.option(
182176
"--s3-anonymous",
183177
is_flag=True,
@@ -399,7 +393,6 @@ def run(self):
399393
@click.option("-v", "--verbose", is_flag=True, default=False)
400394
def main(
401395
remote_url,
402-
s3_url, # TODO: deprecate this in the next minor release
403396
s3_anonymous,
404397
azure_account_name,
405398
azure_account_key,
@@ -491,13 +484,6 @@ def main(
491484
cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
492485
if not cache_path.exists():
493486
cache_path.mkdir(parents=True, exist_ok=True)
494-
if s3_url:
495-
warnings.warn(
496-
"The `--s3-url` option will be deprecated in favor of `--remote-url`"
497-
" in the next minor release.",
498-
DeprecationWarning,
499-
)
500-
remote_url = s3_url
501487
if remote_url:
502488
hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8"))
503489
elif github_url:
@@ -561,7 +547,7 @@ def main(
561547
doc_connector = S3Connector( # type: ignore
562548
standard_config=standard_config,
563549
config=SimpleS3Config(
564-
path=s3_url,
550+
path=remote_url,
565551
access_kwargs={"anon": s3_anonymous},
566552
),
567553
)

0 commit comments

Comments
 (0)