Skip to content

Commit b6f2bde

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents a21ba5e + 1eceac2 commit b6f2bde

File tree

635 files changed

+3003
-43603
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

635 files changed

+3003
-43603
lines changed

.github/actions/base-ingest-cache/action.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ runs:
3939
python -m pip install --upgrade setuptools
4040
fi
4141
make install-ci
42-
make install-all-ingest
42+
make install-ingest
4343
- name: Save Ingest Cache
4444
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
4545
id: ingest-virtualenv-cache-save

.github/workflows/ci.yml

Lines changed: 0 additions & 131 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,6 @@ jobs:
7272
- name: Install all doc and test dependencies
7373
run: |
7474
make install-ci
75-
make install-all-ingest
7675
make check-licenses
7776
7877
lint:
@@ -273,37 +272,6 @@ jobs:
273272
python-version: ${{ matrix.python-version }}
274273
check-only: 'true'
275274

276-
test_ingest_unit:
277-
strategy:
278-
matrix:
279-
python-version: [ "3.9","3.10" ]
280-
runs-on: ubuntu-latest
281-
needs: [ setup_ingest, lint ]
282-
steps:
283-
# actions/checkout MUST come before auth
284-
- uses: 'actions/checkout@v4'
285-
- name: Set up Python ${{ matrix.python-version }}
286-
uses: actions/setup-python@v5
287-
with:
288-
python-version: ${{ matrix.python-version }}
289-
- name: Get full Python version
290-
id: full-python-version
291-
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
292-
- name: Setup virtual environment
293-
uses: ./.github/actions/base-ingest-cache
294-
with:
295-
python-version: ${{ matrix.python-version }}
296-
- name: Test Ingest (unit)
297-
env:
298-
NLTK_DATA: ${{ github.workspace }}/nltk_data
299-
PYTHON: python${{ matrix.python-version }}
300-
run: |
301-
source .venv/bin/activate
302-
make install-ci
303-
make install-all-ingest
304-
PYTHONPATH=. ${PYTHON} -m pytest test_unstructured_ingest/unit
305-
306-
307275
test_ingest_src:
308276
strategy:
309277
matrix:
@@ -378,8 +346,6 @@ jobs:
378346
PYTHON: python${{ matrix.python-version }}
379347
run: |
380348
source .venv/bin/activate
381-
make install-ci
382-
make install-all-ingest
383349
sudo apt-get update
384350
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
385351
make install-pandoc
@@ -392,103 +358,6 @@ jobs:
392358
./test_unstructured_ingest/test-ingest-src.sh
393359
394360
395-
test_ingest_dest:
396-
environment: ci
397-
strategy:
398-
matrix:
399-
python-version: ["3.9","3.10"]
400-
runs-on: ubuntu-latest-m
401-
needs: [setup_ingest, lint]
402-
steps:
403-
# actions/checkout MUST come before auth
404-
- uses: 'actions/checkout@v4'
405-
- name: Set up Python ${{ matrix.python-version }}
406-
uses: actions/setup-python@v5
407-
with:
408-
python-version: ${{ matrix.python-version }}
409-
- name: Get full Python version
410-
id: full-python-version
411-
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
412-
- name: Setup virtual environment
413-
uses: ./.github/actions/base-ingest-cache
414-
with:
415-
python-version: ${{ matrix.python-version }}
416-
- name: Setup docker-compose
417-
uses: KengoTODA/actions-setup-docker-compose@v1
418-
with:
419-
version: '2.22.0'
420-
- name: Test (end-to-end)
421-
env:
422-
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
423-
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
424-
S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
425-
S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
426-
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
427-
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
428-
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
429-
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
430-
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
431-
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
432-
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
433-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
434-
MONGODB_URI: ${{ secrets.MONGODB_URI }}
435-
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
436-
AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }}
437-
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}}
438-
VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}}
439-
VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}}
440-
VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}}
441-
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
442-
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
443-
CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}}
444-
DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
445-
DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}}
446-
DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}}
447-
DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
448-
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
449-
CI: "true"
450-
NLTK_DATA: ${{ github.workspace }}/nltk_data
451-
PYTHON: python${{ matrix.python-version }}
452-
run: |
453-
source .venv/bin/activate
454-
make install-ci
455-
make install-all-ingest
456-
sudo apt-get update
457-
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
458-
make install-pandoc
459-
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
460-
sudo apt-get update
461-
sudo apt-get install -y tesseract-ocr
462-
sudo apt-get install -y tesseract-ocr-kor
463-
sudo apt-get install diffstat
464-
tesseract --version
465-
./test_unstructured_ingest/test-ingest-dest.sh
466-
467-
test_ingest_help:
468-
environment: ci
469-
strategy:
470-
matrix:
471-
python-version: ["3.9","3.10","3.11", "3.12"]
472-
runs-on: ubuntu-latest
473-
needs: [setup_ingest, lint]
474-
steps:
475-
- uses: 'actions/checkout@v4'
476-
- name: Set up Python ${{ matrix.python-version }}
477-
uses: actions/setup-python@v5
478-
with:
479-
python-version: ${{ matrix.python-version }}
480-
- name: Setup virtual environment
481-
uses: ./.github/actions/base-ingest-cache
482-
with:
483-
python-version: ${{ matrix.python-version }}
484-
- name: Validate --help
485-
run: |
486-
source .venv/bin/activate
487-
make install-ci
488-
make install-all-ingest
489-
./test_unstructured_ingest/test-help.sh
490-
491-
492361
test_unstructured_api_unit:
493362
strategy:
494363
matrix:

.github/workflows/ingest-test-fixtures-update-pr.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ jobs:
105105
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
106106
sudo apt-get install -y tesseract-ocr
107107
sudo apt-get install -y tesseract-ocr-kor
108+
sudo apt-get install diffstat
108109
tesseract --version
109110
./test_unstructured_ingest/test-ingest-src.sh
110111

CHANGELOG.md

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,26 @@
1-
## 0.15.15-dev0
1+
## 0.16.1-dev0
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
10+
11+
## 0.16.0
12+
13+
### Enhancements
14+
15+
* **Remove ingest implementation.** The deprecated ingest functionality has been removed, as it is now maintained in the separate [unstructured-ingest](https://github.com/Unstructured-IO/unstructured-ingest) repository.
16+
* Replace extras in `requirements/ingest` directory with a new `ingest.txt` extra for installing the `unstructured-ingest` library.
17+
* Remove the `unstructured.ingest` submodule.
18+
* Delete all shell scripts previously used for destination ingest tests.
19+
20+
### Features
21+
22+
### Fixes
23+
924
* **Add language parameter to `OCRAgentGoogleVision`.** Introduces an optional language parameter in the `OCRAgentGoogleVision` constructor to serve as a language hint for `document_text_detection`. This ensures compatibility with the OCRAgent's `get_instance` method and resolves errors when parsing PDFs with Google Cloud Vision as the OCR agent.
1025

1126
## 0.15.14

MANIFEST.in

Lines changed: 0 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -15,45 +15,3 @@ include requirements/extra-pptx.in
1515
include requirements/extra-xlsx.in
1616
include requirements/huggingface.in
1717

18-
# Ingest extras
19-
include requirements/ingest/airtable.in
20-
include requirements/ingest/astradb.in
21-
include requirements/ingest/azure-cognitive-search.in
22-
include requirements/ingest/azure.in
23-
include requirements/ingest/biomed.in
24-
include requirements/ingest/box.in
25-
include requirements/ingest/chroma.in
26-
include requirements/ingest/confluence.in
27-
include requirements/ingest/databricks-volumes.in
28-
include requirements/ingest/delta-table.in
29-
include requirements/ingest/discord.in
30-
include requirements/ingest/dropbox.in
31-
include requirements/ingest/elasticsearch.in
32-
include requirements/ingest/embed-aws-bedrock.in
33-
include requirements/ingest/embed-huggingface.in
34-
include requirements/ingest/embed-mixedbreadai.in
35-
include requirements/ingest/embed-openai.in
36-
include requirements/ingest/gcs.in
37-
include requirements/ingest/github.in
38-
include requirements/ingest/gitlab.in
39-
include requirements/ingest/google-drive.in
40-
include requirements/ingest/hubspot.in
41-
include requirements/ingest/jira.in
42-
include requirements/ingest/kafka.in
43-
include requirements/ingest/mongodb.in
44-
include requirements/ingest/notion.in
45-
include requirements/ingest/onedrive.in
46-
include requirements/ingest/opensearch.in
47-
include requirements/ingest/outlook.in
48-
include requirements/ingest/pinecone.in
49-
include requirements/ingest/postgres.in
50-
include requirements/ingest/qdrant.in
51-
include requirements/ingest/reddit.in
52-
include requirements/ingest/s3.in
53-
include requirements/ingest/salesforce.in
54-
include requirements/ingest/sftp.in
55-
include requirements/ingest/sharepoint.in
56-
include requirements/ingest/slack.in
57-
include requirements/ingest/singlestore.in
58-
include requirements/ingest/weaviate.in
59-
include requirements/ingest/wikipedia.in

0 commit comments

Comments
 (0)