Skip to content

Commit 4cdc5f4

Browse files
authored
Merge pull request #1 from Unstructured-IO/roman/introduce-initial-setup
feat: init repo
2 parents 3399d89 + 444a530 commit 4cdc5f4

File tree

1,064 files changed

+405741
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,064 files changed

+405741
-1
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
# Composite action: look up (and optionally restore) the cached `.venv`
# for a given Python version; when no cache entry matches, build the
# virtualenv from the Makefile targets and save it under the same key.
name: "Base Cache Build"
description: "Restore the base python cache for CI to use, recreate if not found"

inputs:
  python-version:
    description: "python version associated with the cache"
    required: true
  check-only:
    # Forwarded to the restore step's `lookup-only` option below.
    description: "if set, will not restore the cache if it exists"
    default: 'false'

runs:
  using: composite
  steps:
    # Cache key = runner OS + Python version + hash of requirements/**/*.txt.
    - name: Check for/restore base cache
      id: virtualenv-cache-restore
      uses: actions/cache/restore@v4
      with:
        path: |
          .venv
        key: unstructured-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/**/*.txt') }}
        lookup-only: ${{ inputs.check-only }}

    # Every step from here on is skipped when the lookup reported a hit.
    - name: Set up Python ${{ inputs.python-version }}
      if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
      uses: actions/setup-python@v5
      with:
        python-version: ${{ inputs.python-version }}

    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
      shell: bash
      # Creates .venv, applies the extra ensurepip/setuptools bootstrap for
      # Python 3.12 only, then installs via the repository Makefile.
      run: |
        python${{ inputs.python-version }} -m pip install --upgrade virtualenv
        python${{ inputs.python-version }} -m venv .venv
        source .venv/bin/activate
        if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
          python -m ensurepip --upgrade
          python -m pip install --upgrade setuptools
        fi
        make install-base
        make install-ci

    # Reuse the exact key computed by the restore step so that later
    # lookups match what is stored here.
    - name: Save Cache
      id: virtualenv-cache-save
      if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
      uses: actions/cache/save@v4
      with:
        path: |
          .venv
        key: ${{ steps.virtualenv-cache-restore.outputs.cache-primary-key }}
48+
49+

.github/workflows/e2e.yml

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Workflow triggers and concurrency policy for the end-to-end test suite.
name: End-to-End Tests

# Runs for pushes, pull requests and merge-queue groups targeting main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  merge_group:
    branches:
      - main

# One active run per workflow + ref; a newer run cancels the older one.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
14+
15+
16+
jobs:
  # Ensures a base virtualenv cache entry exists for each Python version.
  # `check-only` makes the action look the cache up without restoring it;
  # on a miss the action builds and saves the environment.
  setup:
    strategy:
      matrix:
        python-version: ["3.9", "3.10"]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ matrix.python-version }}
          check-only: 'true'

  # Source-connector end-to-end tests (./test_e2e/test-src.sh).
  test_src:
    strategy:
      matrix:
        # python-version: ["3.9","3.10"]
        python-version: ["3.10"]
    runs-on: ubuntu-latest-m
    needs: [setup]
    steps:
      # actions/checkout MUST come before auth
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Get full Python version
        id: full-python-version
        # Emits e.g. `version=3-10-...` as a step output.
        run: |
          echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
      - name: Setup virtual environment
        uses: ./.github/actions/base-cache
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Test (end-to-end)
        env:
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MONGODB_URI: ${{ secrets.MONGODB_URI }}
          MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
          SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
          SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
          SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
          SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
          SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
          SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
          PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
          # Env var names differ from the secret names on purpose.
          ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_TOKEN }}
          ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_ENDPOINT }}
          TABLE_OCR: "tesseract"
          OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          make install-pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          # -y keeps the install non-interactive, like the installs above.
          sudo apt-get install -y diffstat
          tesseract --version
          ./test_e2e/test-src.sh

  # Destination-connector end-to-end tests (./test_e2e/test-dest.sh).
  test_dest:
    environment: ci
    strategy:
      matrix:
        python-version: ["3.9", "3.10"]
    runs-on: ubuntu-latest-m
    needs: [setup]
    steps:
      # actions/checkout MUST come before auth
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Get full Python version
        id: full-python-version
        # Emits e.g. `version=3-10-...` as a step output.
        run: |
          echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
      - name: Setup virtual environment
        uses: ./.github/actions/base-cache
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Test (end-to-end)
        env:
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
          S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MONGODB_URI: ${{ secrets.MONGODB_URI }}
          MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
          AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }}
          PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
          VECTARA_OAUTH_CLIENT_ID: ${{ secrets.VECTARA_OAUTH_CLIENT_ID }}
          VECTARA_OAUTH_SECRET: ${{ secrets.VECTARA_OAUTH_SECRET }}
          VECTARA_CUSTOMER_ID: ${{ secrets.VECTARA_CUSTOMER_ID }}
          # Env var names differ from the secret names on purpose.
          ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_TOKEN }}
          ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_ENDPOINT }}
          CLARIFAI_API_KEY: ${{ secrets.CLARIFAI_API_KEY }}
          DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
          DATABRICKS_USERNAME: ${{ secrets.DATABRICKS_USERNAME }}
          DATABRICKS_PASSWORD: ${{ secrets.DATABRICKS_PASSWORD }}
          DATABRICKS_CATALOG: ${{ secrets.DATABRICKS_CATALOG }}
          TABLE_OCR: "tesseract"
          OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          make install-pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          # -y keeps the install non-interactive, like the installs above.
          sudo apt-get install -y diffstat
          tesseract --version
          ./test_e2e/test-dest.sh
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Manually dispatched workflow that regenerates ingest test fixtures and
# opens a pull request with the result.
name: Ingest Test Fixtures Update PR

on:
  workflow_dispatch:

# Quoted so the version stays the string "3.10" rather than the float 3.1.
env:
  PYTHON_VERSION: '3.10'

permissions:
  id-token: write
  contents: read
12+
13+
jobs:

  # Ensures a base virtualenv cache entry exists for PYTHON_VERSION.
  # `check-only` makes the action look the cache up without restoring it.
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          check-only: 'true'

  # Runs the source e2e script with OVERWRITE_FIXTURES=true, then opens a
  # pull request containing the regenerated fixture and metrics directories.
  update-fixtures-and-pr:
    runs-on: ubuntu-latest-m
    needs: [setup]
    steps:
      # actions/checkout MUST come before auth
      - uses: actions/checkout@v4
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Get full Python version
        id: full-python-version
        # Emits e.g. `version=3-10-...` as a step output.
        run: |
          echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
      - name: Setup virtual environment
        uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Update test fixtures
        env:
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MONGODB_URI: ${{ secrets.MONGODB_URI }}
          MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
          SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
          SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
          SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
          SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
          SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
          SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
          TABLE_OCR: "tesseract"
          OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
          OVERWRITE_FIXTURES: "true"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          make install-pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          # Refresh package lists so the newly added PPA is used.
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          tesseract --version
          ./test_e2e/test-src.sh

      - name: Save branch name to environment file
        id: branch
        # BRANCH_NAME = <current branch>|ingest-test-fixtures-update-<short sha>
        run: |
          original_branch=$(git rev-parse --abbrev-ref HEAD)
          suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
          branch_name="$original_branch$suffix"
          echo "BRANCH_NAME=$branch_name" >> "$GITHUB_ENV"

      - name: Save PR name to environment file
        id: pr
        env:
          # Passed through the environment rather than interpolated into
          # the script text.
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          commit_sha=$(git rev-parse HEAD)
          prs=$(curl -s -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${GITHUB_REPOSITORY}/commits/${commit_sha}/pulls")
          # `// empty` turns a missing PR (jq null) into an empty string
          # instead of the literal text "null".
          pr_name=$(echo "$prs" | jq -r '.[0].title // empty')
          if [ -z "$pr_name" ]; then
            # No pull request is associated with HEAD; use the branch name.
            pr_name=$(git rev-parse --abbrev-ref HEAD)
          fi
          echo "PR_NAME=$pr_name" >> "$GITHUB_ENV"

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GH_CREATE_PR_TOKEN }}
          add-paths: |
            test_e2e/expected-structured-output
            test_e2e/metrics
          commit-message: "Update ingest test fixtures"
          branch: ${{ env.BRANCH_NAME }}
          title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
          assignees: ${{ github.actor }}
          reviewers: ${{ github.actor }}
          delete-branch: true
          body: |
            This pull request includes updated ingest test fixtures.
            Please review and merge if appropriate.
          # github.head_ref is only populated for pull_request events, and
          # this workflow runs on workflow_dispatch only; target the branch
          # the workflow was dispatched on explicitly.
          base: ${{ github.ref_name }}

0 commit comments

Comments
 (0)