Skip to content

Commit 1580c1b

Browse files
authored
feat: Add GitLab ingest connector (#349)
Add a GitLab data connector for ingest. The general Git functionality is factored out and shared between the GitHub and GitLab data connectors to prevent code duplication. Renamed github-access-token, github-branch and github-file-glob to git-access-token, git-branch and git-file-glob, respectively; the renamed options work for both GitHub and GitLab.
1 parent a915231 commit 1580c1b

File tree

15 files changed

+516
-155
lines changed

15 files changed

+516
-155
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ jobs:
110110
make check-coverage
111111
make install-ingest-s3
112112
make install-ingest-github
113+
make install-ingest-gitlab
113114
make install-ingest-wikipedia
114115
./test_unstructured_ingest/test-ingest.sh
115116

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.3-dev4
1+
## 0.5.3-dev5
22

33
### Enhancements
44

@@ -11,6 +11,7 @@
1111
to pages with similar names.
1212
* Add optional `encoding` argument to the `partition_(text/email/html)` functions.
1313
* Added Google Drive connector for ingest cli.
14+
* Added GitLab connector for ingest cli.
1415

1516
### Fixes
1617

@@ -280,4 +281,3 @@ of an email.
280281
## 0.2.0
281282

282283
* Initial release of unstructured
283-

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ install-ingest-s3:
6262
install-ingest-github:
6363
pip install -r requirements/ingest-github.txt
6464

65+
.PHONY: install-ingest-gitlab
66+
install-ingest-gitlab:
67+
pip install -r requirements/ingest-gitlab.txt
68+
6569
.PHONY: install-ingest-reddit
6670
install-ingest-reddit:
6771
pip install -r requirements/ingest-reddit.txt
@@ -101,6 +105,7 @@ pip-compile:
101105
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
102106
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
103107
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
108+
pip-compile --upgrade --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py
104109
pip-compile --upgrade --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py
105110
pip-compile --upgrade --extra=google-drive --output-file=requirements/ingest-google-drive.txt requirements/base.txt setup.py
106111

examples/ingest/github/ingest.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
1212
--github-url Unstructured-IO/unstructured \
13-
--github-branch main \
13+
--git-branch main \
1414
--structured-output-dir github-ingest-output \
1515
--num-processes 2 \
1616
--verbose

examples/ingest/gitlab/ingest.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env bash
2+
3+
# Processes the arbitrarily chosen https://gitlab.com/gitlab-com/content-sites/docsy-gitlab repository
4+
# through Unstructured's library in 2 processes.
5+
6+
# Structured outputs are stored in gitlab-ingest-output/
7+
8+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
9+
cd "$SCRIPT_DIR"/../../.. || exit 1
10+
11+
PYTHONPATH=. ./unstructured/ingest/main.py \
12+
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
13+
--git-branch 'v0.0.7' \
14+
--structured-output-dir gitlab-ingest-output \
15+
--num-processes 2 \
16+
--verbose
17+
18+
# Alternatively, you can call it using:
19+
# unstructured-ingest --gitlab-url ...

requirements/ingest-gitlab.txt

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
4+
#
5+
# pip-compile --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py
6+
#
7+
anyio==3.6.2
8+
# via
9+
# -r requirements/base.txt
10+
# httpcore
11+
argilla==1.3.1
12+
# via
13+
# -r requirements/base.txt
14+
# unstructured (setup.py)
15+
backoff==2.2.1
16+
# via
17+
# -r requirements/base.txt
18+
# argilla
19+
certifi==2022.12.7
20+
# via
21+
# -r requirements/base.txt
22+
# httpcore
23+
# httpx
24+
# requests
25+
# unstructured (setup.py)
26+
charset-normalizer==3.0.1
27+
# via
28+
# -r requirements/base.txt
29+
# requests
30+
click==8.1.3
31+
# via
32+
# -r requirements/base.txt
33+
# nltk
34+
colorama==0.4.6
35+
# via
36+
# click
37+
# tqdm
38+
deprecated==1.2.13
39+
# via
40+
# -r requirements/base.txt
41+
# argilla
42+
et-xmlfile==1.1.0
43+
# via
44+
# -r requirements/base.txt
45+
# openpyxl
46+
h11==0.14.0
47+
# via
48+
# -r requirements/base.txt
49+
# httpcore
50+
httpcore==0.16.3
51+
# via
52+
# -r requirements/base.txt
53+
# httpx
54+
httpx==0.23.3
55+
# via
56+
# -r requirements/base.txt
57+
# argilla
58+
idna==3.4
59+
# via
60+
# -r requirements/base.txt
61+
# anyio
62+
# requests
63+
# rfc3986
64+
importlib-metadata==6.0.0
65+
# via
66+
# -r requirements/base.txt
67+
# markdown
68+
joblib==1.2.0
69+
# via
70+
# -r requirements/base.txt
71+
# nltk
72+
lxml==4.9.2
73+
# via
74+
# -r requirements/base.txt
75+
# python-docx
76+
# python-pptx
77+
# unstructured (setup.py)
78+
markdown==3.4.1
79+
# via
80+
# -r requirements/base.txt
81+
# unstructured (setup.py)
82+
monotonic==1.6
83+
# via
84+
# -r requirements/base.txt
85+
# argilla
86+
nltk==3.8.1
87+
# via
88+
# -r requirements/base.txt
89+
# unstructured (setup.py)
90+
numpy==1.23.5
91+
# via
92+
# -r requirements/base.txt
93+
# argilla
94+
# pandas
95+
openpyxl==3.1.1
96+
# via
97+
# -r requirements/base.txt
98+
# unstructured (setup.py)
99+
packaging==23.0
100+
# via
101+
# -r requirements/base.txt
102+
# argilla
103+
pandas==1.5.3
104+
# via
105+
# -r requirements/base.txt
106+
# argilla
107+
# unstructured (setup.py)
108+
pillow==9.4.0
109+
# via
110+
# -r requirements/base.txt
111+
# python-pptx
112+
# unstructured (setup.py)
113+
pydantic==1.10.5
114+
# via
115+
# -r requirements/base.txt
116+
# argilla
117+
python-dateutil==2.8.2
118+
# via
119+
# -r requirements/base.txt
120+
# pandas
121+
python-docx==0.8.11
122+
# via
123+
# -r requirements/base.txt
124+
# unstructured (setup.py)
125+
python-gitlab==3.13.0
126+
# via unstructured (setup.py)
127+
python-magic==0.4.27
128+
# via
129+
# -r requirements/base.txt
130+
# unstructured (setup.py)
131+
python-pptx==0.6.21
132+
# via
133+
# -r requirements/base.txt
134+
# unstructured (setup.py)
135+
pytz==2022.7.1
136+
# via
137+
# -r requirements/base.txt
138+
# pandas
139+
regex==2022.10.31
140+
# via
141+
# -r requirements/base.txt
142+
# nltk
143+
requests==2.28.2
144+
# via
145+
# -r requirements/base.txt
146+
# python-gitlab
147+
# requests-toolbelt
148+
# unstructured (setup.py)
149+
requests-toolbelt==0.10.1
150+
# via python-gitlab
151+
rfc3986[idna2008]==1.5.0
152+
# via
153+
# -r requirements/base.txt
154+
# httpx
155+
six==1.16.0
156+
# via
157+
# -r requirements/base.txt
158+
# python-dateutil
159+
sniffio==1.3.0
160+
# via
161+
# -r requirements/base.txt
162+
# anyio
163+
# httpcore
164+
# httpx
165+
tqdm==4.64.1
166+
# via
167+
# -r requirements/base.txt
168+
# argilla
169+
# nltk
170+
typing-extensions==4.5.0
171+
# via
172+
# -r requirements/base.txt
173+
# pydantic
174+
urllib3==1.26.14
175+
# via
176+
# -r requirements/base.txt
177+
# requests
178+
wrapt==1.14.1
179+
# via
180+
# -r requirements/base.txt
181+
# argilla
182+
# deprecated
183+
xlsxwriter==3.0.8
184+
# via
185+
# -r requirements/base.txt
186+
# python-pptx
187+
zipp==3.15.0
188+
# via
189+
# -r requirements/base.txt
190+
# importlib-metadata

setup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
packages=find_packages(),
4848
version=__version__,
4949
entry_points={
50-
'console_scripts': ['unstructured-ingest=unstructured.ingest.main:main'],
50+
"console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"],
5151
},
5252
install_requires=[
5353
"argilla",
@@ -79,10 +79,11 @@
7979
],
8080
"s3": ["boto3"],
8181
"github": [
82-
# NOTE - pygithub at 1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
82+
# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
8383
# In the future, we can update this to pygithub>1.58.0
8484
"pygithub==1.57.0",
8585
],
86+
"gitlab": ["python-gitlab"],
8687
"reddit": ["praw"],
8788
"wikipedia": ["wikipedia"],
8889
"google-drive": ["google-api-python-client"],

test_unstructured_ingest/test-ingest-github.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@ if [[ "$CI" == "true" ]]; then
1212
fi
1313

1414

15-
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
15+
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
1616

1717
if ! diff -ru github-downloadify-output test_unstructured_ingest/expected-structured-output/github-downloadify ; then
1818
echo
1919
echo "There are differences from the previously checked-in structured outputs."
20-
echo
20+
echo
2121
echo "If these differences are acceptable, copy the outputs from"
2222
echo "github-downloadify-output/ to test_unstructured_ingest/expected-structured-output/github-downloadify/ after running"
23-
echo
23+
echo
2424
echo " PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose"
2525
echo
2626
exit 1
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bash
2+
3+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
4+
cd "$SCRIPT_DIR"/.. || exit 1
5+
6+
PYTHONPATH=. ./unstructured/ingest/main.py \
7+
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
8+
--git-file-glob '*.md,*.txt' \
9+
--structured-output-dir gitlab-ingest-output \
10+
--git-branch 'v0.0.7' \
11+
--verbose
12+
13+
if [ "$(find 'gitlab-ingest-output' -type f -printf '.' | wc -c)" != 2 ]; then
14+
echo
15+
echo "2 files should have been created."
16+
exit 1
17+
fi

test_unstructured_ingest/test-ingest.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ cd "$SCRIPT_DIR"/.. || exit 1
77

88
./test_unstructured_ingest/test-ingest-s3.sh
99
./test_unstructured_ingest/test-ingest-github.sh
10+
./test_unstructured_ingest/test-ingest-gitlab.sh
1011
./test_unstructured_ingest/test-ingest-wikipedia.sh

0 commit comments

Comments
 (0)