Skip to content

Commit e52dd5c

Browse files
authored
feat: add requires_dependencies decorator (#302)
* Add `requires_dependencies` decorator * Use `required_dependencies` on Reddit & S3 * Fix bug in `requires_dependencies` To used named args the decorator needs to be also wrapped * Add `requires_dependencies` integration tests * Add `requires_dependencies` in `Competition.md` * Update `CHANGELOG.md` * Bump version 0.4.16-dev5 * Ignore `F401` unused imports in `requires_dependencies` tests * Apply suggestions from code review * Add `functools.wrap` to keep docs, & annotations * Use `requires_dependencies` in `GitHubConnector`
1 parent 54a6db1 commit e52dd5c

File tree

7 files changed

+86
-2
lines changed

7 files changed

+86
-2
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
### Enhancements
44

5+
* Add `requires_dependencies` Python decorator to check dependencies are installed before
6+
instantiating a class or running a function
7+
58
### Features
69

710
* Added Wikipedia connector for ingest cli.

Ingest.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ In checklist form, the above steps are summarized as:
6767
- [ ] Add them as an extra to [setup.py](unstructured/setup.py).
6868
- [ ] Update the Makefile, adding a target for `install-ingest-<name>` and adding another `pip-compile` line to the `pip-compile` make target. See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference.
6969
- [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports.
70+
- [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class or function that uses those connector-specific dependencies, e.g. `S3Connector` should be decorated with `@requires_dependencies(dependencies=["boto3"], extras="s3")`.
7071
- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured/ingest/interfaces.py](unstructured/ingest/interfaces.py) which is passed through [the CLI](unstructured/ingest/main.py):
7172
- [ ] If running with an `.output_dir` where structured outputs already exist for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()` which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured/ingest/main.py).
7273
- [ ] Unless `.reprocess` is `True`, in which case documents are always reprocessed.
7374
- [ ] If `.preserve_download` is `True`, documents downloaded to `.download_dir` are not removed after processing.
7475
- [ ] Else if `.preserve_download` is `False`, documents downloaded to `.download_dir` are removed after they are **successfully** processed during the invocation of `MyIngestDoc.cleanup_file()` in [process_document](unstructured/ingest/doc_processor/generalized.py)
7576
- [ ] Does not re-download documents to `.download_dir` if `.re_download` is False, enforced in `MyIngestDoc.get_file()`
7677
- [ ] Prints more details if `.verbose` similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).
77-

test_unstructured/test_utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,48 @@ def test_save_as_jsonl(input_data, output_jsonl_file):
3737
def test_read_as_jsonl(input_jsonl_file, input_data):
3838
file_data = utils.read_from_jsonl(input_jsonl_file)
3939
assert file_data == input_data
40+
41+
42+
def test_requires_dependencies_decorator():
    """A function guarded by one importable dependency must run without raising.

    Relies on `numpy` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    def guarded():
        import numpy  # noqa: F401

    guarded()
48+
49+
50+
def test_requires_dependencies_decorator_multiple():
    """A function guarded by several importable dependencies must run without raising.

    Relies on `numpy` and `pandas` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def guarded():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    guarded()
57+
58+
59+
def test_requires_dependencies_decorator_import_error():
    """The decorator itself must raise ImportError when the dependency is missing."""

    @utils.requires_dependencies(dependencies="not_a_package")
    def test_func():
        import not_a_package  # noqa: F401

    # `match` pins the decorator's own message. Without it the test would also
    # pass if the decorator did nothing, because the import inside `test_func`
    # raises ImportError (with a different message) on its own.
    with pytest.raises(ImportError, match="Please install them using"):
        test_func()
66+
67+
68+
def test_requires_dependencies_decorator_import_error_multiple():
    """With a mix of missing and present dependencies, only the missing one is reported.

    Relies on `numpy` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def test_func():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    # The closing backtick right after `not_a_package` checks that the install
    # hint names only the missing package (no `numpy`). `match` also ensures
    # the error comes from the decorator rather than from the import statements
    # inside `test_func`.
    with pytest.raises(ImportError, match="pip install not_a_package`"):
        test_func()
76+
77+
78+
def test_requires_dependencies_decorator_in_class():
    """Instantiating a decorated class must succeed when its dependency is importable.

    Relies on `numpy` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    class GuardedClass:
        def __init__(self):
            import numpy  # noqa: F401

    GuardedClass()

unstructured/ingest/connector/github.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
BaseConnectorConfig,
1414
BaseIngestDoc,
1515
)
16+
from unstructured.utils import requires_dependencies
1617

1718
if TYPE_CHECKING:
1819
from github.Repository import Repository
@@ -124,6 +125,7 @@ def write_result(self):
124125
print(f"Wrote {output_filename}")
125126

126127

128+
@requires_dependencies(["pygithub"], extras="github")
127129
class GitHubConnector(BaseConnector):
128130
def __init__(self, config: SimpleGitHubConfig):
129131
from github import Github

unstructured/ingest/connector/reddit.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
BaseConnectorConfig,
1010
BaseIngestDoc,
1111
)
12+
from unstructured.utils import requires_dependencies
1213

1314
if TYPE_CHECKING:
1415
from praw.models import Submission
@@ -87,6 +88,7 @@ def write_result(self):
8788
print(f"Wrote {output_filename}")
8889

8990

91+
@requires_dependencies(["praw"], extras="reddit")
9092
class RedditConnector(BaseConnector):
9193
def __init__(self, config: SimpleRedditConfig):
9294
from praw import Reddit

unstructured/ingest/connector/s3_connector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
BaseConnectorConfig,
1010
BaseIngestDoc,
1111
)
12+
from unstructured.utils import requires_dependencies
1213

1314

1415
@dataclass
@@ -84,6 +85,7 @@ def _create_full_tmp_dir_path(self):
8485
"""includes "directories" in s3 object path"""
8586
self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
8687

88+
@requires_dependencies(["boto3"], extras="s3")
8789
def get_file(self):
8890
"""Actually fetches the file from s3 and stores it locally."""
8991
import boto3
@@ -130,6 +132,7 @@ def cleanup_file(self):
130132
os.unlink(self._tmp_download_file())
131133

132134

135+
@requires_dependencies(["boto3"], extras="s3")
133136
class S3Connector(BaseConnector):
134137
"""Objects of this class support fetching document(s) from"""
135138

unstructured/utils.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
import importlib
12
import json
2-
from typing import Dict, List
3+
from functools import wraps
4+
from typing import Dict, List, Optional, Union
35

46

57
def save_as_jsonl(data: List[Dict], filename: str) -> None:
@@ -10,3 +12,30 @@ def save_as_jsonl(data: List[Dict], filename: str) -> None:
1012
def read_from_jsonl(filename: str) -> List[Dict]:
1113
with open(filename) as input_file:
1214
return [json.loads(line) for line in input_file]
15+
16+
17+
def requires_dependencies(dependencies: Union[str, List[str]], extras: Optional[str] = None):
    """Build a decorator that verifies modules are importable before each call.

    The check runs every time the decorated callable is invoked, not at
    decoration time, so decorating something never fails by itself.

    Args:
        dependencies: One module name, or a list of module names, passed to
            `importlib.import_module`. These are *import* names, which may
            differ from the name of the distribution installed with pip.
        extras: Optional name of the `unstructured` extra that provides the
            dependencies. When given, the error message suggests
            `pip install unstructured[<extras>]`; otherwise it suggests
            installing the missing modules by name.

    Returns:
        A decorator that wraps the callable and preserves its metadata via
        `functools.wraps`.

    Raises:
        ImportError: Raised by the wrapped callable, when it is called, if
            one or more of the dependencies cannot be imported.
    """
    if isinstance(dependencies, str):
        dependencies = [dependencies]

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing_deps = []
            for dep in dependencies:
                try:
                    importlib.import_module(dep)
                except ImportError:
                    missing_deps.append(dep)
            if missing_deps:
                # Choose the install hint first. The previous single expression
                # `a + b if extras else c` parsed as `(a + b) if extras else c`
                # and dropped the list of missing dependencies when no extras
                # name was supplied.
                if extras:
                    install_hint = f"pip install unstructured[{extras}]"
                else:
                    install_hint = f"pip install {' '.join(missing_deps)}"
                raise ImportError(
                    f"Following dependencies are missing: {', '.join(missing_deps)}. "
                    f"Please install them using `{install_hint}`.",
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator

0 commit comments

Comments
 (0)