Skip to content

Commit 5ac299d

Browse files
authored
Add s3 support to I/O operations. (#126)
1 parent a1e5b9f commit 5ac299d

File tree

13 files changed

+194
-59
lines changed

13 files changed

+194
-59
lines changed

.github/workflows/test-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
python-version: ${{ matrix.python-version }}
2828
cache: 'pip'
2929

30-
- name: Install dependencies
30+
- name: Install requirements
3131
run: |
3232
python -m pip install --upgrade pip
3333
pip install -r requirements.txt

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ python-benedict is a dict subclass with **keylist/keypath** support, **I/O** sho
2121
- **Keypath** support using **keypath-separator** *(dot syntax by default)*.
2222
- Keypath **list-index** support *(also negative)* using the standard `[n]` suffix.
2323
- Normalized **I/O operations** with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.
24+
- `NEW` Multiple **I/O operations** backends: `filepath` *(read/write)*, `url` *(read-only)*, `s3` *(read/write)*.
2425
- Many **utility** and **parse methods** to retrieve data as needed *(check the [API](#api) section)*.
2526
- Well **tested**. ;)
2627

@@ -437,7 +438,7 @@ d.unique()
437438

438439
### I/O methods
439440

440-
It is possible to create a `benedict` instance directly from data source (filepath, url or data-string) by passing the data source and the data format (default 'json') in the constructor.
441+
It is possible to create a `benedict` instance directly from data-source (`filepath`, `url`, `s3` or `data-string`) by passing the data source and the data format (optional, default 'json') in the constructor.
441442

442443
```python
443444
# filepath
@@ -446,11 +447,14 @@ d = benedict('/root/data.yml', format='yaml')
446447
# url
447448
d = benedict('https://localhost:8000/data.xml', format='xml')
448449

450+
# s3
451+
d = benedict('s3://my-bucket/data.xml', s3_options={"aws_access_key_id": "...", "aws_secret_access_key": "..."})
452+
449453
# data-string
450454
d = benedict('{"a": 1, "b": 2, "c": 3, "x": 7, "y": 8, "z": 9}')
451455
```
452456

453-
These methods simplify I/O operations with most common formats: `base64`, `csv`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xml`, `yaml`.
457+
These methods simplify I/O operations with most common formats: `base64`, `csv`, `ini`, `json`, `pickle`, `plist`, `query-string`, `toml`, `xls`, `xml`, `yaml`.
454458

455459
In all `from_*` methods, the first argument can be: **url**, **filepath**, **s3 url** or **data-string**.
456460

benedict/dicts/io/io_dict.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,7 @@ def _decode(s, format, **kwargs):
4444

4545
@staticmethod
4646
def _encode(d, format, **kwargs):
47-
filepath = kwargs.pop("filepath", None)
4847
s = io_util.encode(d, format, **kwargs)
49-
if filepath:
50-
io_util.write_file(filepath, s)
5148
return s
5249

5350
@classmethod

benedict/dicts/io/io_util.py

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@
33
from benedict.serializers import (
44
get_format_by_path,
55
get_serializer_by_format,
6-
get_serializers_extensions,
76
)
87

8+
# from botocore.exceptions import ClientError
9+
from urllib.parse import urlparse
10+
11+
import boto3
912
import fsutil
1013
import tempfile
1114

1215

1316
def autodetect_format(s):
14-
if is_url(s) or is_filepath(s):
17+
if any([is_url(s), is_s3(s), is_filepath(s)]):
1518
return get_format_by_path(s)
1619
return None
1720

@@ -20,20 +23,23 @@ def decode(s, format, **kwargs):
2023
serializer = get_serializer_by_format(format)
2124
if not serializer:
2225
raise ValueError(f"Invalid format: {format}.")
23-
decode_opts = kwargs.copy()
26+
options = kwargs.copy()
2427
if format in ["b64", "base64"]:
25-
decode_opts.setdefault("subformat", "json")
26-
content = read_content(s, format)
27-
data = serializer.decode(content, **decode_opts)
28+
options.setdefault("subformat", "json")
29+
content = read_content(s, format, **options)
30+
data = serializer.decode(content, **options)
2831
return data
2932

3033

31-
def encode(d, format, **kwargs):
34+
def encode(d, format, filepath=None, **kwargs):
3235
serializer = get_serializer_by_format(format)
3336
if not serializer:
3437
raise ValueError(f"Invalid format: {format}.")
35-
s = serializer.encode(d, **kwargs)
36-
return s
38+
options = kwargs.copy()
39+
content = serializer.encode(d, **options)
40+
if filepath:
41+
write_content(filepath, content, **options)
42+
return content
3743

3844

3945
def is_binary_format(format):
@@ -49,51 +55,94 @@ def is_data(s):
4955

5056

5157
def is_filepath(s):
52-
if any([s.endswith(ext) for ext in get_serializers_extensions()]):
53-
return True
54-
return fsutil.is_file(s)
58+
return fsutil.is_file(s) or get_format_by_path(s)
59+
60+
61+
def is_s3(s):
    """
    Tell whether ``s`` should be handled as an S3 object url.

    Returns ``False`` when ``s`` does not start with ``s3://``. Otherwise the
    value of ``get_format_by_path(s)`` is returned unchanged (it is not
    coerced to ``bool``), so callers should only rely on its truthiness.
    """
    if not s.startswith("s3://"):
        return False
    return get_format_by_path(s)
5563

5664

5765
def is_url(s):
5866
return any([s.startswith(protocol) for protocol in ["http://", "https://"]])
5967

6068

61-
def read_content(s, format):
69+
def parse_s3_url(url):
    """
    Split an ``s3://bucket/key`` url into its components.

    The url is parsed with ``allow_fragments=False``, so a ``#`` is not split
    off as a fragment. If ``urlparse`` isolates a query string, it is appended
    back to the key (prefixed with ``?``), so the key keeps everything that
    follows the bucket name.

    :param url: the S3 url, e.g. ``s3://my-bucket/dir/data.json``.
    :return: a dict with the keys ``url`` (the url as rebuilt by
        ``geturl()``), ``bucket`` (the network location) and ``key`` (the
        path without leading slashes, plus the query string if any).
    """
    parsed = urlparse(url, allow_fragments=False)
    key = parsed.path.lstrip("/")
    if parsed.query:
        # Use the local parse result: this is a module-level function, so
        # there is no ``self`` to read the query from.
        key = f"{key}?{parsed.query}"
    return {
        "url": parsed.geturl(),
        "bucket": parsed.netloc,
        "key": key,
    }
81+
82+
83+
def read_content(s, format=None, **options):
6284
# s -> filepath or url or data
85+
options.setdefault("format", format)
6386
s = s.strip()
6487
if is_data(s):
6588
return s
6689
elif is_url(s):
67-
return read_content_from_url(s, format)
90+
return read_content_from_url(s, **options)
91+
elif is_s3(s):
92+
return read_content_from_s3(s, **options)
6893
elif is_filepath(s):
69-
return read_content_from_file(s, format)
94+
return read_content_from_file(s, **options)
7095
# one-line data?!
7196
return s
7297

7398

74-
def read_content_from_file(filepath, format):
99+
def read_content_from_file(filepath, format=None, **options):
75100
binary_format = is_binary_format(format)
76101
if binary_format:
77102
return filepath
78-
return read_file(filepath)
103+
return fsutil.read_file(filepath)
104+
105+
106+
def read_content_from_s3(url, s3_options=None, format=None, **options):
    """
    Download the object behind an ``s3://`` url and read its content.

    The object is downloaded into the system temp directory, under the
    filename taken from the object key, and then read through
    ``read_content_from_file``.

    :param url: the ``s3://bucket/key`` url of the object to read.
    :param s3_options: keyword arguments forwarded to ``boto3.client``
        (e.g. ``aws_access_key_id`` / ``aws_secret_access_key``). When
        omitted or empty, the client is created without extra arguments.
    :param format: the data format, forwarded to ``read_content_from_file``.
    :param options: extra options, forwarded to ``read_content_from_file``.
    :return: whatever ``read_content_from_file`` returns for the downloaded
        file.
    """
    # Optional, so that a caller who passes no s3_options does not hit a
    # TypeError for a missing positional argument.
    s3_options = s3_options or {}
    s3_url = parse_s3_url(url)
    dirpath = tempfile.gettempdir()
    filename = fsutil.get_filename(s3_url["key"])
    filepath = fsutil.join_path(dirpath, filename)
    s3 = boto3.client("s3", **s3_options)
    try:
        s3.download_file(s3_url["bucket"], s3_url["key"], filepath)
    finally:
        # Release the client even when the download fails.
        s3.close()
    # The downloaded file is left in place on purpose: for binary formats
    # read_content_from_file returns the path itself rather than the content.
    return read_content_from_file(filepath, format, **options)
79116

80117

81-
def read_content_from_url(url, format, **options):
118+
def read_content_from_url(url, requests_options=None, format=None, **options):
119+
requests_options = requests_options or {}
82120
binary_format = is_binary_format(format)
83121
if binary_format:
84122
dirpath = tempfile.gettempdir()
85-
filepath = fsutil.download_file(url, dirpath, **options)
123+
filepath = fsutil.download_file(url, dirpath, **requests_options)
86124
return filepath
87-
return read_url(url, **options)
125+
return fsutil.read_file_from_url(url, **requests_options)
88126

89127

90-
def read_file(filepath, **options):
91-
return fsutil.read_file(filepath, **options)
128+
def write_content(filepath, content, **options):
    """
    Write ``content`` to ``filepath`` using the matching backend.

    When ``is_s3(filepath)`` is truthy the content goes through
    ``write_content_to_s3``; otherwise it is written with
    ``write_content_to_file``. ``options`` are forwarded to the chosen writer.
    """
    writer = write_content_to_s3 if is_s3(filepath) else write_content_to_file
    writer(filepath, content, **options)
92133

93134

94-
def read_url(url, **options):
95-
return fsutil.read_file_from_url(url, **options)
135+
def write_content_to_file(filepath, content, **options):
    """
    Write ``content`` to the local ``filepath`` via ``fsutil.write_file``.

    ``options`` are accepted so that all writers share the same call
    signature, but they are not forwarded to ``fsutil.write_file``.
    """
    fsutil.write_file(filepath, content)
96137

97138

98-
def write_file(filepath, content, **options):
99-
fsutil.write_file(filepath, content, **options)
139+
def write_content_to_s3(url, content, s3_options=None, **options):
    """
    Upload ``content`` to the object addressed by an ``s3://`` url.

    The content is first written to a file inside a private temporary
    directory, named after the object key, and then uploaded with
    ``upload_file``.

    :param url: the ``s3://bucket/key`` url of the destination object.
    :param content: the encoded content to upload.
    :param s3_options: keyword arguments forwarded to ``boto3.client``
        (e.g. ``aws_access_key_id`` / ``aws_secret_access_key``). When
        omitted or empty, the client is created without extra arguments.
    :param options: accepted so that all writers share the same call
        signature; not used here.
    """
    # Optional, so that a caller who passes no s3_options does not hit a
    # TypeError for a missing positional argument.
    s3_options = s3_options or {}
    s3_url = parse_s3_url(url)
    filename = fsutil.get_filename(s3_url["key"])
    # A dedicated temporary directory avoids clashing with an existing file
    # of the same name in the shared temp dir, and is removed on exit even
    # when the upload raises.
    with tempfile.TemporaryDirectory() as dirpath:
        filepath = fsutil.join_path(dirpath, filename)
        fsutil.write_file(filepath, content)
        s3 = boto3.client("s3", **s3_options)
        try:
            s3.upload_file(filepath, s3_url["bucket"], s3_url["key"])
        finally:
            # Release the client even when the upload fails.
            s3.close()

benedict/serializers/__init__.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,4 @@ def get_serializer_by_format(format):
7979
format_key = (format or "").lower().strip()
8080
format_key = re.sub(r"[\s\-\_]*", "", format_key)
8181
serializer = _SERIALIZERS_BY_EXTENSION.get(format_key, None)
82-
if not serializer:
83-
raise ValueError(f"Invalid format: {format}.")
8482
return serializer
85-
86-
87-
def get_serializers_extensions():
88-
return list(_SERIALIZERS_EXTENSIONS)

requirements-test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ codecov == 2.1.12
22
coverage == 6.5.0
33
flake8 == 5.0.4
44
orjson == 3.8.0
5+
python-decouple == 3.6
56
tox == 3.26.0

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
boto3 == 1.24.89
12
ftfy == 6.1.1
23
mailchecker == 5.0.2
34
openpyxl == 3.0.10

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
"unique",
9595
],
9696
install_requires=[
97+
"boto3 >= 1.24.89, < 2.0.0",
9798
"ftfy >= 6.0.0, < 7.0.0",
9899
"mailchecker >= 4.1.0, < 6.0.0",
99100
"openpyxl >= 3.0.0, < 4.0.0",

tests/dicts/base/test_base_dict.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,12 +234,12 @@ def test__str__with_pointer(self):
234234
def test__unicode__(self):
235235
d = BaseDict()
236236
d["name"] = "pythòn-bènèdìçt"
237-
print(unicode(d))
237+
# print(unicode(d))
238238

239239
@unittest.skipIf(sys.version_info[0] > 2, "No unicode in Python > 2")
240240
def test__unicode__with_pointer(self):
241241
d = BaseDict({"name": "pythòn-bènèdìçt"})
242-
print(unicode(d))
242+
# print(unicode(d))
243243

244244
def test_clear(self):
245245
d = {

tests/dicts/io/test_io_dict_xls.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from benedict.dicts.io import IODict
44

5+
from decouple import config
6+
57
from .test_io_dict import io_dict_test_case
68

79

@@ -104,7 +106,8 @@ def test_from_xls_with_valid_url_valid_content(self):
104106
with self.subTest(
105107
msg=f"test_from_xls_({extension})_with_valid_url_valid_content"
106108
):
107-
url = f"https://github.com/fabiocaccamo/python-benedict/raw/xls/tests/dicts/io/input/valid-content.{extension}"
109+
# url = f"https://github.com/fabiocaccamo/python-benedict/raw/s3/tests/dicts/io/input/valid-content.{extension}"
110+
url = f"https://github.com/fabiocaccamo/python-benedict/raw/master/tests/dicts/io/input/valid-content.{extension}"
108111
# static method
109112
d = IODict.from_xls(url)
110113
self.assertTrue(isinstance(d, dict))
@@ -118,6 +121,65 @@ def test_from_xls_with_valid_url_valid_content(self):
118121
self.assertTrue(isinstance(d, dict))
119122
self.assertEqual(d, expected_dict)
120123

124+
def test_from_xls_with_valid_s3_url_valid_content(self):
    """
    Read the sample spreadsheets from S3 through every ``IODict`` entry point.

    The test needs real credentials (``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY``, read through ``config``). When either is
    missing the test is reported as skipped rather than silently passing.
    """
    aws_access_key_id = config("AWS_ACCESS_KEY_ID", default=None)
    aws_secret_access_key = config("AWS_SECRET_ACCESS_KEY", default=None)
    if not all([aws_access_key_id, aws_secret_access_key]):
        # don't use s3 on GH CI
        self.skipTest("AWS credentials not configured, skipping s3 test.")
    s3_options = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
    }
    expected_dict = {
        "values": [
            {
                "mon": 10,
                "tue": 11,
                "wed": 12,
                "thu": 13,
                "fri": 14,
                "sat": 15,
                "sun": 16,
            },
            {
                "mon": 20,
                "tue": 21,
                "wed": 22,
                "thu": 23,
                "fri": 24,
                "sat": 25,
                "sun": 26,
            },
            {
                "mon": 30,
                "tue": 31,
                "wed": 32,
                "thu": 33,
                "fri": 34,
                "sat": 35,
                "sun": 36,
            },
        ]
    }
    for extension in self._extensions:
        with self.subTest(
            msg=f"test_from_xls_({extension})_with_valid_s3_url_valid_content"
        ):
            url = f"s3://python-benedict/valid-content.{extension}"
            # static method
            d = IODict.from_xls(url, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)
            # constructor explicit format
            d = IODict(url, format=extension, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)
            # constructor implicit format
            d = IODict(url, s3_options=s3_options)
            self.assertTrue(isinstance(d, dict))
            self.assertEqual(d, expected_dict)
182+
121183
def test_from_xls_with_valid_file_valid_content_custom_sheet_by_index_and_columns(
122184
self,
123185
):

0 commit comments

Comments
 (0)