# storage_client.py
# Forked from dataforgoodfr/13_pollution_eau
import os
import io
import logging
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError
import pandas as pd
from tqdm import tqdm
"""Client class to interact with Scaleway Object Storage."""
logger = logging.getLogger(__name__)
class ObjectStorageClient:
    """Client to interact with Scaleway Object Storage (S3-compatible API).

    Credentials are read from the ``SCW_ACCESS_KEY`` / ``SCW_SECRET_KEY``
    environment variables. All operations target the single fixed bucket
    ``pollution-eau-s3`` in the ``fr-par`` region.
    """

    region_name = "fr-par"
    endpoint_url = "https://s3.fr-par.scw.cloud"
    bucket_name = "pollution-eau-s3"

    def __init__(self):
        # Scaleway requires the V2 signature for uploads and V4 for
        # downloads, so keep one client of each flavour.
        self.client_v2 = self.build_client("s3")
        self.client_v4 = self.build_client("s3v4")

    @staticmethod
    def build_client(signature_version: str = "s3v4"):
        """Build a boto3 S3 client for the configured Scaleway endpoint.

        :param signature_version: "s3" (V2 signature) or "s3v4" (V4).
        :return: a configured boto3 S3 client.
        """
        return boto3.session.Session().client(
            service_name="s3",
            config=Config(signature_version=signature_version),
            region_name=ObjectStorageClient.region_name,
            use_ssl=True,
            endpoint_url=ObjectStorageClient.endpoint_url,
            aws_access_key_id=os.getenv("SCW_ACCESS_KEY"),
            aws_secret_access_key=os.getenv("SCW_SECRET_KEY"),
        )

    def list_objects(self, prefix=None):
        """List objects in the bucket, optionally filtered by key prefix.

        :param prefix: optional key prefix to filter on.
        :return: list of object metadata dicts; empty list when nothing
            matches or when the request fails.
        """
        try:
            # BUGFIX: boto3 rejects Prefix=None with a ParamValidationError,
            # so only include the parameter when a prefix was given.
            kwargs = {"Bucket": self.bucket_name}
            if prefix is not None:
                kwargs["Prefix"] = prefix
            response = self.client_v4.list_objects(**kwargs)
            return response.get("Contents", [])
        except ClientError as e:
            logger.error(
                "Error list_objects in bucket %s: %s/%s", self.bucket_name, prefix, e
            )
            # BUGFIX: was an implicit None; keep the return type consistent.
            return []

    def download_object(self, file_key, local_path):
        """Download an object to a local file, with a tqdm progress bar.

        :param file_key: key of the object in the bucket.
        :param local_path: destination path on the local filesystem.
        """
        try:
            # HEAD first to learn the object size so the progress bar
            # can show a meaningful total.
            meta_data = self.client_v4.head_object(
                Bucket=self.bucket_name, Key=file_key
            )
            total_length = int(meta_data.get("ContentLength", 0))
            with tqdm(
                total=total_length, unit="iB", unit_scale=True, desc=file_key
            ) as pbar:
                self.client_v4.download_file(
                    self.bucket_name,
                    file_key,
                    local_path,
                    Callback=lambda bytes_transferred: pbar.update(bytes_transferred),
                )
        except ClientError as e:
            # BUGFIX: message previously said "deleting" (copy-paste from
            # delete_object); it now names the actual operation.
            logger.error(
                "Error downloading object '%s' from S3 bucket '%s': %s",
                file_key,
                self.bucket_name,
                e,
            )

    def upload_object(self, local_path, file_key=None, public_read=False):
        """Upload a local file to the bucket (V2-signed client).

        :param local_path: path of the file to upload.
        :param file_key: destination key; defaults to the file's basename.
        :param public_read: if True, set the "public-read" ACL.
        """
        try:
            if file_key is None:
                file_key = os.path.basename(local_path)
            self.client_v2.upload_file(
                local_path,
                self.bucket_name,
                file_key,
                ExtraArgs={"ACL": "public-read"} if public_read else None,
            )
        except ClientError as e:
            logger.error(
                "Boto Client Error upload_object '%s' to bucket '%s': %s",
                local_path,
                self.bucket_name,
                e,
            )
        except Exception as e:
            # Catch-all kept deliberately: upload is best-effort and the
            # caller is not expected to handle transport-level failures.
            logger.error(
                "Error upload_object '%s' to bucket '%s': %s",
                local_path,
                self.bucket_name,
                e,
            )

    def upload_dataframe(self, df, file_key):
        """Serialize a pandas DataFrame to CSV and upload it to the bucket.

        :param df: the DataFrame to upload.
        :param file_key: destination key in the bucket.
        """
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        try:
            self.client_v2.put_object(
                Bucket=self.bucket_name, Key=file_key, Body=csv_buffer.getvalue()
            )
        except ClientError as e:
            logger.error(
                "Error upload_dataframe to S3 bucket '%s': %s", self.bucket_name, e
            )

    def read_object_as_dataframe(self, file_key):
        """Read a CSV object from the bucket into a pandas DataFrame.

        :param file_key: key of the CSV object.
        :return: the parsed DataFrame; empty DataFrame when the read fails.
        """
        try:
            response = self.client_v4.get_object(Bucket=self.bucket_name, Key=file_key)
            csv_data = response["Body"].read().decode("utf-8")
            return pd.read_csv(io.StringIO(csv_data))
        except ClientError as e:
            logger.error(
                "Error read_object_as_dataframe '%s' from S3 bucket '%s': %s",
                file_key,
                self.bucket_name,
                e,
            )
            return pd.DataFrame()

    def delete_object(self, key):
        """Delete an object from the bucket.

        :param key: key of the object to delete.
        """
        try:
            self.client_v4.delete_object(Bucket=self.bucket_name, Key=key)
        except ClientError as e:
            logger.error(
                "Error delete_object '%s' from S3 bucket '%s': %s",
                key,
                self.bucket_name,
                e,
            )