Skip to content

Commit 26fc232

Browse files
authored
Merge pull request #44 from Cosmo-Tech/JREY/add_dump_to_azure_command
Add dump_to_azure command
2 parents a434ef2 + f013f30 commit 26fc232

File tree

3 files changed

+169
-0
lines changed

3 files changed

+169
-0
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# Copyright (C) - 2023 - 2024 - Cosmo Tech
2+
# This document and all information contained herein is the exclusive property -
3+
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
4+
# Any use, reproduction, translation, broadcasting, transmission, distribution,
5+
# etc., to any person is prohibited unless it has been previously and
6+
# specifically authorized by written means by Cosmo Tech.
7+
8+
from io import BytesIO
9+
10+
from azure.identity import ClientSecretCredential
11+
from azure.storage.blob import BlobServiceClient
12+
13+
import pyarrow.csv as pc
14+
import pyarrow.parquet as pq
15+
from cosmotech.coal.cli.utils.click import click
16+
from cosmotech.coal.cli.utils.decorators import web_help
17+
from cosmotech.coal.store.store import Store
18+
from cosmotech.coal.utils.logger import LOGGER
19+
20+
# Output formats accepted by the --output-type option.
VALID_TYPES = ("sqlite", "csv", "parquet")
25+
26+
27+
@click.command()
@click.option("--store-folder",
              envvar="CSM_PARAMETERS_ABSOLUTE_PATH",
              help="The folder containing the store files",
              metavar="PATH",
              type=str,
              show_envvar=True,
              required=True)
@click.option("--output-type",
              default="sqlite",
              help="Choose the type of file output to use",
              type=click.Choice(VALID_TYPES,
                                case_sensitive=False))
@click.option("--account-name",
              "account_name",
              envvar="AZURE_ACCOUNT_NAME",
              help="The account name on Azure to upload to",
              type=str,
              show_envvar=True,
              required=True)
# An empty container name cannot designate a container, so the option is
# required (like the other Azure options) instead of defaulting to "".
@click.option("--container-name",
              "container_name",
              envvar="AZURE_CONTAINER_NAME",
              help="The container name on Azure to upload to",
              type=str,
              show_envvar=True,
              required=True)
@click.option("--prefix",
              "file_prefix",
              envvar="CSM_DATA_PREFIX",
              help="A prefix by which all uploaded files should start with in the container",
              metavar="PREFIX",
              type=str,
              show_envvar=True,
              default="")
@click.option("--tenant-id",
              "tenant_id",
              help="Tenant Identity used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_TENANT_ID")
@click.option("--client-id",
              "client_id",
              help="Client Identity used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_CLIENT_ID")
@click.option("--client-secret",
              "client_secret",
              help="Client Secret tied to the ID used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_CLIENT_SECRET")
@web_help("csm-data/store/dump-to-azure")
def dump_to_azure(
    store_folder,
    account_name: str,
    container_name: str,
    tenant_id: str,
    client_id: str,
    client_secret: str,
    output_type: str,
    file_prefix: str
):
    """Dump a datastore to an Azure storage account.

    Will upload everything from a given data store to an Azure storage container.

    3 modes currently exist:
      - sqlite : will dump the data store underlying database as is
      - csv : will convert every table of the datastore to csv and send them as separate files
      - parquet : will convert every table of the datastore to parquet and send them as separate files

    Makes use of the azure.storage.blob library to access the container

    More information is available on this page:
    https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=managed-identity%2Croles-azure-portal%2Csign-in-azure-cli&pivots=blob-storage-quickstart-scratch
    """
    # NOTE: the docstring above is what click prints as the command help, so
    # developer-facing details are kept in comments.
    #
    # store_folder   -- location handed to Store(store_location=...)
    # account_name   -- used to build https://<account>.blob.core.windows.net/
    # container_name -- container receiving every uploaded blob
    # tenant_id / client_id / client_secret -- ClientSecretCredential inputs
    # output_type    -- one of VALID_TYPES; ValueError is raised otherwise
    # file_prefix    -- prepended verbatim to every uploaded blob name
    _s = Store(store_location=store_folder)

    # click.Choice already validates CLI input; this check still guards the
    # case where an unexpected value reaches the function.
    if output_type not in VALID_TYPES:
        LOGGER.error(f"{output_type} is not a valid type of output")
        raise ValueError(f"{output_type} is not a valid type of output")

    credential = ClientSecretCredential(tenant_id=tenant_id,
                                        client_id=client_id,
                                        client_secret=client_secret)
    service_client = BlobServiceClient(
        account_url=f"https://{account_name}.blob.core.windows.net/",
        credential=credential)
    container_client = service_client.get_container_client(container_name)

    def data_upload(data_stream: BytesIO, file_name: str):
        """Upload the full content of ``data_stream`` as ``file_prefix + file_name``."""
        uploaded_file_name = file_prefix + file_name
        # Read the payload size from the buffer view instead of copying the
        # whole content with read() just to measure it.
        with data_stream.getbuffer() as view:
            size = view.nbytes
        # Rewind so the upload starts from the first byte.
        data_stream.seek(0)

        LOGGER.info(f" Sending {size} bytes of data")
        container_client.upload_blob(name=uploaded_file_name, data=data_stream, length=size, overwrite=True)

    if output_type == "sqlite":
        # The database file is streamed from disk as-is.
        _file_path = _s._database_path
        _uploaded_file_name = file_prefix + "db.sqlite"
        LOGGER.info(f"Sending {_file_path} as {_uploaded_file_name}")
        with open(_file_path, "rb") as data:
            container_client.upload_blob(name=_uploaded_file_name, data=data, overwrite=True)
        return

    # Remaining formats: one blob per non-empty table. The serializer is picked
    # once; each blob's extension is the output type itself (.csv / .parquet).
    serializers = {
        "csv": pc.write_csv,
        "parquet": pq.write_table,
    }
    serialize = serializers[output_type]

    # Table names are materialized up front, as in the original code, before
    # any table is read from the store.
    for table_name in list(_s.list_tables()):
        _data = _s.get_table(table_name)
        if not len(_data):
            LOGGER.info(f"Table {table_name} is empty (skipping)")
            continue
        # Allocate the buffer only for tables that are actually uploaded.
        _data_stream = BytesIO()
        serialize(_data, _data_stream)
        LOGGER.info(f"Sending table {table_name} as {output_type}")
        data_upload(_data_stream, f"{table_name}.{output_type}")

cosmotech/coal/cli/commands/store/store.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# specifically authorized by written means by Cosmo Tech.
77

88
from cosmotech.coal.cli.commands.api.rds_send_store import rds_send_store
9+
from cosmotech.coal.cli.commands.store.dump_to_azure import dump_to_azure
910
from cosmotech.coal.cli.commands.store.dump_to_postgresql import dump_to_postgresql
1011
from cosmotech.coal.cli.commands.store.dump_to_s3 import dump_to_s3
1112
from cosmotech.coal.cli.commands.store.list_tables import list_tables
@@ -31,3 +32,4 @@ def store():
3132
store.add_command(load_csv_folder, "load-csv-folder")
3233
store.add_command(dump_to_postgresql, "dump-to-postgresql")
3334
store.add_command(dump_to_s3, "dump-to-s3")
35+
store.add_command(dump_to_azure, "dump-to-azure")
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
hide:
3+
- toc
4+
description: "Command help: `csm-data store dump-to-azure`"
5+
---
6+
# dump-to-azure
7+
8+
!!! info "Help command"
9+
```text
10+
--8<-- "generated/commands_help/csm-data/store/dump-to-azure.txt"
11+
```

0 commit comments

Comments
 (0)