|
# Copyright (C) - 2023 - 2024 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.

| 8 | +from io import BytesIO |
| 9 | + |
| 10 | +from azure.identity import ClientSecretCredential |
| 11 | +from azure.storage.blob import BlobServiceClient |
| 12 | + |
| 13 | +import pyarrow.csv as pc |
| 14 | +import pyarrow.parquet as pq |
| 15 | +from cosmotech.coal.cli.utils.click import click |
| 16 | +from cosmotech.coal.cli.utils.decorators import web_help |
| 17 | +from cosmotech.coal.store.store import Store |
| 18 | +from cosmotech.coal.utils.logger import LOGGER |
| 19 | + |
# Output formats the datastore can be dumped to (see dump_to_azure below).
VALID_TYPES = ("sqlite", "csv", "parquet")
| 25 | + |
| 26 | + |
@click.command()
@click.option("--store-folder",
              envvar="CSM_PARAMETERS_ABSOLUTE_PATH",
              help="The folder containing the store files",
              metavar="PATH",
              type=str,
              show_envvar=True,
              required=True)
@click.option("--output-type",
              default="sqlite",
              help="Choose the type of file output to use",
              type=click.Choice(VALID_TYPES,
                                case_sensitive=False))
@click.option("--account-name",
              "account_name",
              envvar="AZURE_ACCOUNT_NAME",
              help="The account name on Azure to upload to",
              type=str,
              show_envvar=True,
              required=True)
@click.option("--container-name",
              "container_name",
              envvar="AZURE_CONTAINER_NAME",
              help="The container name on Azure to upload to",
              type=str,
              show_envvar=True,
              default="")
@click.option("--prefix",
              "file_prefix",
              envvar="CSM_DATA_PREFIX",
              help="A prefix by which all uploaded files should start with in the container",
              metavar="PREFIX",
              type=str,
              show_envvar=True,
              default="")
@click.option("--tenant-id",
              "tenant_id",
              help="Tenant Identity used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_TENANT_ID")
@click.option("--client-id",
              "client_id",
              help="Client Identity used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_CLIENT_ID")
@click.option("--client-secret",
              "client_secret",
              help="Client Secret tied to the ID used to connect to Azure storage system",
              type=str,
              required=True,
              show_envvar=True,
              metavar="ID",
              envvar="AZURE_CLIENT_SECRET")
@web_help("csm-data/store/dump-to-azure")
def dump_to_azure(
    store_folder: str,
    account_name: str,
    container_name: str,
    tenant_id: str,
    client_id: str,
    client_secret: str,
    output_type: str,
    file_prefix: str
):
    """Dump a datastore to an Azure storage account.

Will upload everything from a given data store to an Azure storage container.

3 modes currently exist:
  - sqlite : will dump the data store underlying database as is
  - csv : will convert every table of the datastore to csv and send them as separate files
  - parquet : will convert every table of the datastore to parquet and send them as separate files

Makes use of the azure.storage.blob library to access the container.

More information is available on this page:
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=managed-identity%2Croles-azure-portal%2Csign-in-azure-cli&pivots=blob-storage-quickstart-scratch
"""
    _s = Store(store_location=store_folder)

    # click.Choice(case_sensitive=False) may hand the callback the user's own
    # casing (e.g. "CSV") depending on the click version; normalize so the
    # validation below and the mode dispatch both work regardless.
    output_type = output_type.lower()
    if output_type not in VALID_TYPES:
        LOGGER.error(f"{output_type} is not a valid type of output")
        raise ValueError(f"{output_type} is not a valid type of output")

    # Authenticate with a service principal, then scope down to the container.
    container_client = (BlobServiceClient(
        account_url=f"https://{account_name}.blob.core.windows.net/",
        credential=ClientSecretCredential(tenant_id=tenant_id,
                                          client_id=client_id,
                                          client_secret=client_secret))
        .get_container_client(container_name))

    def data_upload(data_stream: BytesIO, file_name: str):
        # Upload the full content of `data_stream` under `file_prefix + file_name`,
        # overwriting any existing blob of the same name.
        uploaded_file_name = file_prefix + file_name
        # getbuffer().nbytes measures the payload without copying the whole
        # buffer into a throwaway bytes object (the previous read()-based sizing).
        size = data_stream.getbuffer().nbytes
        data_stream.seek(0)

        LOGGER.info(f" Sending {size} bytes of data")
        container_client.upload_blob(name=uploaded_file_name, data=data_stream, length=size, overwrite=True)

    if output_type == "sqlite":
        # sqlite mode: ship the store's backing database file as-is.
        # NOTE(review): relies on the private Store._database_path attribute —
        # confirm no public accessor exists.
        _file_path = _s._database_path
        _file_name = "db.sqlite"
        _uploaded_file_name = file_prefix + _file_name
        LOGGER.info(f"Sending {_file_path} as {_uploaded_file_name}")
        with open(_file_path, "rb") as data:
            container_client.upload_blob(name=_uploaded_file_name, data=data, overwrite=True)
    else:
        # csv / parquet mode: serialize each non-empty table to its own blob.
        tables = list(_s.list_tables())
        for table_name in tables:
            _data_stream = BytesIO()
            _file_name = None
            _data = _s.get_table(table_name)
            if not len(_data):
                LOGGER.info(f"Table {table_name} is empty (skipping)")
                continue
            if output_type == "csv":
                _file_name = table_name + ".csv"
                pc.write_csv(_data, _data_stream)
            elif output_type == "parquet":
                _file_name = table_name + ".parquet"
                pq.write_table(_data, _data_stream)
            LOGGER.info(f"Sending table {table_name} as {output_type}")
            data_upload(_data_stream, _file_name)
0 commit comments