diff --git a/CosmoTech_Acceleration_Library/Accelerators/__init__.py b/CosmoTech_Acceleration_Library/Accelerators/__init__.py index 9fd9534b..e69de29b 100644 --- a/CosmoTech_Acceleration_Library/Accelerators/__init__.py +++ b/CosmoTech_Acceleration_Library/Accelerators/__init__.py @@ -1,4 +0,0 @@ -# Copyright (c) Cosmo Tech corporation. -# Licensed under the MIT license. -import os -parametersPath = os.environ.get("CSM_PARAMETERS_ABSOLUTE_PATH") \ No newline at end of file diff --git a/CosmoTech_Acceleration_Library/Accelerators/cosmo_api.py b/CosmoTech_Acceleration_Library/Accelerators/cosmo_api.py deleted file mode 100644 index 0be0085c..00000000 --- a/CosmoTech_Acceleration_Library/Accelerators/cosmo_api.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Cosmo Tech corporation. -# Licensed under the MIT license. -import io -import os - -import cosmotech_api -from azure.identity import DefaultAzureCredential -from cosmotech_api.api.scenario_api import ScenarioApi -from cosmotech_api.api.workspace_api import WorkspaceApi - - -from CosmoTech_Acceleration_Library.Accelerators.utils.multi_environment import MultiEnvironment - -env = MultiEnvironment() - - -def __get_configuration(): - api_url = env.api_host - api_scope = env.api_scope - credentials = DefaultAzureCredential() - token = credentials.get_token(api_scope) - - configuration = cosmotech_api.Configuration( - host=api_url, - access_token=token.token - ) - return configuration - - -def send_file_to_api(file_content, file_name: str): - """Send a file to the api""" - organization_id = os.environ.get("CSM_ORGANIZATION_ID") - workspace_id = os.environ.get("CSM_WORKSPACE_ID") - - with cosmotech_api.ApiClient(__get_configuration()) as api_client: - api_ws = WorkspaceApi(api_client) - api_ws.upload_workspace_file(organization_id=organization_id, - workspace_id=workspace_id, - file=file_content, - overwrite=True, - destination=file_name) - - -def send_dataframe_to_api(dataframe, file_name: str): - """Send a dataframe to the API""" - file_content = io.StringIO() - dataframe.to_csv(file_content, index=False) - file_content.seek(0) - file_content.name = file_name.split('/')[-1] - send_file_to_api(file_content, file_name) - - -def get_current_scenario_data(): - """ - Uses environment vars to find the current scenario data from the cosmotech api - :return: a dict containing the data of the scenario from the API or None in another context - """ - organization_id = os.environ.get("CSM_ORGANIZATION_ID") - workspace_id = os.environ.get("CSM_WORKSPACE_ID") - scenario_id = os.environ.get("CSM_SCENARIO_ID") - - if not all([organization_id, workspace_id, scenario_id]): - return None - - with cosmotech_api.ApiClient(__get_configuration()) as api_client: - api_instance = ScenarioApi(api_client) - scenario_data = api_instance.find_scenario_by_id(organization_id=organization_id, - workspace_id=workspace_id, - scenario_id=scenario_id) - return scenario_data diff --git a/CosmoTech_Acceleration_Library/Accelerators/scenario_download/scenario_downloader.py b/CosmoTech_Acceleration_Library/Accelerators/scenario_download/scenario_downloader.py index b5ab5808..08592617 100644 --- a/CosmoTech_Acceleration_Library/Accelerators/scenario_download/scenario_downloader.py +++ b/CosmoTech_Acceleration_Library/Accelerators/scenario_download/scenario_downloader.py @@ -19,11 +19,8 @@ from cosmotech_api import TwinGraphQuery from openpyxl import load_workbook -from CosmoTech_Acceleration_Library.Accelerators.utils.multi_environment import MultiEnvironment from 
cosmotech.coal.cosmotech_api.connection import get_api_client -env = MultiEnvironment() - def get_content_from_twin_graph_data(nodes, relationships, restore_names=False): ''' diff --git a/CosmoTech_Acceleration_Library/Accelerators/utils/multi_environment.py b/CosmoTech_Acceleration_Library/Accelerators/utils/multi_environment.py deleted file mode 100644 index 587b3995..00000000 --- a/CosmoTech_Acceleration_Library/Accelerators/utils/multi_environment.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - - -class MultiEnvironment: - - def __init__(self): - self.api_host = None - self.api_scope = None - - for host_var in ['COSMOTECH_API_SCOPE', 'CSM_API_SCOPE']: - if host_var in os.environ: - self.api_scope = os.environ.get(host_var) - break - - for host_var in ['COSMOTECH_API_HOST', 'COSMOTECH_API_URL', 'CSM_API_HOST', 'CSM_API_URL']: - if host_var in os.environ: - self.api_host = os.environ.get(host_var) - break diff --git a/CosmoTech_Acceleration_Library/Core/DataInterface/__init__.py b/CosmoTech_Acceleration_Library/Core/DataInterface/__init__.py deleted file mode 100644 index f07047b3..00000000 --- a/CosmoTech_Acceleration_Library/Core/DataInterface/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Cosmo Tech corporation. -# Licensed under the MIT license. \ No newline at end of file diff --git a/CosmoTech_Acceleration_Library/Core/DataStorage/__init__.py b/CosmoTech_Acceleration_Library/Core/DataStorage/__init__.py deleted file mode 100644 index f07047b3..00000000 --- a/CosmoTech_Acceleration_Library/Core/DataStorage/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Cosmo Tech corporation. -# Licensed under the MIT license. \ No newline at end of file diff --git a/CosmoTech_Acceleration_Library/Core/__init__.py b/CosmoTech_Acceleration_Library/Core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/cosmotech/coal/cli/commands/s3_bucket_download.py b/cosmotech/coal/cli/commands/s3_bucket_download.py new file mode 100644 index 00000000..3f2d248f --- /dev/null +++ b/cosmotech/coal/cli/commands/s3_bucket_download.py @@ -0,0 +1,125 @@ +# Copyright (C) - 2023 - 2024 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. 
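The two deleted helpers above are small: `MultiEnvironment` only resolved the API scope and host from a fixed list of environment variables, and `cosmo_api.py` built a `cosmotech_api` configuration from those values. For code that still needs the raw values after this refactor, here is a minimal stand-in sketch; the function name `resolve_api_env` is hypothetical and not part of the library.

```python
# Hypothetical replacement for the deleted MultiEnvironment class: same environment
# variable fallback order, returned as plain values instead of an object.
import os
from typing import Optional, Tuple


def resolve_api_env() -> Tuple[Optional[str], Optional[str]]:
    api_scope = next((os.environ[var] for var in ("COSMOTECH_API_SCOPE", "CSM_API_SCOPE")
                      if var in os.environ), None)
    api_host = next((os.environ[var] for var in ("COSMOTECH_API_HOST", "COSMOTECH_API_URL",
                                                 "CSM_API_HOST", "CSM_API_URL")
                     if var in os.environ), None)
    return api_host, api_scope
```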
+ +import pathlib +from typing import Optional + +import boto3 + +from cosmotech.coal.cli.utils.click import click +from cosmotech.coal.cli.utils.decorators import web_help +from cosmotech.coal.utils.logger import LOGGER + + +@click.command() +@click.option("--target-folder", + envvar="CSM_DATASET_ABSOLUTE_PATH", + help="The folder in which to download the bucket content", + metavar="PATH", + type=str, + show_envvar=True, + required=True) +@click.option("--bucket-name", + envvar="CSM_DATA_BUCKET_NAME", + help="The bucket on S3 to download", + metavar="BUCKET", + type=str, + show_envvar=True, + required=True) +@click.option("--prefix-filter", + "file_prefix", + envvar="CSM_DATA_BUCKET_PREFIX", + help="A prefix by which all downloaded files should start in the bucket", + metavar="PREFIX", + type=str, + show_envvar=True) +@click.option("--use-ssl/--no-ssl", + default=True, + help="Use SSL to secure connection to S3", + type=bool, + is_flag=True) +@click.option("--s3-url", + "endpoint_url", + help="URL to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="URL", + envvar="AWS_ENDPOINT_URL") +@click.option("--access-id", + "access_id", + help="Identity used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_ACCESS_KEY_ID") +@click.option("--secret-key", + "secret_key", + help="Secret tied to the ID used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_SECRET_ACCESS_KEY") +@click.option("--ssl-cert-bundle", + help="Path to an alternate CA Bundle to validate SSL connections", + type=str, + show_envvar=True, + metavar="PATH", + envvar="CSM_S3_CA_BUNDLE") +@web_help("csm-data/s3-bucket-download") +def s3_bucket_download( + target_folder: str, + bucket_name: str, + file_prefix: str, + endpoint_url: str, + access_id: str, + secret_key: str, + use_ssl: bool = True, + ssl_cert_bundle: Optional[str] = None, +): + """Download S3 bucket content to a given folder + +Will download everything in the bucket unless a prefix is set, then only file following the given prefix will be downloaded + +Make use of the boto3 library to access the bucket + +More information is available on this page: +[https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html) +""" + boto3_parameters = { + "use_ssl": use_ssl, + "endpoint_url": endpoint_url, + "aws_access_key_id": access_id, + "aws_secret_access_key": secret_key, + } + if ssl_cert_bundle: + boto3_parameters["verify"] = ssl_cert_bundle + + s3_resource = boto3.resource("s3", + **boto3_parameters) + + bucket = s3_resource.Bucket(bucket_name) + + pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True) + remove_prefix = False + if file_prefix: + bucket_files = bucket.objects.filter(Prefix=file_prefix) + if file_prefix.endswith("/"): + remove_prefix = True + else: + bucket_files = bucket.objects.all() + for _file in bucket_files: + if not (path_name := str(_file.key)).endswith("/"): + target_file = path_name + if remove_prefix: + target_file = target_file.removeprefix(file_prefix) + output_file = f"{target_folder}/{target_file}" + pathlib.Path(output_file).parent.mkdir(parents=True,exist_ok=True) + LOGGER.info(f"Downloading {path_name} to {output_file}") + bucket.download_file(_file.key, output_file) diff --git a/cosmotech/coal/cli/commands/s3_bucket_loader.py b/cosmotech/coal/cli/commands/s3_bucket_loader.py deleted 
file mode 100644 index 0289b397..00000000 --- a/cosmotech/coal/cli/commands/s3_bucket_loader.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (C) - 2023 - 2024 - Cosmo Tech -# This document and all information contained herein is the exclusive property - -# including all intellectual property rights pertaining thereto - of Cosmo Tech. -# Any use, reproduction, translation, broadcasting, transmission, distribution, -# etc., to any person is prohibited unless it has been previously and -# specifically authorized by written means by Cosmo Tech. - -import pathlib - -import boto3 - -from cosmotech.coal.cli.utils.click import click -from cosmotech.coal.cli.utils.decorators import web_help -from cosmotech.coal.utils.logger import LOGGER - - -def get_connection(is_client=True): - connect_function = boto3.client if is_client else boto3.resource - return connect_function('s3') - - -@click.command() -@click.option("--target-folder", - envvar="CSM_DATASET_ABSOLUTE_PATH", - help="The folder in which to download the bucket content", - metavar="PATH", - type=str, - show_envvar=True, - required=True) -@click.option("--bucket-name", - envvar="CSM_DATA_BUCKET_NAME", - help="The bucket on S3 to download", - metavar="BUCKET", - type=str, - show_envvar=True, - required=True) -@web_help("csm-data/s3-bucket-load") -def s3_bucket_load(target_folder, bucket_name): - """Download S3 bucket content to a given folder - -Make use of the default AWS/S3 configuration to access the bucket - -More information is available on this page: -[https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html) - -The following environment variables can be used to configure the connection: -- `AWS_ENDPOINT_URL` : The uri pointing to the S3 service endpoint -- `AWS_ACCESS_KEY_ID` : Your access key to the service -- `AWS_SECRET_ACCESS_KEY` : The secret associated to the access key -""" - s3_resource = get_connection(False) - s3_client = get_connection() - - bucket = s3_resource.Bucket(bucket_name) - - pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True) - - for _file in bucket.objects.all(): - LOGGER.info(f"Downloading {_file.key}") - output_file = f"{target_folder}/{_file.key}" - s3_client.download_file(bucket_name, _file.key, output_file) diff --git a/cosmotech/coal/cli/commands/s3_bucket_upload.py b/cosmotech/coal/cli/commands/s3_bucket_upload.py new file mode 100644 index 00000000..8ff7fadb --- /dev/null +++ b/cosmotech/coal/cli/commands/s3_bucket_upload.py @@ -0,0 +1,132 @@ +# Copyright (C) - 2023 - 2024 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. 
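For context on the `s3-bucket-load` command removed above and its `s3-bucket-download` replacement added earlier in this diff, the sketch below reproduces the new download loop with plain boto3; the endpoint, credentials, bucket name and paths are placeholders. When the prefix ends with "/", it is stripped from the local destination so the bucket "folder" is not recreated on disk.

```python
# Sketch of the s3-bucket-download behaviour using the boto3 resource API directly.
# Endpoint, credentials, bucket name and paths are illustrative placeholders.
import pathlib

import boto3

s3 = boto3.resource("s3",
                    endpoint_url="https://s3.example.com",
                    aws_access_key_id="ACCESS_ID",
                    aws_secret_access_key="SECRET_KEY")
bucket = s3.Bucket("my-bucket")
prefix = "input/"
target_folder = pathlib.Path("/tmp/dataset")

for obj in bucket.objects.filter(Prefix=prefix):
    if obj.key.endswith("/"):  # skip "directory" placeholder keys
        continue
    relative_path = obj.key.removeprefix(prefix) if prefix.endswith("/") else obj.key
    output_file = target_folder / relative_path
    output_file.parent.mkdir(parents=True, exist_ok=True)
    bucket.download_file(obj.key, str(output_file))
```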
+ +import pathlib +from typing import Optional + +import boto3 + +from cosmotech.coal.cli.utils.click import click +from cosmotech.coal.cli.utils.decorators import web_help +from cosmotech.coal.utils.logger import LOGGER + + +@click.command() +@click.option("--source-folder", + envvar="CSM_DATASET_ABSOLUTE_PATH", + help="The folder/file to upload to the target bucket", + metavar="PATH", + type=str, + show_envvar=True, + required=True) +@click.option("--recursive/--no-recursive", + default=False, + help="Recursively send the content of every folder inside the starting folder to the bucket", + type=bool, + is_flag=True) +@click.option("--bucket-name", + envvar="CSM_DATA_BUCKET_NAME", + help="The bucket on S3 to upload to", + metavar="BUCKET", + type=str, + show_envvar=True, + required=True) +@click.option("--prefix", + "file_prefix", + envvar="CSM_DATA_BUCKET_PREFIX", + help="A prefix by which all uploaded files should start with in the bucket", + metavar="PREFIX", + type=str, + show_envvar=True, + default="") +@click.option("--use-ssl/--no-ssl", + default=True, + help="Use SSL to secure connection to S3", + type=bool, + is_flag=True) +@click.option("--s3-url", + "endpoint_url", + help="URL to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="URL", + envvar="AWS_ENDPOINT_URL") +@click.option("--access-id", + "access_id", + help="Identity used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_ACCESS_KEY_ID") +@click.option("--secret-key", + "secret_key", + help="Secret tied to the ID used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_SECRET_ACCESS_KEY") +@click.option("--ssl-cert-bundle", + help="Path to an alternate CA Bundle to validate SSL connections", + type=str, + show_envvar=True, + metavar="PATH", + envvar="CSM_S3_CA_BUNDLE") +@web_help("csm-data/s3-bucket-upload") +def s3_bucket_upload( + source_folder, + bucket_name: str, + endpoint_url: str, + access_id: str, + secret_key: str, + file_prefix: str = "", + use_ssl: bool = True, + ssl_cert_bundle: Optional[str] = None, + recursive: bool = False +): + """Upload a folder to a S3 Bucket + +Will upload everything from a given folder to a S3 bucket. 
If a single file is passed, only that file will be uploaded and the recursive option will be ignored + +Giving a prefix will add it to every upload (ending the prefix with a "/" allows uploading into a folder inside the bucket) + +Make use of the boto3 library to access the bucket + +More information is available on this page: +[https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html) +""" + source_path = pathlib.Path(source_folder) + if not source_path.exists(): + LOGGER.error(f"{source_folder} does not exist") + raise FileNotFoundError(f"{source_folder} does not exist") + + boto3_parameters = { + "use_ssl": use_ssl, + "endpoint_url": endpoint_url, + "aws_access_key_id": access_id, + "aws_secret_access_key": secret_key, + } + if ssl_cert_bundle: + boto3_parameters["verify"] = ssl_cert_bundle + + s3_client = boto3.client("s3", **boto3_parameters) + + def file_upload(file_path: pathlib.Path, file_name: str): + uploaded_file_name = file_prefix + file_name + LOGGER.info(f"Sending {file_path} as {uploaded_file_name}") + s3_client.upload_file(file_path, bucket_name, uploaded_file_name) + + if source_path.is_dir(): + _source_name = str(source_path) + for _file_path in source_path.glob("**/*" if recursive else "*"): + if _file_path.is_file(): + _file_name = str(_file_path).removeprefix(_source_name).removeprefix("/") + file_upload(_file_path, _file_name) + else: + file_upload(source_path, source_path.name) diff --git a/cosmotech/coal/cli/commands/store/dump_to_postgresql.py b/cosmotech/coal/cli/commands/store/dump_to_postgresql.py new file mode 100644 index 00000000..7eba40c7 --- /dev/null +++ b/cosmotech/coal/cli/commands/store/dump_to_postgresql.py @@ -0,0 +1,119 @@ +# Copyright (C) - 2023 - 2024 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech.
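The next file added by this diff, `store/dump_to_postgresql.py`, is built around ADBC bulk ingestion; the snippet below isolates that call so the command body that follows is easier to read. The connection URI, schema and table name are placeholders.

```python
# Minimal sketch of the ADBC bulk-ingest call used by the dump-to-postgresql command.
# URI, schema and table name below are illustrative placeholders.
import pyarrow as pa
from adbc_driver_postgresql import dbapi

data = pa.table({"id": [1, 2], "value": [10.0, 20.0]})
uri = "postgresql://user:password@localhost:5432/cosmotech"

with dbapi.connect(uri, autocommit=True) as conn:
    with conn.cursor() as curs:
        # "replace" drops and recreates the table; "create_append" keeps existing rows.
        rows = curs.adbc_ingest("Cosmotech_example", data, "replace",
                                db_schema_name="public")

print(f"{rows} rows ingested")
```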
+ +from time import perf_counter + +from adbc_driver_postgresql import dbapi + +from cosmotech.coal.cli.utils.click import click +from cosmotech.coal.cli.utils.decorators import web_help +from cosmotech.coal.store.store import Store +from cosmotech.coal.utils.logger import LOGGER + + +@click.command() +@web_help("csm-data/store/dump-to-postgres") +@click.option("--store-folder", + envvar="CSM_PARAMETERS_ABSOLUTE_PATH", + help="The folder containing the store files", + metavar="PATH", + type=str, + show_envvar=True, + required=True) +@click.option("--table-prefix", + help="Prefix to add to the table name", + metavar="PREFIX", + type=str, + default="Cosmotech_") +@click.option('--postgres-host', + help='Postgresql host URI', + envvar="POSTGRES_HOST_URI", + show_envvar=True, + required=True) +@click.option('--postgres-port', + help='Postgresql database port', + envvar="POSTGRES_HOST_PORT", + show_envvar=True, + required=False, + default=5432) +@click.option('--postgres-db', + help='Postgresql database name', + envvar="POSTGRES_DB_NAME", + show_envvar=True, + required=True) +@click.option('--postgres-schema', + help='Postgresql schema name', + envvar="POSTGRES_DB_SCHEMA", + show_envvar=True, + required=True) +@click.option('--postgres-user', + help='Postgresql connection user name', + envvar="POSTGRES_USER_NAME", + show_envvar=True, + required=True) +@click.option('--postgres-password', + help='Postgresql connection password', + envvar="POSTGRES_USER_PASSWORD", + show_envvar=True, + required=True) +@click.option("--replace/--append", + "replace", + help="Replace (default) or append data in existing tables", + default=True, + is_flag=True, + show_default=True) +def dump_to_postgresql( + store_folder, + table_prefix: str, + postgres_host, + postgres_port, + postgres_db, + postgres_schema, + postgres_user, + postgres_password, + replace: bool ): + """Running this command will dump your store to a given postgresql database + + Table names from the store will be prefixed with table-prefix in the target database + """ + _s = Store(store_location=store_folder) + + postgresql_full_uri = (f'postgresql://' + f'{postgres_user}' + f':{postgres_password}' + f'@{postgres_host}' + f':{postgres_port}' + f'/{postgres_db}') + + tables = list(_s.list_tables()) + if len(tables): + LOGGER.info(f"Sending tables to [green bold]{postgres_db}.{postgres_schema}[/]") + total_rows = 0 + _process_start = perf_counter() + with dbapi.connect(postgresql_full_uri, autocommit=True) as conn: + for table_name in tables: + with conn.cursor() as curs: + _s_time = perf_counter() + target_table_name = f"{table_prefix}{table_name}" + data = _s.get_table(table_name) + _dl_time = perf_counter() + rows = curs.adbc_ingest( + target_table_name, + data, + "replace" if replace else "create_append", + db_schema_name=postgres_schema) + total_rows += rows + _up_time = perf_counter() + LOGGER.info(f" - [yellow]{target_table_name}[/] : [cyan bold]{rows}[/] rows") + LOGGER.debug(f" -> Load from datastore took [blue]{_dl_time - _s_time:0.3}s[/]") + LOGGER.debug(f" -> Send to postgresql took [blue]{_up_time - _dl_time:0.3}s[/]") + _process_end = perf_counter() + LOGGER.info(f"Sent [cyan bold]{total_rows}[/] rows " + f"in [blue]{_process_end - _process_start:0.3}s[/]") + else: + LOGGER.info("Data store is empty") diff --git a/cosmotech/coal/cli/commands/store/dump_to_s3.py b/cosmotech/coal/cli/commands/store/dump_to_s3.py new file mode 100644 index 00000000..531d8edc --- /dev/null +++ b/cosmotech/coal/cli/commands/store/dump_to_s3.py @@ -0,0 +1,162 @@ +# Copyright (C) -
2023 - 2024 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. + +from io import BytesIO +from typing import Optional + +import boto3 +import pyarrow.csv as pc +import pyarrow.parquet as pq + +from cosmotech.coal.cli.utils.click import click +from cosmotech.coal.cli.utils.decorators import web_help +from cosmotech.coal.store.store import Store +from cosmotech.coal.utils.logger import LOGGER + +VALID_TYPES = ( + "sqlite", + "csv", + "parquet", +) + + +@click.command() +@click.option("--store-folder", + envvar="CSM_PARAMETERS_ABSOLUTE_PATH", + help="The folder containing the store files", + metavar="PATH", + type=str, + show_envvar=True, + required=True) +@click.option("--output-type", + default="sqlite", + help="Choose the type of file output to use", + type=click.Choice(VALID_TYPES, + case_sensitive=False)) +@click.option("--bucket-name", + envvar="CSM_DATA_BUCKET_NAME", + help="The bucket on S3 to upload to", + metavar="BUCKET", + type=str, + show_envvar=True, + required=True) +@click.option("--prefix", + "file_prefix", + envvar="CSM_DATA_BUCKET_PREFIX", + help="A prefix by which all uploaded files should start with in the bucket", + metavar="PREFIX", + type=str, + show_envvar=True, + default="") +@click.option("--use-ssl/--no-ssl", + default=True, + help="Use SSL to secure connection to S3", + type=bool, + is_flag=True) +@click.option("--s3-url", + "endpoint_url", + help="URL to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="URL", + envvar="AWS_ENDPOINT_URL") +@click.option("--access-id", + "access_id", + help="Identity used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_ACCESS_KEY_ID") +@click.option("--secret-key", + "secret_key", + help="Secret tied to the ID used to connect to the S3 system", + type=str, + required=True, + show_envvar=True, + metavar="ID", + envvar="AWS_SECRET_ACCESS_KEY") +@click.option("--ssl-cert-bundle", + help="Path to an alternate CA Bundle to validate SSL connections", + type=str, + show_envvar=True, + metavar="PATH", + envvar="CSM_S3_CA_BUNDLE") +@web_help("csm-data/store/dump-to-s3") +def dump_to_s3( + store_folder, + bucket_name: str, + endpoint_url: str, + access_id: str, + secret_key: str, + output_type: str, + file_prefix: str = "", + use_ssl: bool = True, + ssl_cert_bundle: Optional[str] = None +): + """Dump a datastore to a S3 + +Will upload everything from a given data store to a S3 bucket. 
+ +Three modes currently exist: + - sqlite : will dump the data store's underlying database as-is + - csv : will convert every table of the datastore to csv and send them as separate files + - parquet : will convert every table of the datastore to parquet and send them as separate files + +Giving a prefix will add it to every upload (ending the prefix with a "/" allows uploading into a folder inside the bucket) + +Make use of the boto3 library to access the bucket + +More information is available on this page: +[https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html) +""" + _s = Store(store_location=store_folder) + + if output_type not in VALID_TYPES: + LOGGER.error(f"{output_type} is not a valid type of output") + raise ValueError(f"{output_type} is not a valid type of output") + + boto3_parameters = { + "use_ssl": use_ssl, + "endpoint_url": endpoint_url, + "aws_access_key_id": access_id, + "aws_secret_access_key": secret_key, + } + if ssl_cert_bundle: + boto3_parameters["verify"] = ssl_cert_bundle + + s3_client = boto3.client("s3", **boto3_parameters) + + def data_upload(data_stream: BytesIO, file_name: str): + uploaded_file_name = file_prefix + file_name + data_stream.seek(0) + size = len(data_stream.read()) + data_stream.seek(0) + + LOGGER.info(f" Sending {size} bytes of data") + s3_client.upload_fileobj(data_stream, bucket_name, uploaded_file_name) + + if output_type == "sqlite": + _file_path = _s._database_path + _file_name = "db.sqlite" + _uploaded_file_name = file_prefix + _file_name + LOGGER.info(f"Sending {_file_path} as {_uploaded_file_name}") + s3_client.upload_file(_file_path, bucket_name, _uploaded_file_name) + else: + tables = list(_s.list_tables()) + for table_name in tables: + _data_stream = BytesIO() + _file_name = None + if output_type == "csv": + _file_name = table_name + ".csv" + pc.write_csv(_s.get_table(table_name), _data_stream) + elif output_type == "parquet": + _file_name = table_name + ".parquet" + pq.write_table(_s.get_table(table_name), _data_stream) + LOGGER.info(f"Sending table {table_name} as {output_type}") + data_upload(_data_stream, _file_name) diff --git a/cosmotech/coal/cli/commands/store/list_tables.py b/cosmotech/coal/cli/commands/store/list_tables.py index fda3bc54..7cdfd48a 100644 --- a/cosmotech/coal/cli/commands/store/list_tables.py +++ b/cosmotech/coal/cli/commands/store/list_tables.py @@ -26,8 +26,8 @@ type=bool, default=False) def list_tables(store_folder, schema): - """Running this command will reset the state of your store""" - _s = Store() + """Running this command will list the existing tables in your datastore""" + _s = Store(store_location=store_folder) tables = list(_s.list_tables()) if len(tables): LOGGER.info("Data store contains the following tables") diff --git a/cosmotech/coal/cli/commands/store/load_csv_folder.py b/cosmotech/coal/cli/commands/store/load_csv_folder.py index 1ab33973..665f62ec 100644 --- a/cosmotech/coal/cli/commands/store/load_csv_folder.py +++ b/cosmotech/coal/cli/commands/store/load_csv_folder.py @@ -10,6 +10,7 @@ from cosmotech.coal.cli.utils.click import click from cosmotech.coal.cli.utils.decorators import web_help from cosmotech.coal.store.csv import store_csv_file +from cosmotech.coal.store.store import Store from cosmotech.coal.utils.logger import LOGGER @@ -33,4 +34,4 @@ def load_csv_folder(store_folder, csv_folder): """Running this command will find all csvs in the given folder and put them in the
store""" for csv_path in pathlib.Path(csv_folder).glob("*.csv"): LOGGER.info(f"Found {csv_path.name}, storing it") - store_csv_file(csv_path.name[:-4], csv_path) + store_csv_file(csv_path.name[:-4], csv_path, store=Store(False, store_folder)) diff --git a/cosmotech/coal/cli/commands/store/reset.py b/cosmotech/coal/cli/commands/store/reset.py index 1a069881..082d7663 100644 --- a/cosmotech/coal/cli/commands/store/reset.py +++ b/cosmotech/coal/cli/commands/store/reset.py @@ -22,5 +22,5 @@ required=True) def reset(store_folder): """Running this command will reset the state of your store""" - Store().reset() + Store(True, store_folder) LOGGER.info(f"Data store in {store_folder} got reset") diff --git a/cosmotech/coal/cli/commands/store/store.py b/cosmotech/coal/cli/commands/store/store.py index 681ab7fe..3d7badb6 100644 --- a/cosmotech/coal/cli/commands/store/store.py +++ b/cosmotech/coal/cli/commands/store/store.py @@ -6,9 +6,11 @@ # specifically authorized by written means by Cosmo Tech. from cosmotech.coal.cli.commands.api.rds_send_store import rds_send_store -from cosmotech.coal.cli.commands.store.reset import reset +from cosmotech.coal.cli.commands.store.dump_to_postgresql import dump_to_postgresql +from cosmotech.coal.cli.commands.store.dump_to_s3 import dump_to_s3 from cosmotech.coal.cli.commands.store.list_tables import list_tables from cosmotech.coal.cli.commands.store.load_csv_folder import load_csv_folder +from cosmotech.coal.cli.commands.store.reset import reset from cosmotech.coal.cli.utils.click import click from cosmotech.coal.cli.utils.decorators import web_help @@ -27,3 +29,5 @@ def store(): store.add_command(reset, "reset") store.add_command(list_tables, "list-tables") store.add_command(load_csv_folder, "load-csv-folder") +store.add_command(dump_to_postgresql, "dump-to-postgresql") +store.add_command(dump_to_s3, "dump-to-s3") diff --git a/cosmotech/coal/cli/main.py b/cosmotech/coal/cli/main.py index 3868713d..f90768bb 100644 --- a/cosmotech/coal/cli/main.py +++ b/cosmotech/coal/cli/main.py @@ -9,9 +9,10 @@ from CosmoTech_Acceleration_Library import __version__ from cosmotech.coal.cli.commands.adx_send_scenariodata import adx_send_scenariodata from cosmotech.coal.cli.commands.api.api import api -from cosmotech.coal.cli.commands.store.store import store from cosmotech.coal.cli.commands.legacy.legacy import legacy -from cosmotech.coal.cli.commands.s3_bucket_loader import s3_bucket_load +from cosmotech.coal.cli.commands.s3_bucket_download import s3_bucket_download +from cosmotech.coal.cli.commands.s3_bucket_upload import s3_bucket_upload +from cosmotech.coal.cli.commands.store.store import store from cosmotech.coal.cli.utils.click import click from cosmotech.coal.cli.utils.decorators import web_help from cosmotech.coal.utils.logger import LOGGER @@ -22,7 +23,7 @@ def print_version(ctx, param, value): return click.echo(f"Cosmo Tech Data Interface {__version__}") ctx.exit() - + @click.group("csm-data") @click_log.simple_verbosity_option(LOGGER, @@ -46,7 +47,8 @@ def main(): main.add_command(api, "api") main.add_command(legacy, "legacy") main.add_command(store, "store") -main.add_command(s3_bucket_load, "s3-bucket-load") +main.add_command(s3_bucket_download, "s3-bucket-download") +main.add_command(s3_bucket_upload, "s3-bucket-upload") main.add_command(adx_send_scenariodata, "adx-send-scenariodata") if __name__ == "__main__": diff --git a/CosmoTech_Acceleration_Library/Accelerators/utils/__init__.py b/cosmotech/coal/csm/__init__.py similarity index 100% rename from 
CosmoTech_Acceleration_Library/Accelerators/utils/__init__.py rename to cosmotech/coal/csm/__init__.py diff --git a/CosmoTech_Acceleration_Library/Accelerators/csm_engine.py b/cosmotech/coal/csm/engine/__init__.py similarity index 76% rename from CosmoTech_Acceleration_Library/Accelerators/csm_engine.py rename to cosmotech/coal/csm/engine/__init__.py index d1d5e878..f3bf591f 100644 --- a/CosmoTech_Acceleration_Library/Accelerators/csm_engine.py +++ b/cosmotech/coal/csm/engine/__init__.py @@ -1,17 +1,18 @@ # Copyright (c) Cosmo Tech corporation. # Licensed under the MIT license. -import os import csv import glob import json -from . import parametersPath +import os -def apply_simple_csv_parameter_to_simulator(simulator, - parameter_name: str, - target_attribute_name: str, - csv_id_column: str = "id", - csv_value_column: str = "value"): +def apply_simple_csv_parameter_to_simulator( + simulator, + parameter_name: str, + target_attribute_name: str, + csv_id_column: str = "id", + csv_value_column: str = "value" + ): """ Accelerator used to apply CSV parameters directly to a simulator Will raise a ValueError if the parameter does not exist @@ -23,7 +24,7 @@ def apply_simple_csv_parameter_to_simulator(simulator, :param csv_value_column: Column in the CSV file used for the attribute value to change :return: None """ - parameter_path = os.path.join(parametersPath, parameter_name) + parameter_path = os.path.join(os.environ.get("CSM_PARAMETERS_ABSOLUTE_PATH"), parameter_name) if os.path.exists(parameter_path): csv_files = glob.glob(os.path.join(parameter_path, "*.csv")) for csv_filename in csv_files: @@ -37,3 +38,6 @@ def apply_simple_csv_parameter_to_simulator(simulator, entity.SetAttributeAsString(target_attribute_name, json.dumps(value)) else: raise ValueError(f"Parameter {parameter_name} does not exists.") + + +__all__ = [apply_simple_csv_parameter_to_simulator] diff --git a/cosmotech/coal/store/csv.py b/cosmotech/coal/store/csv.py index baad8178..b27ed64d 100644 --- a/cosmotech/coal/store/csv.py +++ b/cosmotech/coal/store/csv.py @@ -4,21 +4,31 @@ from cosmotech.coal.store.store import Store -def store_csv_file(table_name: str, csv_path: pathlib.Path, replace_existsing_file: bool = False): + +def store_csv_file( + table_name: str, + csv_path: pathlib.Path, + replace_existsing_file: bool = False, + store=Store() +): if not csv_path.exists(): raise FileNotFoundError(f"File {csv_path} does not exists") data = pc.read_csv(csv_path) _c = data.column_names data = data.rename_columns([Store.sanitize_column(_column) for _column in _c]) - _s = Store() - _s.add_table(table_name=table_name, - data=data, - replace=replace_existsing_file) + store.add_table(table_name=table_name, + data=data, + replace=replace_existsing_file) -def convert_store_table_to_csv(table_name: str, csv_path: pathlib.Path, replace_existsing_file: bool = False): +def convert_store_table_to_csv( + table_name: str, + csv_path: pathlib.Path, + replace_existsing_file: bool = False, + store=Store() +): if csv_path.name.endswith(".csv") and csv_path.exists() and not replace_existsing_file: raise FileExistsError(f"File {csv_path} already exists") if not csv_path.name.endswith(".csv"): @@ -26,5 +36,4 @@ def convert_store_table_to_csv(table_name: str, csv_path: pathlib.Path, replace_ folder = csv_path.parent folder.mkdir(parents=True, exist_ok=True) - _s = Store() - pc.write_csv(_s.get_table(table_name), csv_path) + pc.write_csv(store.get_table(table_name), csv_path) diff --git a/cosmotech/coal/store/native_python.py 
b/cosmotech/coal/store/native_python.py index 9da5aaea..8919fd93 100644 --- a/cosmotech/coal/store/native_python.py +++ b/cosmotech/coal/store/native_python.py @@ -3,16 +3,22 @@ from cosmotech.coal.store.store import Store -def store_pylist(table_name: str, data: list[dict], replace_existsing_file: bool = False): +def store_pylist( + table_name: str, + data: list[dict], + replace_existsing_file: + bool = False, + store=Store() +): data = pa.Table.from_pylist(data) - _s = Store() + store.add_table(table_name=table_name, + data=data, + replace=replace_existsing_file) - _s.add_table(table_name=table_name, - data=data, - replace=replace_existsing_file) - -def convert_table_as_pylist(table_name: str): - _s = Store() - return _s.get_table(table_name).to_pylist() +def convert_table_as_pylist( + table_name: str, + store=Store() +): + return store.get_table(table_name).to_pylist() diff --git a/cosmotech/coal/store/pandas.py b/cosmotech/coal/store/pandas.py index bd297321..1e6e5e52 100644 --- a/cosmotech/coal/store/pandas.py +++ b/cosmotech/coal/store/pandas.py @@ -6,20 +6,26 @@ import pandas as pd - def store_dataframe(table_name: str, dataframe: pd.DataFrame, replace_existsing_file: bool = False): + def store_dataframe( + table_name: str, + dataframe: pd.DataFrame, + replace_existsing_file: bool = False, + store=Store() + ): data = pyarrow.Table.from_pandas(dataframe) - _s = Store() + store.add_table(table_name=table_name, + data=data, + replace=replace_existsing_file) - _s.add_table(table_name=table_name, - data=data, - replace=replace_existsing_file) + def convert_store_table_to_dataframe( + table_name: str, + store=Store() + ) -> pd.DataFrame: - def convert_store_table_to_dataframe(table_name: str) -> pd.DataFrame: - _s = Store() - return _s.get_table(table_name).to_pandas() + return store.get_table(table_name).to_pandas() except ModuleNotFoundError: pass diff --git a/cosmotech/coal/store/pyarrow.py b/cosmotech/coal/store/pyarrow.py index 06f417f7..530746e3 100644 --- a/cosmotech/coal/store/pyarrow.py +++ b/cosmotech/coal/store/pyarrow.py @@ -4,17 +4,20 @@ import pyarrow as pa - def store_table(table_name: str, data: pa.Table, replace_existsing_file: bool = False): - _s = Store() + def store_table( + table_name: str, + data: pa.Table, + replace_existsing_file: bool = False, + store=Store() + ): - _s.add_table(table_name=table_name, - data=data, - replace=replace_existsing_file) + store.add_table(table_name=table_name, + data=data, + replace=replace_existsing_file) - def convert_store_table_to_dataframe(table_name: str) -> pa.Table: - _s = Store() - return _s.get_table(table_name) + def convert_store_table_to_dataframe(table_name: str, store=Store()) -> pa.Table: + return store.get_table(table_name) except ModuleNotFoundError: pass diff --git a/cosmotech/coal/store/store.py b/cosmotech/coal/store/store.py index b268e679..a38d1544 100644 --- a/cosmotech/coal/store/store.py +++ b/cosmotech/coal/store/store.py @@ -3,19 +3,23 @@ import pyarrow from adbc_driver_sqlite import dbapi -from cosmotech.orchestrator.utils.singleton import Singleton from cosmotech.coal.utils.logger import LOGGER -class Store(metaclass=Singleton): +class Store: @staticmethod def sanitize_column(column_name: str) -> str: return column_name.replace(" ", "_").lower() - def __init__(self, reset=False): - self.store_location = pathlib.Path(os.environ.get("CSM_PARAMETERS_ABSOLUTE_PATH", ".")) / ".coal/store" + def __init__( + self, + reset=False, + store_location: pathlib.Path = 
pathlib.Path(os.environ.get("CSM_PARAMETERS_ABSOLUTE_PATH", + ".")) + ): + self.store_location = pathlib.Path(store_location) / ".coal/store" self.store_location.mkdir(parents=True, exist_ok=True) self._tables = dict() self._database_path = self.store_location / "db.sqlite" diff --git a/cosmotech/coal/utils/logger.py b/cosmotech/coal/utils/logger.py index 32268557..96fee772 100644 --- a/cosmotech/coal/utils/logger.py +++ b/cosmotech/coal/utils/logger.py @@ -9,12 +9,15 @@ import os from rich.logging import RichHandler +from rich.highlighter import NullHighlighter LOGGER = logging.getLogger("csm.data") +HIGLIGHTER = NullHighlighter() HANDLER = RichHandler(rich_tracebacks=True, omit_repeated_times=False, show_path=False, - markup=True) + markup=True, + highlighter=HIGLIGHTER) _format = "%(message)s" if "PAILLETTES" in os.environ: @@ -22,9 +25,9 @@ _format = f"{paillettes} {_format} {paillettes}" FORMATTER = logging.Formatter(fmt=_format, - datefmt="[%Y/%m/%d-%X]", + datefmt="[%Y/%m/%d-%H:%M:%S]", ) HANDLER.setFormatter(FORMATTER) LOGGER.addHandler(HANDLER) -LOGGER.setLevel(logging.INFO) \ No newline at end of file +LOGGER.setLevel(logging.INFO) diff --git a/docs/csm-data/s3-bucket-download.md b/docs/csm-data/s3-bucket-download.md new file mode 100644 index 00000000..25958165 --- /dev/null +++ b/docs/csm-data/s3-bucket-download.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +description: "Command help: `csm-data s3-bucket-download`" +--- +# s3-bucket-download + +!!! info "Help command" + ```text + --8<-- "generated/commands_help/csm-data/s3-bucket-download.txt" + ``` diff --git a/docs/csm-data/s3-bucket-load.md b/docs/csm-data/s3-bucket-load.md deleted file mode 100644 index 0adb4d57..00000000 --- a/docs/csm-data/s3-bucket-load.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -hide: - - toc -description: "Command help: `csm-data s3-bucket-load`" ---- -# s3-bucket-load - -!!! info "Help command" - ```text - --8<-- "generated/commands_help/csm-data/s3-bucket-load.txt" - ``` diff --git a/docs/csm-data/s3-bucket-upload.md b/docs/csm-data/s3-bucket-upload.md new file mode 100644 index 00000000..ab5c66a5 --- /dev/null +++ b/docs/csm-data/s3-bucket-upload.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +description: "Command help: `csm-data s3-bucket-upload`" +--- +# s3-bucket-upload + +!!! info "Help command" + ```text + --8<-- "generated/commands_help/csm-data/s3-bucket-upload.txt" + ``` diff --git a/docs/csm-data/store/dump-to-postgresql.md b/docs/csm-data/store/dump-to-postgresql.md new file mode 100644 index 00000000..77913268 --- /dev/null +++ b/docs/csm-data/store/dump-to-postgresql.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +description: "Command help: `csm-data store dump-to-postgresql`" +--- +# dump-to-postgresql + +!!! info "Help command" + ```text + --8<-- "generated/commands_help/csm-data/store/dump-to-postgresql.txt" + ``` diff --git a/docs/csm-data/store/dump-to-s3.md b/docs/csm-data/store/dump-to-s3.md new file mode 100644 index 00000000..45fff302 --- /dev/null +++ b/docs/csm-data/store/dump-to-s3.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +description: "Command help: `csm-data store dump-to-s3`" +--- +# dump-to-s3 + +!!! 
info "Help command" + ```text + --8<-- "generated/commands_help/csm-data/store/dump-to-s3.txt" + ``` diff --git a/docs/overrides/partials/copyright.html b/docs/overrides/partials/copyright.html index 649753b3..18c5ac5b 100644 --- a/docs/overrides/partials/copyright.html +++ b/docs/overrides/partials/copyright.html @@ -1,11 +1,14 @@ \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 21334663..37de9954 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,6 +49,13 @@ extra_javascript: extra: version: provider: mike + consent: + title: Cookie consent + description: >- + We use cookies to recognize your repeated visits and preferences, as well + as to measure the effectiveness of our documentation and whether users + find what they're searching for. With your consent, you're helping us to + make our documentation better. nav: - Home: 'index.md' diff --git a/requirements.doc.txt b/requirements.doc.txt index 3891c8a6..2f10f17d 100644 --- a/requirements.doc.txt +++ b/requirements.doc.txt @@ -5,7 +5,7 @@ mkdocs-gen-files~=0.5.0 mkdocstrings[python]~=0.24 mkdocs-awesome-pages-plugin~=2.9.3 pymdown-extensions~=10.7 -requirements-parser~=0.5.0 +requirements-parser~=0.11.0 setuptools~=70.3.0 mike~=2.0.0 griffe~=0.47.0 diff --git a/requirements.txt b/requirements.txt index 657458a2..bacd664f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,12 +21,13 @@ boto3~=1.34 requests~=2.32.3 # Orchestrator templates requirements -cosmotech-run-orchestrator~=1.3.1 +cosmotech-run-orchestrator~=1.3 # Data store requirements pyarrow~=17.0.0 adbc-driver-manager~=1.1.0 adbc-driver-sqlite~=1.1.0 +adbc-driver-postgresql~=1.1.0 # CLI requirements click~=8.1.7