|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +This file contains some utilities to help with converting existing CSV files |
| 5 | +to the format required by ApertureDB. The main functions are for writing |
| 6 | +CSV files for creating entities, images, and connections. The functions |
| 7 | +take in a pandas DataFrame and write it to a CSV file with the appropriate |
| 8 | +headers required by the various *CSV loaders in ApertureDB. This makes it |
| 9 | +easier to generate CSV files without having to remember the exact column |
| 10 | +names required by the CSV loaders. |
| 11 | +""" |
| 12 | + |
| 13 | +from typing import Optional |
| 14 | + |
| 15 | +import pandas as pd |
| 16 | + |
| 17 | + |
def convert_entity_data(input, entity_class: str, unique_key: Optional[str] = None):
    """
    Convert data to the format required for creating entities

    Arguments:
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        entity_class: The entity class to write to the CSV file as the first column
        unique_key: (optional) An existing key to call out as a constraint

    Returns:
        A pandas DataFrame with the entity class as the first column

    Raises:
        ValueError: If unique_key is supplied but is not a column of the input data.
    """
    df = pd.DataFrame(input)
    df.insert(0, 'EntityClass', entity_class)
    if unique_key:
        # Raise explicitly instead of using assert so the validation is not
        # stripped when Python runs with optimizations enabled (-O).
        if unique_key not in df.columns:
            raise ValueError(
                f"unique_key {unique_key} not found in the input data")
        df[f"constraint_{unique_key}"] = df[unique_key]
    return df
| 36 | + |
| 37 | + |
def write_entity_csv(filename: str, input, **kwargs):
    """
    Write data to a CSV file for creating entities

    Arguments:
        filename: The name of the file to write to.
                  Recommended to end in ".entity.csv".
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        entity_class: The entity class to write to the CSV file as the first column
        unique_key: (optional) An existing key to call out as a constraint
    """
    # Convert first, then emit without the pandas index column, which the
    # ApertureDB entity loader does not expect.
    convert_entity_data(input, **kwargs).to_csv(filename, index=False)
| 51 | + |
| 52 | + |
def convert_image_data(input, source_column: str, source_type: Optional[str] = None, format: Optional[str] = None, unique_key: Optional[str] = None):
    """
    Convert data to the format required for creating images

    Arguments:
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        source_column: The name of the column that contains the image data
        source_type: (optional) The type the source column. If not specified, the source column will be used. Should be one of "filename", "url", "gsurl", or "s3url".
        format: (optional) The format of the image data. If not provided, there should be a column called "format" in the input data.
        unique_key: (optional) An existing key to call out as a constraint

    Returns:
        A pandas DataFrame with the source column first and the format column last

    Raises:
        ValueError: If source_column or unique_key is missing from the data,
            if source_type is not a recognized value, or if the format
            information is missing or conflicts with an existing column.
    """
    df = pd.DataFrame(input)

    # Explicit raises (rather than assert) so validation survives python -O.
    if source_column not in df.columns:
        raise ValueError(
            f"source_column {source_column} not found in the input data")

    if source_type is None:
        source_type = source_column

    if source_type not in ("filename", "url", "gsurl", "s3url"):
        raise ValueError(
            f"source_type must be one of 'filename', 'url', 'gsurl', or 's3url', found: {source_type}")

    if source_column == source_type:
        # Reordering the columns to make the source column the first column,
        # as required by the image CSV loader.
        df = df[[source_column] +
                [col for col in df.columns if col != source_column]]
    else:
        # Duplicate the source data under the loader's expected column name.
        df.insert(0, source_type, df[source_column])

    if unique_key is not None:
        if unique_key not in df.columns:
            raise ValueError(
                f"unique_key {unique_key} not found in the input data")
        df[f"constraint_{unique_key}"] = df[unique_key]

    if format is not None:
        if 'format' in df.columns:
            raise ValueError("format column already exists in the input data")
        df['format'] = format
    else:
        if 'format' not in df.columns:
            raise ValueError("format column not found in the input data")
        # Reorder the columns to make the format column the last column
        df = df[[col for col in df.columns if col != 'format'] + ['format']]

    return df
| 97 | + |
| 98 | + |
def write_image_csv(filename: str, input, **kwargs):
    """
    Write data to a CSV file for creating images

    Arguments:
        filename: The name of the file to write to.
                  Recommended to end in ".image.csv".
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        source_column: The name of the column that contains the image data
        source_type: (optional) The type the source column. If not specified, the source column will be used. Should be one of "filename", "url", "gsurl", or "s3url".
        format: (optional) The format of the image data. If not provided, there should be a column called "format" in the input data.
        unique_key: (optional) An existing key to call out as a constraint
    """
    # Convert first, then emit without the pandas index column, which the
    # ApertureDB image loader does not expect.
    convert_image_data(input, **kwargs).to_csv(filename, index=False)
| 114 | + |
| 115 | + |
def convert_connection_data(input,
                            connection_class: str,
                            source_class: str, source_property: str,
                            destination_class: str, destination_property: str,
                            source_column: Optional[str] = None, destination_column: Optional[str] = None,
                            unique_key: Optional[str] = None):
    """
    Convert data to the format required for creating connections

    Arguments:
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        connection_class: The connection class to write to the CSV file as the first column
        source_class: The source entity class
        source_property: The property containing the source key in ApertureDB
        destination_class: The destination entity class
        destination_property: The property of the destination entity in ApertureDB
        source_column: (optional) The column containing the source keys in the input data. Defaults to the source_property.
        destination_column: (optional) The column containing the destination keys in the input data. Defaults to the destination_property.
        unique_key: (optional) An existing key to call out as a constraint

    Returns:
        A pandas DataFrame with the connection class, source, and destination
        reference columns first

    Raises:
        ValueError: If a class name contains '@' (reserved as the class/property
            separator in the loader's column headers), or if a referenced
            column is missing from the input data.
    """
    # Explicit raises (rather than assert) so validation survives python -O.
    if "@" in source_class:
        raise ValueError("source_class should not contain '@'")
    if "@" in destination_class:
        raise ValueError("destination_class should not contain '@'")

    df = pd.DataFrame(input)

    if source_column is None:
        source_column = source_property
    if source_column not in df.columns:
        raise ValueError(
            f"source_column {source_column} not found in the input data")

    if destination_column is None:
        destination_column = destination_property
    if destination_column not in df.columns:
        raise ValueError(
            f"destination_column {destination_column} not found in the input data")

    # The loader expects class@property headers identifying the endpoints.
    df.insert(0, 'ConnectionClass', connection_class)
    df.insert(1, f"{source_class}@{source_property}", df[source_column])
    df.insert(2, f"{destination_class}@{destination_property}",
              df[destination_column])

    if unique_key:
        if unique_key not in df.columns:
            raise ValueError(
                f"unique_key {unique_key} not found in the input data")
        df[f"constraint_{unique_key}"] = df[unique_key]

    return df
| 159 | + |
| 160 | + |
def write_connection_csv(filename: str, input, **kwargs):
    """
    Write data to a CSV file for creating connections

    Arguments:
        filename: The name of the file to write to.
                  Recommended to end in ".connection.csv".
        input: Anything that can be used as input to a pandas DataFrame, including a pandas DataFrame
        connection_class: The connection class to write to the CSV file as the first column
        source_class: The source entity class
        source_property: The property containing the source key in ApertureDB
        source_column: (optional) The column containing the source keys in the input data. Defaults to the source_property.
        destination_class: The destination entity class
        destination_property: The property of the destination entity in ApertureDB
        destination_column: (optional) The column containing the destination keys in the input data. Defaults to the destination_property.
        unique_key: (optional) An existing key to call out as a constraint
    """
    # Convert first, then emit without the pandas index column, which the
    # ApertureDB connection loader does not expect.
    convert_connection_data(input, **kwargs).to_csv(filename, index=False)
0 commit comments