diff --git a/docs/backends.rst b/docs/backends.rst index 2255a3b0..a4ed1114 100644 --- a/docs/backends.rst +++ b/docs/backends.rst @@ -51,10 +51,3 @@ GUFI .. automodule:: dsi.backends.gufi :members: :special-members: __init__ - -Parquet --------- - -.. automodule:: dsi.backends.parquet - :members: - :special-members: __init__ diff --git a/docs/cr_intro.rst b/docs/cr_intro.rst index 89b3c6b0..1950dbb5 100644 --- a/docs/cr_intro.rst +++ b/docs/cr_intro.rst @@ -42,7 +42,6 @@ All DSI backends include: - DuckDB: In-process SQL database designed for fast queries on large data files - GUFI: the `Grand Unified File Index system `_ ; developed at LANL. GUFI is a fast, secure metadata search across a filesystem accessible to both privileged and unprivileged users. -- Parquet: a columnar storage format for `Apache Hadoop `_. DSI Core ~~~~~~~~ diff --git a/dsi/backends/filesystem.py b/dsi/backends/filesystem.py index a247cec8..d9e59e23 100644 --- a/dsi/backends/filesystem.py +++ b/dsi/backends/filesystem.py @@ -78,7 +78,6 @@ class Filesystem(Backend): # Declare store types GUFI_STORE = "gufi" SQLITE_STORE = "sqlite" - PARQUET_STORE = "parquet" # Declare comparison types GT = ">" diff --git a/dsi/backends/parquet.py b/dsi/backends/parquet.py deleted file mode 100644 index 8edf2bcc..00000000 --- a/dsi/backends/parquet.py +++ /dev/null @@ -1,114 +0,0 @@ -import pyarrow as pa -from pyarrow import parquet as pq -import subprocess - -from dsi.backends.filesystem import Filesystem - - -class Parquet(Filesystem): - """ - Support for a Parquet back-end. - - Parquet is a convenient format when metadata are larger than SQLite supports. - """ - - def __init__(self, filename, **kwargs): - super().__init__(filename=filename) - self.filename = filename - try: - self.compression = kwargs['compression'] - except KeyError: - self.compression = None - - # OLD NAME OF query_artifacts(). TO BE DEPRECATED IN FUTURE DSI RELEASE - def get_artifacts(self): - return self.query_artifacts() - - def query_artifacts(self): - """Query Parquet data from filename.""" - table = pq.read_table(self.filename) - resout = table.to_pydict() - return resout - - # OLD NAME OF ingest_artifacts(). TO BE DEPRECATED IN FUTURE DSI RELEASE - def put_artifacts(self, collection): - return self.ingest_artifacts(collection) - - def ingest_artifacts(self, collection): - """Ingest artifacts into file at filename path.""" - table = pa.table(collection) - pq.write_table(table, self.filename, compression=self.compression) - - @staticmethod - def get_cmd_output(cmd: list) -> str: - """ - Runs a given command and returns the stdout if successful. - - If stderr is not empty, an exception is raised with the stderr text. - """ - proc = subprocess.run(cmd, capture_output=True, shell=True) - if proc.stderr != b"": - raise Exception(proc.stderr) - return proc.stdout.strip().decode("utf-8") - - # OLD NAME OF notebook(). TO BE DEPRECATED IN FUTURE DSI RELEASE - def inspect_artifacts(self, collection, interactive=False): - return self.notebook(collection, interactive) - - def notebook(self, collection, interactive=False): - """Generate Jupyter notebook of Parquet data from filename.""" - import nbconvert as nbc - import nbformat as nbf - - """Populate a Jupyter notebook with tools required to look at Parquet data.""" - nb = nbf.v4.new_notebook() - text = """\ - # This notebook was auto-generated by a DSI Backend for Parquet. - # Execute the Jupyter notebook cells below and interact with "df" - # to explore your data. - """ - code1 = """\ - import pandas as pd - df = pd.read_parquet('{}') - df.head() - """.format(self.filename) - - code2 = """\ - df.info() - """ - - code3 = """\ - df.describe() - """ - - nb['cells'] = [nbf.v4.new_markdown_cell(text), - nbf.v4.new_code_cell(code1), - nbf.v4.new_code_cell(code2), - nbf.v4.new_code_cell(code3)] - - fname = 'dsi_parquet_backend_output.ipynb' - - print('Writing Jupyter notebook...') - with open(fname, 'w') as fh: - nbf.write(nb, fh) - - # open the jupyter notebook for static page generation - with open(fname, 'r', encoding='utf-8') as fh: - nb_content = nbf.read(fh, as_version=4) - # Init executor for notebook - run_nb = nbc.preprocessors.ExecutePreprocessor(timeout=-1) # No timeout - # Execute the notebook - run_nb.preprocess(nb_content, {'metadata':{'path':'.'}}) - - if interactive: - print('Opening Jupyter notebook...') - self.get_cmd_output(cmd=['jupyter-lab ./dsi_parquet_backend_output.ipynb']) - else: -# self.get_cmd_output(cmd=['jupyter nbconvert --to html {}'.format(fname)]) - # Init HTML exporter - html_exporter = nbc.HTMLExporter() - html_content,_ = html_exporter.from_notebook_node(nb_content) - # Save HTML file - html_filename = 'dsi_parquet_backend_output.html' - with open(html_filename, 'w', encoding='utf-8') as fh: - fh.write(html_content) diff --git a/dsi/backends/tests/test_parquet.py b/dsi/backends/tests/test_parquet.py deleted file mode 100644 index c4b02d0a..00000000 --- a/dsi/backends/tests/test_parquet.py +++ /dev/null @@ -1,29 +0,0 @@ -import git -from collections import OrderedDict - -from dsi.backends.parquet import Parquet - -isVerbose = True - - -def get_git_root(path): - git_repo = git.Repo(path, search_parent_directories=True) - git_root = git_repo.git.rev_parse("--show-toplevel") - return (git_root) - -def test_query_artifacts(): - a = Parquet(filename='/'.join([get_git_root('.'), 'examples/test/wildfiredata.pq'])) - b = a.query_artifacts() - cnt = 0 - for key in b: - cnt = cnt + 1 - assert 4 == len(b[key]) - assert 11 == cnt - -def test_notebook(): - a = Parquet(filename='/'.join([get_git_root('.'), 'examples/test/wildfiredata.pq'])) - b = a.query_artifacts() - a.notebook(b) - # No error on notebook return implies success - assert True - diff --git a/dsi/cli.py b/dsi/cli.py index 4497916a..c0f337e8 100644 --- a/dsi/cli.py +++ b/dsi/cli.py @@ -219,9 +219,7 @@ def export_table(self, table_name, filename): elif file_extension.lower() == "csv": self.t.load_module('plugin', "Csv_Writer", "writer", filename = filename, table_name = table_name) elif file_extension.lower() in ['pq', 'parquet']: - table_data = self.t.active_metadata[table_name] - df = pd.DataFrame(table_data) - df.to_parquet(filename, engine='pyarrow', index=False) + self.t.load_module('plugin', "Parquet_Writer", "writer", filename = filename, table_name = table_name) else: success_load = False except Exception as e: @@ -481,14 +479,8 @@ def read(self, args): self.t.load_module('plugin', "YAML1", "reader", filenames = dbfile) elif file_extension.lower() == 'json': self.t.load_module('plugin', "JSON", "reader", filenames = dbfile) - elif file_extension.lower() == 'pq' or file_extension.lower() == 'parquet': - self.t.load_module('backend','Parquet','back-write', filename=dbfile) - data = OrderedDict(self.t.artifact_handler(interaction_type="query")) #Parquet's query() returns a normal dict - if table_name is not None: - self.t.active_metadata[table_name] = data - else: - self.t.active_metadata["Parquet"] = data - self.t.unload_module('backend','Parquet','back-write') + elif file_extension.lower() in ['pq', 'parquet']: + self.t.load_module('plugin', "Parquet", "reader", filenames = dbfile, table_name = table_name) except Exception as e: print(f"read ERROR: {e}\n") self.t.active_metadata = OrderedDict() diff --git a/dsi/core.py b/dsi/core.py index 79131be3..5b17f7e6 100644 --- a/dsi/core.py +++ b/dsi/core.py @@ -24,15 +24,15 @@ class Terminal(): for more information. """ BACKEND_PREFIX = ['dsi.backends'] - BACKEND_IMPLEMENTATIONS = ['gufi', 'sqlite', 'parquet', 'duckdb', 'hpss'] + BACKEND_IMPLEMENTATIONS = ['gufi', 'sqlite', 'duckdb', 'hpss'] PLUGIN_PREFIX = ['dsi.plugins'] PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer', 'collection_reader'] VALID_ENV = ['Hostname', 'SystemKernel', 'GitInfo'] - VALID_READERS = ['Bueno', 'Csv', 'YAML1', 'TOML1', 'Schema', 'JSON', 'MetadataReader1', 'Ensemble', 'Cloverleaf', 'Dict'] + VALID_READERS = ['Bueno', 'Csv', 'YAML1', 'TOML1', 'Parquet', 'Schema', 'JSON', 'MetadataReader1', 'Ensemble', 'Cloverleaf', 'Dict'] VALID_DATACARDS = ['Oceans11Datacard', 'DublinCoreDatacard', 'SchemaOrgDatacard', 'GoogleDatacard'] - VALID_WRITERS = ['ER_Diagram', 'Table_Plot', 'Csv_Writer'] + VALID_WRITERS = ['ER_Diagram', 'Table_Plot', 'Csv_Writer', 'Parquet_Writer'] VALID_PLUGINS = VALID_ENV + VALID_READERS + VALID_WRITERS + VALID_DATACARDS - VALID_BACKENDS = ['Gufi', 'Sqlite', 'Parquet', 'DuckDB', 'SqlAlchemy', 'HPSS'] + VALID_BACKENDS = ['Gufi', 'Sqlite', 'DuckDB', 'SqlAlchemy', 'HPSS'] VALID_MODULES = VALID_PLUGINS + VALID_BACKENDS VALID_MODULE_FUNCTIONS = {'plugin': ['reader', 'writer'], 'backend': ['back-read', 'back-write']} @@ -503,13 +503,7 @@ def artifact_handler(self, interaction_type, query = None, **kwargs): self.logger.info(f"{first_backend.__class__.__name__} backend - {interaction_type.upper()} the data") start = datetime.now() if interaction_type in ['query', 'get']: - # Only used when reading data from Parquet backend in CLI API (Parquet uses query not process) - - # TODO fix this passthrough by updating Parquet to use process_artifacts() - # TODO query all backends - if len(self.loaded_backends) > 1: - if parent_backend == "Filesystem" and ".temp.db" in first_backend.filename: - first_backend = self.loaded_backends[1] - parent_backend = first_backend.__class__.__bases__[0].__name__ + # TODO query all backends together if self.valid_backend(first_backend, parent_backend): if "query" in first_backend.query_artifacts.__code__.co_varnames: self.logger.info(f"Query to get data: {query}") @@ -1362,8 +1356,6 @@ def valid_backend(self, backend, parent_name): valid = True if backend.__class__.__name__ == "DuckDB" and os.path.getsize(backend.filename) > 13000: valid = True - if backend.__class__.__name__ == "Parquet" and os.path.getsize(backend.filename) > 100: - valid = True return valid diff --git a/dsi/dsi.py b/dsi/dsi.py index 7a26508f..de668403 100644 --- a/dsi/dsi.py +++ b/dsi/dsi.py @@ -117,6 +117,7 @@ def list_readers(self): print("\nValid Readers for `reader_name` in read():\n" + "-"*50) print("Collection : Loads data from an Ordered Dict. If multiple tables, each table must be a nested OrderedDict.") print("CSV : Loads data from CSV files (one table per call)") + print("Parquet : Loads data from Parquet - a columnar storage format for Apache Hadoop (one table per call)") print("YAML1 : Loads data from YAML files of a certain structure") print("TOML1 : Loads data from TOML files of a certain structure") print("JSON : Loads single-table data from JSON files") @@ -139,6 +140,7 @@ def read(self, filenames, reader_name, table_name = None): The expected input type depends on the selected `reader_name`: - "Collection" → Ordered Dictionary of table(s) - "CSV" → .csv + - "Parquet" → .pq - "YAML1" → .yaml or .yml - "TOML1" → .toml - "JSON" → .json @@ -163,7 +165,7 @@ def read(self, filenames, reader_name, table_name = None): Required when using the `Collection` reader to load an Ordered Dictionary representing only one table. - Recommended when the input file contains a single table for the `CSV`, `JSON`, or `Ensemble` reader. + Recommended when the input file contains a single table for the `CSV`, `Parquet`, `JSON`, or `Ensemble` reader. """ if isinstance(filenames, str) and not os.path.exists(filenames): sys.exit("read() ERROR: The input file must be a valid filepath. Please check again.") @@ -234,6 +236,8 @@ def read(self, filenames, reader_name, table_name = None): self.t.load_module('plugin', 'Bueno', 'reader', filenames=filenames) elif reader_name.lower() == "csv": self.t.load_module('plugin', 'Csv', 'reader', filenames=filenames, table_name=table_name) + elif reader_name.lower() == "parquet": + self.t.load_module('plugin', 'Parquet', 'reader', filenames=filenames, table_name=table_name) elif reader_name.lower() == "yaml1": self.t.load_module('plugin', 'YAML1', 'reader', filenames=filenames) elif reader_name.lower() == "toml1": @@ -257,7 +261,7 @@ def read(self, filenames, reader_name, table_name = None): if correct_reader == False: print("read() ERROR: Please check your spelling of the 'reader_name' argument as it does not exist in DSI\n") - elg = "Collection, CSV, YAML1, TOML1, JSON, Ensemble, Cloverleaf, Bueno, DublinCoreDatacard, SchemaOrgDatacard" + elg = "Collection, CSV, Parquet, YAML1, TOML1, JSON, Ensemble, Cloverleaf, Bueno, DublinCoreDatacard, SchemaOrgDatacard" sys.exit(f"Eligible readers are: {elg}, GoogleDatacard, Oceans11Datacard") table_keys = [k for k in self.t.new_tables if k not in ("dsi_relations", "dsi_units")] @@ -672,6 +676,7 @@ def list_writers(self): print("ER_Diagram : Creates a visual ER diagram image based on all tables in DSI.") print("Table_Plot : Generates a plot of numerical data from a specified table.") print("Csv : Exports the data of a specified table to a CSV file.") + print("Parquet : Exports the data of a specified table to a Parquet file.") print() def write(self, filename, writer_name, table_name = None): @@ -685,6 +690,7 @@ def write(self, filename, writer_name, table_name = None): - "ER_Diagram" → .png, .pdf, .jpg, .jpeg - "Table_Plot" → .png, .jpg, .jpeg - "Csv" → .csv + - "Parquet" → .pq `writer_name` : str Name of the DSI Writer to export data. @@ -695,7 +701,7 @@ def write(self, filename, writer_name, table_name = None): For guidance on creating a DSI-compatible Writer, view :ref:`custom_writer`. `table_name`: str, optional - Required when using "Table_Plot" or "Csv" to specify which table to export. + Required when using "Table_Plot", "Csv" or "Parquet" to specify which table to export. """ if not self.t.valid_backend(self.main_backend_obj, self.main_backend_obj.__class__.__bases__[0].__name__): sys.exit("ERROR: Cannot write() data from an empty backend. Please ensure there is data in it.") @@ -764,6 +770,8 @@ def write(self, filename, writer_name, table_name = None): self.t.load_module('plugin', 'Table_Plot', 'writer', filename=filename, table_name = table_name) elif writer_name.lower() in ["csv", "csv writer", "csv_writer"]: self.t.load_module('plugin', 'Csv_Writer', 'writer', filename=filename, table_name = table_name) + elif writer_name.lower() in ["parquet", "parquet writer", "parquet_writer"]: + self.t.load_module('plugin', 'Parquet_Writer', 'writer', filename=filename, table_name = table_name) else: correct_writer = False except Exception as e: diff --git a/dsi/plugins/file_reader.py b/dsi/plugins/file_reader.py index 4e0fa5f2..e26b3751 100644 --- a/dsi/plugins/file_reader.py +++ b/dsi/plugins/file_reader.py @@ -9,6 +9,7 @@ try: import tomllib except ModuleNotFoundError: import pip._vendor.tomli as tomllib import os +from pyarrow import parquet as pq # import ast from dsi.plugins.metadata import StructuredMetadata @@ -459,6 +460,52 @@ def add_rows(self) -> None: self.set_schema_2(self.toml_data) +class Parquet(FileReader): + """ + DSI Reader that loads data stored in a Parquet file as a table. Users can choose to specify the table name upon reading too. + """ + def __init__(self, filenames, table_name = None, **kwargs): + """ + Initializes the Parquet Reader with user specified filenames and an optional table_name. + + `filenames` : str or list of str + Required. One or more Parquet file paths to be loaded into DSI. + If multiple files are provided, all data must correspond to the same table. + + `table_name` : str, optional + Optional name to assign to the loaded table. + If not provided, DSI will default to using "Parquet" as the table name. + """ + super().__init__(filenames, **kwargs) + self.parquet_data = OrderedDict() + if isinstance(filenames, str): + self.filenames = [filenames] + else: + self.filenames = filenames + self.table_name = table_name + + def add_rows(self) -> None: + """Parses Parquet data and stores data into a table as an Ordered Dictionary.""" + total_df = DataFrame() + + for filename in self.filenames: + table = pq.read_table(filename).to_pandas() + try: + total_df = concat([total_df, table], axis=0, ignore_index=True) + except: + raise TypeError(f"Error in adding {filename} to the existing Parquet data. Please recheck column names and data structure") + + table_data = OrderedDict(total_df.to_dict(orient='list')) + for col, coldata in table_data.items(): # replace NaNs with None + table_data[col] = [None if type(item) == float and isnan(item) else item for item in coldata] + + if self.table_name is not None: + self.parquet_data[self.table_name] = table_data + else: + self.parquet_data = table_data + + self.set_schema_2(self.parquet_data) + class Ensemble(FileReader): """ DSI Reader that loads ensemble simulation data stored in a CSV file. diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 0d305d8a..928517eb 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -2,6 +2,8 @@ import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt +import pyarrow as pa +from pyarrow import parquet as pq from dsi.plugins.metadata import StructuredMetadata @@ -146,7 +148,7 @@ def __init__(self, table_name, filename, export_cols = None, **kwargs): Initializes the CSV Writer with the specified inputs `table_name` : str - Name of the table to export from the DSI backend. + Name of the table to export from DSI. `filename` : str Name of the CSV file to be generated. @@ -287,4 +289,61 @@ def get_rows(self, collection) -> None: if len(not_plot_cols) > 1: return ("Warning", f"Even though {not_plot_cols} are in display_cols, they are not numeric and cannot be plotted") elif len(not_plot_cols) == 1: - return ("Warning", f"Even though '{not_plot_cols[0]}' is in display_cols, it is not numeric and cannot be plotted") \ No newline at end of file + return ("Warning", f"Even though '{not_plot_cols[0]}' is in display_cols, it is not numeric and cannot be plotted") + +class Parquet_Writer(FileWriter): + """ + DSI Writer to output certain data as a Parquet file + """ + def __init__(self, table_name, filename, export_cols = None, **kwargs): + """ + Initializes the Parquet Writer with the specified inputs + + `table_name` : str + Name of the table to export from DSI. + + `filename` : str + Name of the Parquet file to be generated. + + `export_cols` : list of str, optional, default is None. + A list of column names to include in the exported Parquet file. + + If None , all columns from the table will be included. + + - Ex: if a table has columns [a, b, c, d, e], and export_cols = [a, c, e], only those are writted to Parquet + """ + super().__init__(filename, **kwargs) + file_extension = filename.rsplit(".", 1)[-1] if '.' in filename else '' + if file_extension not in ["pq", "parquet"]: + filename = filename + ".pq" + self.parquet_file_name = filename + self.table_name = table_name + self.export_cols = export_cols + + def get_rows(self, collection) -> None: + """ + Exports data from the given DSI data collection to a Parquet file. + + `collection` : OrderedDict + The internal DSI abstraction. This is a nested OrderedDict where: + - Top-level keys are table names. + - Each value is another OrderedDict representing the table's data (with column names as keys and lists of values). + + `return`: None. + If an error occurs, a tuple in the format - (ErrorType, "error message") - is returned to and printed by the core + """ + if self.table_name not in collection.keys(): + return (KeyError, f"{self.table_name} does not exist in memory") + if self.export_cols is not None and not set(self.export_cols).issubset(set(collection[self.table_name].keys())): + return (ValueError, f"Inputted list of column names to plot for {self.table_name} is incorrect") + + df = pd.DataFrame(collection[self.table_name]) + + if self.export_cols is not None: + try: + df = df[self.export_cols] + except: + return (ValueError, f"Could not export to Parquet as the specified column input {self.export_cols} is incorrect") + + table = pa.Table.from_pandas(df) + pq.write_table(table, self.parquet_file_name, compression="snappy") \ No newline at end of file diff --git a/dsi/plugins/tests/test_file_reader.py b/dsi/plugins/tests/test_file_reader.py index 6861f4f1..ad31eb60 100644 --- a/dsi/plugins/tests/test_file_reader.py +++ b/dsi/plugins/tests/test_file_reader.py @@ -127,4 +127,12 @@ def test_schema_reader(): assert "dsi_relations" in a.active_metadata.keys() for tableData in a.active_metadata.values(): assert isinstance(tableData, OrderedDict) - assert len(tableData["primary_key"]) == len(tableData["foreign_key"]) \ No newline at end of file + assert len(tableData["primary_key"]) == len(tableData["foreign_key"]) + +def test_parquet_reader(): + a=Terminal() + a.load_module('plugin', 'Parquet', 'reader', filenames="examples/test/wildfiredata.pq") + + assert len(a.active_metadata.keys()) == 1 + assert "Parquet" in a.active_metadata.keys() + assert a.active_metadata["Parquet"]["wind_speed"] == [2,8,8,5] \ No newline at end of file diff --git a/dsi/plugins/tests/test_file_writer.py b/dsi/plugins/tests/test_file_writer.py index 413e5df4..77118363 100644 --- a/dsi/plugins/tests/test_file_writer.py +++ b/dsi/plugins/tests/test_file_writer.py @@ -45,4 +45,18 @@ def test_table_plot(): pixel_mean = np.mean(plot_image) os.remove("student_physics_plot.png") - assert pixel_mean != 255 #check if image is all white pixels (no diagram generated) \ No newline at end of file + assert pixel_mean != 255 #check if image is all white pixels (no diagram generated) + +def test_parquet_writer(): + a=Terminal() + a.load_module('plugin', 'YAML1', 'reader', filenames=["examples/test/student_test1.yml", "examples/test/student_test2.yml"], target_table_prefix = "student") + a.load_module('plugin', "Parquet_Writer", "writer", table_name = "student__physics", filename = "student_physics_parquet") + a.transload() + + assert os.path.exists("student_physics_parquet.pq") + + a.load_module('plugin', 'Parquet', 'reader', filenames="student_physics_parquet.pq") + assert "Parquet" in a.active_metadata.keys() + assert a.active_metadata["Parquet"]["specification"] == ["!amy", "!amy1"] + + os.remove("student_physics_parquet.pq") \ No newline at end of file diff --git a/examples/developer/1.baseline.py b/examples/developer/1.baseline.py index 2527660f..a9bd48e2 100644 --- a/examples/developer/1.baseline.py +++ b/examples/developer/1.baseline.py @@ -11,7 +11,7 @@ # ['GitInfo', 'Hostname', 'SystemKernel', 'Bueno', 'Csv'] print(base_terminal.list_available_modules('backend')) -# ['Gufi', 'Sqlite', 'Parquet'] +# ['Gufi', 'Sqlite', 'DuckDB', 'HPSS'] print(base_terminal.list_loaded_modules()) # {'writer': [], diff --git a/examples/test/coreterminal.py b/examples/test/coreterminal.py index 8ed11c3c..e765411b 100644 --- a/examples/test/coreterminal.py +++ b/examples/test/coreterminal.py @@ -1,22 +1,22 @@ from dsi.core import Terminal '''This is an example workflow using core.py''' -a=Terminal(debug=0, backup_db = False, runTable=True) +a=Terminal(debug=0, backup_db = False, runTable=False) ''' Example uses of loading open DSI readers ''' # a.load_module('plugin','Bueno','reader', filenames=['bueno1.data', 'bueno2.data']) # a.load_module('plugin','Hostname','reader') # a.load_module('plugin', 'Schema', 'reader', filename="example_schema.json") -a.load_module('plugin', 'Schema', 'reader', filename="yaml1_schema.json") +# a.load_module('plugin', 'Schema', 'reader', filename="yaml1_schema.json") -a.load_module('plugin', 'YAML1', 'reader', filenames=["student_test1.yml", "student_test2.yml"]) +# a.load_module('plugin', 'YAML1', 'reader', filenames=["student_test1.yml", "student_test2.yml"]) # a.load_module('plugin', 'TOML1', 'reader', filenames=["results.toml", "results1.toml"], target_table_prefix = "results") # a.load_module('plugin', 'Csv', 'reader', filenames="yosemite5.csv") # a.load_module('plugin', 'Ensemble', 'reader', filenames="wildfiredata.csv") # a.load_module('plugin', 'Cloverleaf', 'reader', folder_path="../clover3d/") -# a.load_module('plugin', 'Oceans11Datacard', 'reader', filenames=['../wildfire/wildfire_oceans11.yml', '../pennant/pennant_oceans11.yml']) +a.load_module('plugin', 'Oceans11Datacard', 'reader', filenames=['../wildfire/wildfire_oceans11.yml', '../pennant/pennant_oceans11.yml']) # a.load_module('plugin', 'DublinCoreDatacard', 'reader', filenames="../wildfire/wildfire_dublin_core.xml") # a.load_module('plugin', 'SchemaOrgDatacard', 'reader', filenames="../wildfire/wildfire_schema_org.json") # a.load_module('plugin', 'GoogleDatacard', 'reader', filenames="../wildfire/wildfire_google.yml") @@ -92,7 +92,7 @@ # # ['GitInfo', 'Hostname', 'SystemKernel', 'Bueno', 'Csv'] # a.list_available_modules('backend') -# # ['Gufi', 'Sqlite', 'Parquet', 'DuckDB'] +# # ['Gufi', 'Sqlite', 'DuckDB', 'HPSS'] ''' Listing all loaded modules (writers and backends) ''' # print(a.list_loaded_modules()) diff --git a/examples/test/dsi_example.py b/examples/test/dsi_example.py index 5dde0a80..e975e688 100644 --- a/examples/test/dsi_example.py +++ b/examples/test/dsi_example.py @@ -20,6 +20,7 @@ # test.read(filenames=['bueno1.data', 'bueno2.data'], reader_name='Bueno') # test.read(filenames="../clover3d/", reader_name='Cloverleaf') # test.read(filenames="test.txt", reader_name="text_file_reader.py") +# test.read(filenames="wildfiredata.pq", reader_name="Parquet", table_name="Wildfire_parquet") # test.read(filenames=['../wildfire/wildfire_oceans11.yml', '../pennant/pennant_oceans11.yml'], reader_name='Oceans11Datacard') # test.read(filenames="../wildfire/wildfire_dublin_core.xml", reader_name='DublinCoreDatacard') @@ -31,6 +32,7 @@ # test.write(filename="er_diagram.png", writer_name="ER_Diagram") # test.write(filename="physics_plot.png", writer_name="Table_Plot", table_name="physics") # test.write(filename="physics.csv", writer_name="Csv_Writer", table_name="physics") +# test.write(filename="wildfire_test.pq", writer_name="Parquet_Writer", table_name="Wildfire_parquet") ''' Backend data interactions: query()/get_table() and find(). Manipulating their outputs to update() the backend '''