Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Copyright and License

This program is open source under the BSD-3 License.

© 2025. Triad National Security, LLC. All rights reserved.
© 2025. Triad National Security, LLC. All rights reserved. LA-UR-25-29245

Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
exec(open("../dsi/_version.py").read())

project = 'DSI'
copyright = '2025, Triad National Security, LLC. All rights reserved.'
copyright = '2025, Triad National Security, LLC. All rights reserved. LA-UR-25-29248'
author = 'The DSI Project team'
release = __version__

Expand Down
22 changes: 14 additions & 8 deletions dsi/backends/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,19 @@ def sql_type(self, input_list):
`return`: str
A string representing the inferred DuckDB data type for the input list.
"""
for item in input_list:
if isinstance(item, int):
return " INTEGER"
elif isinstance(item, float):
return " FLOAT"
elif isinstance(item, str):
return " VARCHAR"
DUCKDB_BIGINT_MIN = -9223372036854775808
DUCKDB_BIGINT_MAX = 9223372036854775807
DUCKDB_INT_MIN = -2147483648
DUCKDB_INT_MAX = 2147483647

if all(isinstance(x, int) for x in input_list if x is not None):
if any(x < DUCKDB_BIGINT_MIN or x > DUCKDB_BIGINT_MAX for x in input_list if x is not None):
return " DOUBLE"
elif any(x < DUCKDB_INT_MIN or x > DUCKDB_INT_MAX for x in input_list if x is not None):
return " BIGINT"
return " INTEGER"
elif all(isinstance(x, float) for x in input_list if x is not None):
return " DOUBLE"
return " VARCHAR"

def duckdb_compatible_name(self, name):
Expand Down Expand Up @@ -822,7 +828,7 @@ def summary_helper(self, table_name):
"""
col_info = self.cur.execute(f"PRAGMA table_info({table_name})").fetchall()

numeric_types = {'INTEGER', 'REAL', 'FLOAT', 'NUMERIC', 'DECIMAL', 'DOUBLE'}
numeric_types = {'INTEGER', 'REAL', 'FLOAT', 'NUMERIC', 'DECIMAL', 'DOUBLE', 'BIGINT'}
headers = ['column', 'type', 'min', 'max', 'avg', 'std_dev']
rows = []

Expand Down
23 changes: 14 additions & 9 deletions dsi/backends/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,15 @@ class Sqlite(Filesystem):
"""
runTable = False

def __init__(self, filename):
def __init__(self, filename, **kwargs):
"""
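Initializes a SQLite backend with a user-provided filename and creates other internal variables. If a `kwargs` dictionary is supplied, its entries are forwarded to `sqlite3.connect` as keyword arguments.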
"""
self.filename = filename
self.con = sqlite3.connect(filename)
if 'kwargs' in kwargs:
self.con = sqlite3.connect(filename, **kwargs['kwargs'])
else:
self.con = sqlite3.connect(filename)
self.cur = self.con.cursor()
self.runTable = Sqlite.runTable
self.sqlite_keywords = ["ABORT", "ACTION", "ADD", "AFTER", "ALL", "ALTER", "ALWAYS", "ANALYZE", "AND", "AS", "ASC", "ATTACH",
Expand Down Expand Up @@ -80,14 +83,16 @@ def sql_type(self, input_list):
`return`: str
A string representing the inferred SQLite data type for the input list.
"""
for item in input_list:
if isinstance(item, int):
return " INTEGER"
elif isinstance(item, float):
SQLITE_INT_MIN = -9223372036854775808
SQLITE_INT_MAX = 9223372036854775807

if all(isinstance(x, int) for x in input_list if x is not None):
if any(x < SQLITE_INT_MIN or x > SQLITE_INT_MAX for x in input_list if x is not None):
return " FLOAT"
elif isinstance(item, str):
return " VARCHAR"
return ""
return " INTEGER"
elif all(isinstance(x, float) for x in input_list if x is not None):
return " FLOAT"
return " VARCHAR"

def sqlite_compatible_name(self, name):
if (name.startswith('"') and name.endswith('"')) or (name.upper() not in self.sqlite_keywords and name.isidentifier()):
Expand Down
24 changes: 1 addition & 23 deletions dsi/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,7 @@ def wrap_in_quotes(value):
self.logger.error(f"Error finding rows due to {return_object[1]}")
raise return_object[0](return_object[1])
elif isinstance(return_object, list) and isinstance(return_object[0], str):
err_msg = f"'{column_name}' appeared in more than one table. Can only do a conditional find if '{column_name}' is in one table"
err_msg = f"'{column_name}' appeared in more than one table. Can only find if '{column_name}' is in one table"
if self.debug_level != 0:
self.logger.warning(err_msg)
print(f"WARNING: {err_msg}")
Expand Down Expand Up @@ -1479,28 +1479,6 @@ def index(self, local_loc, remote_loc, isVerbose=False):
with redirect_stdout(fnull):
t.load_module('plugin', "Dict", "reader", collection=st_dict, table_name="filesystem")
t.artifact_handler(interaction_type='ingest')

# # Create new filesystem collection with origin and remote locations
# # Stage data for ingest
# # Transpose the OrderedDict to a list of row dictionaries
# num_rows = len(next(iter(st_dict.values()))) # Assume all columns are of equal length
# rows = []

# for i in range(num_rows):
# row = {col: values[i] for col, values in st_dict.items()}
# rows.append(row)

# # Temporary csv to ingest
# output_file = '.fs.csv'
# with open(output_file, mode='w', newline='') as csvfile:
# writer = csv.DictWriter(csvfile, fieldnames=st_dict.keys())
# writer.writeheader()
# writer.writerows(rows)

# # Add filesystem table
# t.load_module('plugin', 'Csv', 'reader', filenames=".fs.csv", table_name="filesystem")
# #t.load_module('plugin', 'collection_reader', 'reader', st_dict )
# t.artifact_handler(interaction_type='ingest')

t.close()

Expand Down
4 changes: 2 additions & 2 deletions dsi/dsi.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class DSI():
The DSI Class abstracts Core.Terminal for managing metadata and Core.Sync for data management and movement.
'''

def __init__(self, filename = ".temp.db", backend_name = "Sqlite"):
def __init__(self, filename = ".temp.db", backend_name = "Sqlite", **kwargs):
"""
Initializes DSI by activating a backend for data operations; default is a Sqlite backend for temporary data analysis.
If users specify `filename`, data is saved to a permanent backend file.
Expand Down Expand Up @@ -61,7 +61,7 @@ def __init__(self, filename = ".temp.db", backend_name = "Sqlite"):
try:
if backend_name.lower() == 'sqlite':
with redirect_stdout(fnull):
self.t.load_module('backend','Sqlite','back-write', filename=filename)
self.t.load_module('backend','Sqlite','back-write', filename=filename, kwargs = kwargs)
self.backend_name = "sqlite"
elif backend_name.lower() == 'duckdb':
with redirect_stdout(fnull):
Expand Down
10 changes: 7 additions & 3 deletions dsi/plugins/file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,22 +746,26 @@ def add_rows(self) -> None:
field_names = []
for element, val in data.items():
if element not in ['authorship', 'data']:
if isinstance(val, list):
val = ",, ".join(val)
if element not in temp_data.keys():
temp_data[element] = [val]
else:
temp_data[element].append(val)
field_names.append(element)
else:
for field, val2 in val.items():
if isinstance(val2, list):
val2 = ",, ".join(val2)
if field not in temp_data.keys():
temp_data[field] = [val2]
else:
temp_data[field].append(val2)
field_names.append(field)

if sorted(field_names) != sorted(["name", "description", "data_uses", "creators", "creation_date",
"la_ur", "owner", "funding", "publisher", "published_date", "origin_location",
"num_simulations", "version", "license", "live_dataset"]):
if sorted(field_names) != sorted(["title", "description", "keywords", "instructions_of_use", "authors",
"release_date", "la_ur", "funding", "rights", "file_types", "num_simulations",
"file_size", "num_files", "dataset_size", "version", "doi"]):
return (ValueError, f"Error in reading {filename} data card. Please ensure all fields included match the template")

self.datacard_data["oceans11_datacard"] = temp_data
Expand Down
34 changes: 29 additions & 5 deletions dsi/plugins/file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class ER_Diagram(FileWriter):
"""
DSI Writer that generates an ER Diagram from the current data in the DSI abstraction
"""
def __init__(self, filename, target_table_prefix = None, **kwargs):
def __init__(self, filename, target_table_prefix = None, max_cols = None, **kwargs):
"""
Initializes the ER Diagram writer

Expand All @@ -35,10 +35,15 @@ def __init__(self, filename, target_table_prefix = None, **kwargs):
If provided, filters the ER Diagram to only include tables whose names begin with this prefix.

- Ex: If prefix = "student", only "student__address", "student__math", "student__physics" tables are included

`max_cols` : int, optional, default None
If provided, limits the number of columns displayed for each table in the ER Diagram.
If relational data is included, this must be >= the number of distinct primary and foreign key columns in each table.
"""
super().__init__(filename, **kwargs)
self.output_filename = filename
self.target_table_prefix = target_table_prefix
self.max_cols = max_cols

def get_rows(self, collection) -> None:
"""
Expand Down Expand Up @@ -99,7 +104,23 @@ def get_rows(self, collection) -> None:

col_list = tableData.keys()
if tableName == "dsi_units":
col_list = ["table_name", "column_and_unit"]
col_list = ["table_name", "column_name", "unit"]
if self.max_cols is not None:
if "dsi_relations" in collection.keys():
fk_cols = [t[1] for t in collection["dsi_relations"]["foreign_key"] if t[0] == tableName]
pk_cols = [t[1] for t in collection["dsi_relations"]["primary_key"] if t[0] == tableName]
rel_cols = set(pk_cols + fk_cols)

if rel_cols:
if len(rel_cols) > self.max_cols:
return (ValueError, "'max_cols' must be >= to the number of primary/foreign key columns.")
other_cols = [col for col in col_list if col not in rel_cols]
combined = list(rel_cols) + other_cols[:self.max_cols - len(rel_cols)]
col_list = [k for k in col_list if k in combined]
col_list = col_list[:self.max_cols]
if len(tableData.keys()) > self.max_cols:
col_list.append("...")

curr_row = 0
inner_brace = 0
for col_name in col_list:
Expand All @@ -121,9 +142,9 @@ def get_rows(self, collection) -> None:

if "dsi_relations" in collection.keys():
for f_table, f_col in collection["dsi_relations"]["foreign_key"]:
if self.target_table_prefix is not None and self.target_table_prefix not in f_table:
if self.target_table_prefix is not None and f_table is not None and self.target_table_prefix not in f_table:
continue
if f_table != None:
if f_table is not None:
foreignIndex = collection["dsi_relations"]["foreign_key"].index((f_table, f_col))
fk_string = f"{f_table}:{f_col}"
pk_string = f"{collection['dsi_relations']['primary_key'][foreignIndex][0]}:{collection['dsi_relations']['primary_key'][foreignIndex][1]}"
Expand All @@ -137,7 +158,10 @@ def get_rows(self, collection) -> None:
subprocess.run(["dot", "-T", file_type[1:], "-o", self.output_filename + file_type, self.output_filename + ".dot"])
os.remove(self.output_filename + ".dot")
else:
dot.render(self.output_filename, cleanup=True)
try:
dot.render(self.output_filename, cleanup=True)
except:
return (EnvironmentError, "Graphviz executable must be downloaded to global environment using sudo or homebrew.")

class Csv_Writer(FileWriter):
"""
Expand Down
Loading