Skip to content

Commit 4043a64

Browse files
dr-rodriguez and kelle authored
Refactor to have references in separate sub-directory; closes #49 (#77)
* First pass at having references in separate directory
* Iterating on reference directory use
* Using shutil to fully remove data directory and any sub-directories
* Safer use of shutil for reference directory
* Minor updates
* Updating documentation
* Saving source JSON files to source sub-directory
* Updating documentation
* Further updates
* Apply suggestions from code review

Co-authored-by: Kelle Cruz <kellecruz@gmail.com>

* Print out path when saving source and reference tables

---------

Co-authored-by: Kelle Cruz <kellecruz@gmail.com>
1 parent a178170 commit 4043a64

File tree

3 files changed

+90
-33
lines changed

3 files changed

+90
-33
lines changed

astrodbkit2/astrodb.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import json
66
import os
77
import sqlite3
8+
import shutil
89

910
import numpy as np
1011
import pandas as pd
@@ -736,25 +737,32 @@ def save_json(self, name, directory):
736737
with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
737738
f.write(json.dumps(data, indent=4, default=json_serializer))
738739

739-
def save_reference_table(self, table, directory):
740+
def save_reference_table(self, table: str, directory: str, reference_directory: str="reference"):
740741
"""
742+
Save the reference table to disk
741743
742744
Parameters
743745
----------
744746
table : str
745747
Name of reference table to output
746748
directory : str
747749
Name of directory in which to save the output JSON
750+
reference_directory : str
751+
Name of sub-directory to use for reference JSON files (eg, data/reference)
748752
"""
749753

754+
# Create directory if not already present
755+
if not os.path.isdir(os.path.join(directory, reference_directory)):
756+
os.makedirs(os.path.join(directory, reference_directory))
757+
750758
results = self.session.query(self.metadata.tables[table]).all()
751759
data = [row._asdict() for row in results]
752760
filename = table + ".json"
753761
if len(data) > 0:
754-
with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
762+
with open(os.path.join(directory, reference_directory, filename), "w", encoding="utf-8") as f:
755763
f.write(json.dumps(data, indent=4, default=json_serializer))
756764

757-
def save_database(self, directory, clear_first=True):
765+
def save_database(self, directory: str, clear_first: bool=True, reference_directory: str="reference", source_directory: str="source"):
758766
"""
759767
Output contents of the database into the specified directory as JSON files.
760768
Source objects have individual JSON files with all data for that object.
@@ -763,28 +771,45 @@ def save_database(self, directory, clear_first=True):
763771
Parameters
764772
----------
765773
directory : str
766-
Name of directory in which to save the output JSON
774+
Name of top-level directory in which to save the output JSON
767775
clear_first : bool
768776
First clear the directory of all existing JSON (useful to capture DB deletions). Default: True
777+
reference_directory : str
778+
Name of sub-directory to use for reference JSON files (eg, data/reference)
779+
source_directory : str
780+
Name of sub-directory to use for source JSON files (eg, data/source)
769781
"""
770782

771783
# Clear existing files first from that directory
772784
if clear_first:
773785
print("Clearing existing JSON files...")
774-
for filename in os.listdir(directory):
775-
os.remove(os.path.join(directory, filename))
786+
for file in os.listdir(directory):
787+
file_path = os.path.join(directory, file)
788+
if os.path.isfile(file_path):
789+
os.remove(file_path)
790+
elif os.path.isdir(file_path):
791+
# This is to handle the reference and source directories
792+
shutil.rmtree(file_path)
793+
794+
# Create sub-directories if not already present
795+
if not os.path.isdir(os.path.join(directory, reference_directory)):
796+
os.makedirs(os.path.join(directory, reference_directory))
797+
if not os.path.isdir(os.path.join(directory, source_directory)):
798+
os.makedirs(os.path.join(directory, source_directory))
776799

777800
# Output reference tables
801+
print(f"Storing reference tables to {os.path.join(directory, reference_directory)}...")
778802
for table in self._reference_tables:
779803
# Skip reference tables that are not actually in the database
780804
if table not in self.metadata.tables.keys():
781805
continue
782806

783-
self.save_reference_table(table, directory)
807+
self.save_reference_table(table, directory, reference_directory=reference_directory)
784808

785809
# Output primary objects
810+
print(f"Storing individual sources to {os.path.join(directory, source_directory)}...")
786811
for row in tqdm(self.query(self.metadata.tables[self._primary_table])):
787-
self.save_json(row, directory)
812+
self.save_json(row, os.path.join(directory, source_directory))
788813

789814
# Object input methods
790815
def add_table_data(self, data, table, fmt="csv"):
@@ -892,17 +917,21 @@ def load_json(self, filename):
892917
temp_dict[self._foreign_key] = source
893918
conn.execute(self.metadata.tables[key].insert().values(temp_dict))
894919

895-
def load_database(self, directory, verbose=False):
920+
def load_database(self, directory: str, verbose: bool=False, reference_directory: str="reference", source_directory: str="source"):
896921
"""
897922
Reload entire database from a directory of JSON files.
898923
Note that this will first clear existing tables.
899924
900925
Parameters
901926
----------
902927
directory : str
903-
Name of directory containing the JSON files
928+
Name of top-level directory containing the JSON files
904929
verbose : bool
905930
Flag to enable diagnostic messages
931+
reference_directory : str
932+
Relative path to sub-directory to use for reference JSON files (eg, data/reference)
933+
source_directory : str
934+
Relative path to sub-directory to use for source JSON files (eg, data/source)
906935
"""
907936

908937
# Clear existing database contents
@@ -917,12 +946,24 @@ def load_database(self, directory, verbose=False):
917946
for table in self._reference_tables:
918947
if verbose:
919948
print(f"Loading {table} table")
920-
self.load_table(table, directory, verbose=verbose)
949+
# Check if the reference table is in the sub-directory
950+
if os.path.exists(os.path.join(directory, reference_directory, table+".json")):
951+
self.load_table(table, os.path.join(directory, reference_directory), verbose=verbose)
952+
else:
953+
self.load_table(table, directory, verbose=verbose)
921954

922955
# Load object data
923956
if verbose:
924957
print("Loading object tables")
925-
for file in tqdm(os.listdir(directory)):
958+
959+
# Check if the sources are in the sub-directory
960+
if os.path.exists(os.path.join(directory, source_directory)):
961+
directory_of_sources = os.path.join(directory, source_directory)
962+
else:
963+
directory_of_sources = directory
964+
965+
# Scan selected directory for JSON source files
966+
for file in tqdm(os.listdir(directory_of_sources)):
926967
# Skip reference tables
927968
core_name = file.replace(".json", "")
928969
if core_name in self._reference_tables:
@@ -932,7 +973,7 @@ def load_database(self, directory, verbose=False):
932973
if not file.endswith(".json") or file.startswith("."):
933974
continue
934975

935-
self.load_json(os.path.join(directory, file))
976+
self.load_json(os.path.join(directory_of_sources, file))
936977

937978
def dump_sqlite(self, database_name):
938979
"""Output database as a sqlite file"""

astrodbkit2/tests/test_astrodb.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import io
44
import json
55
import os
6+
import shutil
67

78
import pandas as pd
89
import pytest
@@ -413,31 +414,35 @@ def test_views(db):
413414

414415
def test_save_reference_table(db, db_dir):
415416
# Test saving a reference table
416-
if os.path.exists(os.path.join(db_dir, 'Publications.json')):
417-
os.remove(os.path.join(db_dir, 'Publications.json'))
418-
db.save_reference_table('Publications', db_dir)
419-
assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
420-
os.remove(os.path.join(db_dir, 'Publications.json')) # explicitly removing so that the next step will get verified
417+
ref_dir = "reference"
418+
if os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json')):
419+
os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))
420+
db.save_reference_table('Publications', db_dir, reference_directory=ref_dir)
421+
assert os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json'))
422+
os.remove(os.path.join(db_dir, ref_dir, 'Publications.json')) # explicitly removing so that the next step will get verified
421423

422424

423425
def test_save_database(db, db_dir):
424426
# Test saving the database to JSON files
425427

426428
# Clear temporary directory first
427-
# if not os.path.exists(DB_DIR):
428-
# os.mkdir(DB_DIR)
429429
for file in os.listdir(db_dir):
430-
os.remove(os.path.join(db_dir, file))
430+
file_path = os.path.join(db_dir, file)
431+
if os.path.isfile(file_path):
432+
os.remove(file_path)
433+
elif os.path.isdir(file_path):
434+
shutil.rmtree(file_path)
431435

432436
db.save_database(db_dir)
433437

434438
# Check JSON data
435-
assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
436-
assert os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398.json'))
439+
assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
440+
assert os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'))
437441
assert not os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398 2.json'))
442+
assert not os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398 2.json'))
438443

439444
# Load source and confirm it is the same
440-
with open(os.path.join(db_dir, '2mass_j13571237+1428398.json'), 'r') as f:
445+
with open(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'), 'r') as f:
441446
data = json.load(f)
442447
assert data == db.inventory('2MASS J13571237+1428398')
443448

@@ -457,7 +462,7 @@ def test_load_database(db, db_dir):
457462

458463
# Reload the database and check DB contents
459464
assert os.path.exists(db_dir)
460-
assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
465+
assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
461466
db.load_database(db_dir, verbose=True)
462467
assert db.query(db.Publications).count() == 2
463468
assert db.query(db.Photometry).count() == 3
@@ -466,7 +471,11 @@ def test_load_database(db, db_dir):
466471

467472
# Clear temporary directory and files
468473
for file in os.listdir(db_dir):
469-
os.remove(os.path.join(db_dir, file))
474+
file_path = os.path.join(db_dir, file)
475+
if os.path.isfile(file_path):
476+
os.remove(file_path)
477+
elif os.path.isdir(file_path):
478+
shutil.rmtree(file_path)
470479

471480

472481
def test_copy_database_schema():

docs/index.rst

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,18 @@ Loading the Database
8383
--------------------
8484

8585
**Astrodbkit2** contains methods to output the full contents of the database as a list of JSON files.
86-
It can likewise read in a directory of these files to populate the database.
87-
This is how SIMPLE is currently version controlled. To load a database of this form, do the following::
86+
It can likewise read in a directory of these files to populate the database.
87+
By default, reference tables (eg, Publications, Telescopes, etc) and source tables are respectively stored in `reference/` and `source/` sub-directories of `data/`.
88+
This is how SIMPLE is currently version controlled.
89+
90+
To load a database of this form, do the following::
8891

8992
from astrodbkit2.astrodb import Database
9093

9194
connection_string = 'sqlite:///SIMPLE.db' # SQLite connection string
9295
db_dir = 'data' # directory where JSON files are located
9396
db = Database(connection_string)
94-
db.load_database(db_dir)
97+
db.load_database(directory=db_dir, reference_directory="reference")
9598

9699
.. note:: Database contents are cleared when loading from JSON files to ensure that the database only contains
97100
sources from on-disk files. We describe later how to use the :py:meth:`~astrodbkit2.astrodb.Database.save_db` method
@@ -406,17 +409,21 @@ Saving the Database
406409
===================
407410

408411
If users perform changes to a database, they will want to output this to disk to be version controlled.
409-
**Astrodbkit2** provides methods to save an individual source or reference table as well as the entire data.
410-
We recommend the later to output the entire contents to disk::
412+
**Astrodbkit2** provides methods to save an individual source or reference table as well as all of the data stored in the database.
413+
By default, reference tables are stored in a sub-directory of `data/` called "reference"; this can be overwritten by
414+
supplying a `reference_directory` variable into `save_database` or `save_reference_table`.
415+
Similarly, source/object tables are stored in a sub-directory of `data/` called "source" which can be overwritten by supplying a `source_directory` variable.
416+
417+
We recommend using `save_database` as that outputs the entire database contents to disk::
411418

412419
# Save single object
413420
db.save_json('2MASS J13571237+1428398', 'data')
414421

415422
# Save single reference table
416423
db.save_reference_table('Publications', 'data')
417424

418-
# Save entire database to directory 'data'
419-
db.save_database('data')
425+
# Save entire database to directory 'data/' with 'reference/' and 'source/' subdirectories.
426+
db.save_database(directory='data', reference_directory='reference', source_directory='source')
420427

421428
.. note:: To properly capture database deletes, the contents of the specified directory is first cleared before
422429
creating JSON files representing the current state of the database.

0 commit comments

Comments (0)