44import sys
55import csv
66import logging
7+ from pathlib import Path
78from urllib .request import urlopen
89from copy import copy
910
1213from dataclasses import dataclass
1314from typing import List , Union , Any , Dict , Tuple , Generator , TextIO
1415
16+ from linkml_runtime .dumpers import yaml_dumper , json_dumper
1517from linkml_runtime .linkml_model import Annotation , Example
1618from linkml_runtime .linkml_model .meta import SchemaDefinition , ClassDefinition , Prefix , \
1719 SlotDefinition , EnumDefinition , PermissibleValue , SubsetDefinition , TypeDefinition , Element , Setting
1820from linkml_runtime .utils .schema_as_dict import schema_as_dict
1921from linkml_runtime .utils .schemaview import SchemaView , re
22+ from linkml_runtime .utils .yamlutils import YAMLRoot
2023
2124from schemasheets .schemasheet_datamodel import ColumnConfig , TableConfig , get_configmodel , get_metamodel , COL_NAME , \
2225 DESCRIPTOR , \
2629from schemasheets .utils .prefixtool import guess_prefix_expansion
2730
2831
def ensure_path_tokens(path: Union[str, List[str]]) -> List[str]:
    """
    Normalize a path into a list of tokens.

    A list is handed back unchanged (the same object, not a copy). A string
    is split on "." so that "a.b" becomes ["a", "b"]; a string containing
    no dot becomes a single-element list.

    :param path: dotted path string, or an already tokenized list
    :return: list of path tokens
    """
    if not isinstance(path, list):
        return path.split(".") if "." in path else [path]
    return path
38+
39+
def get_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]]) -> Any:
    """
    Given an object and a path, return the value at the end of the path.

    The first path token is looked up by key when ``obj`` is a dict and by
    attribute otherwise; any remaining tokens are resolved recursively
    against the value found.

    :param obj: dict or object to traverse
    :param path: dotted path string, or list of path tokens
    :return: value at the end of the path, or None if a step is missing
    """
    toks = ensure_path_tokens(path)
    tok, remaining = toks[0], toks[1:]
    if isinstance(obj, dict):
        v = obj.get(tok)
    else:
        # https://github.com/linkml/linkml/issues/971
        v = getattr(obj, tok, None)
    # Only a missing step (None) ends the traversal early. An empty or
    # otherwise falsy intermediate (e.g. {} or []) must still be descended
    # into, so that the result is the value at the END of the path rather
    # than the intermediate container itself.
    if remaining and v is not None:
        return get_attr_via_path_accessor(v, remaining)
    return v
60+
61+
def set_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]], value: Any, depth=0) -> None:
    """
    Given an object, a path, and a value, set the value at the end of the path.

    The first path token is written by key when ``obj`` is a dict and by
    attribute otherwise. When further tokens remain, the intermediate
    container is created as an empty dict if it is absent or None, and the
    remaining tokens are resolved recursively against it.

    :param obj: dict or object to modify in place
    :param path: dotted path string, or list of path tokens
    :param value: value to store at the end of the path
    :param depth: recursion depth (used only in the debug log message)
    :return: None
    """
    toks = ensure_path_tokens(path)
    tok, remaining = toks[0], toks[1:]
    logging.debug(f"[{depth} ] Setting attr {tok} / {remaining} in {obj} to {value} ")
    obj_is_dict = isinstance(obj, dict)

    if not remaining:
        # Last token: store the value directly on this object.
        if obj_is_dict:
            obj[tok] = value
        else:
            setattr(obj, tok, value)
        return

    # An intermediate that is present but None is treated like a missing
    # one: recursing into None would fail with AttributeError on setattr.
    if obj_is_dict:
        if obj.get(tok) is None:
            obj[tok] = {}
            logging.info(f"Creating empty dict for: {tok} ")
        child = obj[tok]
    else:
        if getattr(obj, tok, None) is None:
            setattr(obj, tok, {})
        # Re-read after assignment so that whatever the object actually
        # stores for this attribute is the thing that gets descended into.
        child = getattr(obj, tok)
    set_attr_via_path_accessor(child, remaining, value, depth + 1)
91+
92+
class SchemaSheetRowException(Exception):
    """Error raised for a problem with an individual schema sheet row."""
3195
@@ -56,6 +120,8 @@ class SchemaMaker:
56120
57121 gsheet_id : str = None
58122 """Google sheet ID."""
123+
124+ gsheet_cache_dir : str = None
59125
60126 table_config_path : str = None
61127 """Path to table configuration file."""
@@ -82,7 +148,9 @@ def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDef
82148 if not isinstance (csv_files , list ):
83149 csv_files = [csv_files ]
84150 for f in csv_files :
151+ # reconstitute schema
85152 self .load_and_merge_sheet (f , ** kwargs )
153+ self .schema = SchemaDefinition (** json_dumper .to_dict (self .schema ))
86154 self .schema .imports .append ('linkml:types' )
87155 self .schema .prefixes ['linkml' ] = Prefix ('linkml' , 'https://w3id.org/linkml/' )
88156 self ._tidy_slot_usage ()
@@ -102,6 +170,7 @@ def _tidy_slot_usage(self):
102170 :return:
103171 """
104172 for cn , c in self .schema .classes .items ():
173+ logging .debug (f"Tidying { cn } " )
105174 inapplicable_slots = [sn for sn , s in c .slot_usage .items () if 'inapplicable' in s .annotations ]
106175 for sn in inapplicable_slots :
107176 c .slots .remove (sn )
@@ -132,39 +201,47 @@ def load_and_merge_sheet(self, file_name: str, delimiter='\t') -> None:
132201 try :
133202 self .add_row (row , schemasheet .table_config )
134203 line_num += 1
135- except ValueError as e :
136- raise SchemaSheetRowException (f'Error in line { line_num } , row={ row } ' ) from e
204+ except (ValueError , AttributeError ) as e :
205+ raise SchemaSheetRowException (f"Error in line { line_num } , row={ row } \n "
206+ f"Exception:\n { e } " ) from e
137207
138208 def add_row (self , row : Dict [str , Any ], table_config : TableConfig ):
209+ """
210+ Add and translate a row from a schema sheet to the current schema.
211+
212+ A row may represent an instance of a LinkML element, such as a class, slot, type,
213+ or enum. The row may also represent a setting, prefix, or schema-level annotation.
214+
215+ This is known as the "focal element"(s) of the row.
216+
217+ :param row:
218+ :param table_config:
219+ :return:
220+ """
139221 for element in self .row_focal_element (row , table_config ):
140222 if isinstance (element , Prefix ):
141223 name = element .prefix_prefix
142224 elif isinstance (element , PermissibleValue ):
143225 name = element .text
144226 elif isinstance (element , Setting ):
145- # print(f"\n{element = }")
146227 name = element .setting_key
147228 else :
148229 logging .debug (f'EL={ element } in { row } ' )
149230 name = element .name
150231 logging .debug (f'ADDING: { row } // { name } ' )
151232 for k , v in row .items ():
152- # print(f"\n{k = }")
233+ # iterate through all column values in the row
153234 if k not in table_config .columns :
154235 raise ValueError (f'Expected to find { k } in { table_config .columns .keys ()} ' )
155236 cc = table_config .columns [k ]
156- # print(f"{cc = }")
157237 v = self .normalize_value (v , cc )
158238 if v :
159- # print(f"{v = }")
160239 # special case: class-context provided by settings
161240 if cc .settings .applies_to_class :
162241 actual_element = list (self .row_focal_element (row , table_config , column = k ))[0 ]
163242 else :
164243 actual_element = element
165- # print(f"{cc.maps_to = }")
166- # print(f"{cc = }")
167- logging .debug (f'SETTING { name } { cc .maps_to } = { v } ' )
244+ logging .debug (f'SETTING { name } .{ cc .maps_to } = { v } // IK={ cc .settings .inner_key } ' )
168245 if cc .maps_to == 'cardinality' :
169246 self .set_cardinality (actual_element , v )
170247 elif cc .metaslot :
@@ -179,9 +256,10 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
179256 anns = yaml .safe_load (v [0 ])
180257 for ann_key , ann_val in anns .items ():
181258 actual_element .annotations [ann_key ] = ann_val
182- elif isinstance (v , list ):
259+ elif isinstance (v , list ) and not cc .settings .inner_key :
260+ # append to existing list
183261 setattr (actual_element , cc .maps_to , getattr (actual_element , cc .maps_to , []) + v )
184- elif isinstance (v , dict ):
262+ elif isinstance (v , dict ) and not cc . settings . inner_key :
185263 for v_k , v_v in v .items ():
186264 curr_dict = getattr (actual_element , cc .maps_to )
187265 curr_dict [v_k ] = v_v
@@ -196,15 +274,9 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
196274 # will later be converted to a metamodel object
197275 curr_obj = {}
198276 setattr (actual_element , cc .maps_to , curr_obj )
199- if isinstance (curr_obj , dict ):
200- curr_val = curr_obj .get (cc .settings .inner_key , None )
201- else :
202- # https://github.com/linkml/linkml/issues/971
203- curr_val = getattr (curr_obj , cc .settings .inner_key , None )
277+ curr_val = get_attr_via_path_accessor (curr_obj , cc .settings .inner_key )
204278 else :
205279 curr_val = getattr (actual_element , cc .maps_to )
206- # print(f"{curr_val = }")
207- # print(f"{v = }")
208280
209281 if curr_val and curr_val != 'TEMP' and curr_val != v and \
210282 not isinstance (actual_element , SchemaDefinition ) and \
@@ -213,14 +285,18 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
213285 logging .warning (f'Overwriting value for { k } , was { curr_val } , now { v } ' )
214286 raise ValueError (f'Cannot reset value for { k } , was { curr_val } , now { v } ' )
215287 if cc .settings .inner_key :
288+ obj_to_set = getattr (actual_element , cc .maps_to )
216289 if isinstance (getattr (actual_element , cc .maps_to ), list ):
217290 if '|' in v :
218291 vs = v .split ('|' )
219292 else :
220293 vs = [v ]
221- setattr (actual_element , cc .maps_to , [{cc .settings .inner_key : v } for v in vs ])
294+ for v1 in vs :
295+ set_attr_via_path_accessor (obj_to_set , cc .settings .inner_key , v1 )
296+ # setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
222297 else :
223- getattr (actual_element , cc .maps_to )[cc .settings .inner_key ] = v
298+ set_attr_via_path_accessor (obj_to_set , cc .settings .inner_key , v )
299+ # getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
224300 else :
225301 setattr (actual_element , cc .maps_to , v )
226302 elif cc .is_element_type :
@@ -317,7 +393,7 @@ def row_focal_element(self, row: Dict[str, Any], table_config: TableConfig,
317393 else :
318394 raise ValueError (f'Unknown metatype: { typ } ' )
319395 if table_config .column_by_element_type is None :
320- raise ValueError (f' No table_config.column_by_element_type' )
396+ raise ValueError (f""" No table_config.column_by_element_type in { row } """ )
321397 for k , elt_cls in tmap .items ():
322398 if k in table_config .column_by_element_type :
323399 col = table_config .column_by_element_type [k ]
@@ -512,7 +588,13 @@ def normalize_value(self, v: str, column_config: ColumnConfig = None) -> Any:
512588 v = bmap [v .lower ()]
513589 else :
514590 v = bool (v )
515- if metaslot and metaslot .multivalued and not column_config .settings .inner_key :
591+ # TODO: use inner_key to look up the actual slot
592+ metaslot_is_multivalued = metaslot and metaslot .multivalued and not column_config .settings .inner_key
593+ if metaslot and column_config .settings .inner_key :
594+ if column_config .settings .internal_separator :
595+ # print(f"ASSUMING MV FOR {column_config.name}")
596+ metaslot_is_multivalued = True
597+ if metaslot_is_multivalued :
516598 if not isinstance (v , list ):
517599 if v is None :
518600 v = []
@@ -646,10 +728,21 @@ def ensure_file(self, file_name: str) -> str:
646728 def ensure_csvreader (self , file_name : str , delimiter = None ) -> str :
647729 if self .gsheet_id :
648730 url = gsheets_download_url (self .gsheet_id , file_name )
731+ if self .gsheet_cache_dir :
732+ # cache a copy of the file
733+ dir_path = Path (self .gsheet_cache_dir )
734+ dir_path .mkdir (parents = True , exist_ok = True )
735+ path = dir_path / (file_name + '.csv' )
736+ stream = urlopen (url )
737+ lines = [line for line in codecs .iterdecode (stream , 'utf-8' )]
738+ with open (path , 'w' ) as f :
739+ f .write ("" .join (lines ))
740+ stream .close ()
649741 stream = urlopen (url )
650742 text_stream = codecs .iterdecode (stream , 'utf-8' )
651743 reader = csv .DictReader (text_stream , delimiter = "," )
652744 yield reader
745+
653746 else :
654747 with open (file_name ) as file :
655748 reader = csv .DictReader (file , delimiter = delimiter )
@@ -683,11 +776,13 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
683776 help = "Auto-repair schema" )
684777@click .option ("--gsheet-id" ,
685778 help = "Google sheets ID. If this is specified then the arguments MUST be sheet names" )
779+ @click .option ("--gsheet-cache-dir" ,
780+ help = "Directory to cache google sheets" )
686781@click .option ("--base-schema-path" ,
687782 help = "Base schema yaml file, the base-schema will be merged with the generated schema" )
688783@click .option ("-v" , "--verbose" , count = True )
689784@click .argument ('tsv_files' , nargs = - 1 )
690- def convert (tsv_files , gsheet_id , output : TextIO , name , repair , table_config_path : str , use_attributes : bool ,
785+ def convert (tsv_files , gsheet_id , gsheet_cache_dir , output : TextIO , name , repair , table_config_path : str , use_attributes : bool ,
691786 unique_slots : bool , verbose : int , sort_keys : bool , base_schema_path : str ):
692787 """
693788 Convert schemasheets to a LinkML schema
@@ -712,6 +807,7 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
712807 sm = SchemaMaker (use_attributes = use_attributes ,
713808 unique_slots = unique_slots ,
714809 gsheet_id = gsheet_id ,
810+ gsheet_cache_dir = gsheet_cache_dir ,
715811 default_name = name ,
716812 table_config_path = table_config_path ,
717813 base_schema_path = base_schema_path )
@@ -720,7 +816,6 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
720816 schema = sm .repair_schema (schema )
721817 schema_dict = schema_as_dict (schema )
722818 output .write (yaml .dump (schema_dict , sort_keys = sort_keys ))
723- # output.write(yaml_dumper.dumps(schema))
724819
725820
726821if __name__ == '__main__' :
0 commit comments