Skip to content

Commit 530f303

Browse files
authored
Adding Property Graph example (#144)
Fixes #142. Also adds a roundtrip check when generating the schema, which catches issues with #141.
1 parent a55ba85 commit 530f303

File tree

7 files changed

+1282
-1040
lines changed

7 files changed

+1282
-1040
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ sheets2linkml personinfo.tsv -o personinfo.yaml
8989

9090
We recommend using [COGS](https://linkml.io/schemasheets/howto/google-sheets/) to synchronize your google sheets with local files using a git-like mechanism
9191

92+
## Examples
93+
94+
- [Person Info Schema](https://docs.google.com/spreadsheets/d/1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ/edit#gid=55566104)
95+
- [Movies Property Graph Schema](https://docs.google.com/spreadsheets/d/1oMrzA41tg_nisdWInnqKJrcvv30dOXuwAhznJYYPSB8/edit?gid=1499822522#gid=1499822522)
96+
9297
## Finding out more
9398

9499
* [Schema Sheets Manual](https://linkml.io/schemasheets)

docs/howto/google-sheets.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ E.g.
3838
sheets2linkml --gsheet-id 1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ personinfo types prefixes -o personinfo.yaml
3939
```
4040

41-
__Note__: due to a bug with google sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet has floats/decimals in them. It's not clear if google will ever fix this. If you need decimals, then you should either manually download the sheet to TSV, or use COGS.
41+
__Note__: due to a bug with google sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet has floats/decimals/booleans in them. It's not clear if google will ever fix this. If you need decimals, then you should either manually download the sheet to TSV, or use COGS.
4242

4343
## COGS
4444

poetry.lock

Lines changed: 1103 additions & 1011 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

schemasheets/schema_exporter.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@
88
import click
99
from linkml_runtime.linkml_model import Element, SlotDefinition, SubsetDefinition, ClassDefinition, EnumDefinition, \
1010
PermissibleValue, \
11-
TypeDefinition, Example, Annotation, Prefix
11+
TypeDefinition, Example, Annotation, Prefix, SchemaDefinition
1212
from linkml_runtime.utils.formatutils import underscore
1313
from linkml_runtime.utils.schemaview import SchemaView
1414

1515
from schemasheets.conf.configschema import ColumnSettings
1616
from schemasheets.schemamaker import SchemaMaker
1717
from schemasheets.schemasheet_datamodel import TableConfig, T_CLASS, T_SLOT, SchemaSheet, T_ENUM, T_PV, T_TYPE, \
18-
T_SUBSET, T_PREFIX
18+
T_SUBSET, T_PREFIX, T_SCHEMA
1919

2020
ROW = Dict[str, Any]
2121

@@ -108,7 +108,7 @@ class SchemaExporter:
108108
Exports a schema to Schema Sheets TSV format
109109
"""
110110
schemamaker: SchemaMaker = field(default_factory=lambda: SchemaMaker())
111-
delimiter = '\t'
111+
delimiter: str = field(default_factory=lambda: '\t')
112112
rows: List[ROW] = field(default_factory=lambda: [])
113113

114114
def export(self, schemaview: SchemaView, to_file: Union[str, Path], specification: str = None,
@@ -229,6 +229,11 @@ def export_element(self, element: Element, parent: Optional[Element], schemaview
229229
pk_col = col_name
230230
else:
231231
continue
232+
elif t == T_SCHEMA:
233+
if isinstance(element, SchemaDefinition):
234+
pk_col = col_name
235+
else:
236+
continue
232237
else:
233238
raise AssertionError(f"Unexpected type: {t}")
234239
if not pk_col:

schemasheets/schemamaker.py

Lines changed: 119 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import sys
55
import csv
66
import logging
7+
from pathlib import Path
78
from urllib.request import urlopen
89
from copy import copy
910

@@ -12,11 +13,13 @@
1213
from dataclasses import dataclass
1314
from typing import List, Union, Any, Dict, Tuple, Generator, TextIO
1415

16+
from linkml_runtime.dumpers import yaml_dumper, json_dumper
1517
from linkml_runtime.linkml_model import Annotation, Example
1618
from linkml_runtime.linkml_model.meta import SchemaDefinition, ClassDefinition, Prefix, \
1719
SlotDefinition, EnumDefinition, PermissibleValue, SubsetDefinition, TypeDefinition, Element, Setting
1820
from linkml_runtime.utils.schema_as_dict import schema_as_dict
1921
from linkml_runtime.utils.schemaview import SchemaView, re
22+
from linkml_runtime.utils.yamlutils import YAMLRoot
2023

2124
from schemasheets.schemasheet_datamodel import ColumnConfig, TableConfig, get_configmodel, get_metamodel, COL_NAME, \
2225
DESCRIPTOR, \
@@ -26,6 +29,67 @@
2629
from schemasheets.utils.prefixtool import guess_prefix_expansion
2730

2831

32+
def ensure_path_tokens(path: Union[str, List[str]]) -> List[str]:
    """
    Normalize a path specification into a list of tokens.

    A list is returned unchanged; a dotted string such as "a.b" is split
    on ".", and a plain string becomes a single-element list.

    :param path: path as a list of tokens or a (possibly dotted) string
    :return: list of path tokens
    """
    if isinstance(path, list):
        return path
    return path.split(".") if "." in path else [path]
38+
39+
40+
def get_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]]) -> Any:
    """
    Given an object and a path, return the value at the end of the path

    Dict inputs are read via ``dict.get``; other objects via ``getattr``.
    Descent stops (and the intermediate value is returned) when the
    current value is falsy or the path is exhausted.

    :param obj: object (dict or YAMLRoot-like)
    :param path: path (dotted string or list of tokens)
    :return: value at the end of the path, or None if absent
    """
    head, *rest = ensure_path_tokens(path)
    if isinstance(obj, dict):
        value = obj.get(head, None)
    else:
        # https://github.com/linkml/linkml/issues/971
        value = getattr(obj, head, None)
    if value and rest:
        return get_attr_via_path_accessor(value, rest)
    return value
60+
61+
62+
def set_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]], value: Any, depth=0) -> None:
    """
    Given an object, a path, and a value, set the value at the end of the path

    Intermediate dicts are created on demand for path segments that do not
    yet exist on the target object.

    :param obj: object (dict or YAMLRoot-like)
    :param path: path (dotted string or list of tokens)
    :param value: value to assign at the terminal token
    :param depth: recursion depth
    :return: None
    """
    tok, *toks = ensure_path_tokens(path)
    logging.debug(f"[{depth}] Setting attr {tok} / {toks} in {obj} to {value}")
    if isinstance(obj, dict):
        if not toks:
            obj[tok] = value
            return
        if tok not in obj:
            obj[tok] = {}
            logging.info(f"Creating empty dict for: {tok}")
        set_attr_via_path_accessor(obj[tok], toks, value, depth + 1)
    else:
        if not toks:
            setattr(obj, tok, value)
            return
        if not hasattr(obj, tok):
            setattr(obj, tok, {})
        set_attr_via_path_accessor(getattr(obj, tok), toks, value, depth + 1)
91+
92+
2993
class SchemaSheetRowException(Exception):
3094
pass
3195

@@ -56,6 +120,8 @@ class SchemaMaker:
56120

57121
gsheet_id: str = None
58122
"""Google sheet ID."""
123+
124+
gsheet_cache_dir: str = None
59125

60126
table_config_path: str = None
61127
"""Path to table configuration file."""
@@ -82,7 +148,9 @@ def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDef
82148
if not isinstance(csv_files, list):
83149
csv_files = [csv_files]
84150
for f in csv_files:
151+
# reconstitute schema
85152
self.load_and_merge_sheet(f, **kwargs)
153+
self.schema = SchemaDefinition(**json_dumper.to_dict(self.schema))
86154
self.schema.imports.append('linkml:types')
87155
self.schema.prefixes['linkml'] = Prefix('linkml', 'https://w3id.org/linkml/')
88156
self._tidy_slot_usage()
@@ -102,6 +170,7 @@ def _tidy_slot_usage(self):
102170
:return:
103171
"""
104172
for cn, c in self.schema.classes.items():
173+
logging.debug(f"Tidying {cn}")
105174
inapplicable_slots = [sn for sn, s in c.slot_usage.items() if 'inapplicable' in s.annotations]
106175
for sn in inapplicable_slots:
107176
c.slots.remove(sn)
@@ -132,39 +201,47 @@ def load_and_merge_sheet(self, file_name: str, delimiter='\t') -> None:
132201
try:
133202
self.add_row(row, schemasheet.table_config)
134203
line_num += 1
135-
except ValueError as e:
136-
raise SchemaSheetRowException(f'Error in line {line_num}, row={row}') from e
204+
except (ValueError, AttributeError) as e:
205+
raise SchemaSheetRowException(f"Error in line {line_num}, row={row}\n"
206+
f"Exception:\n{e}") from e
137207

138208
def add_row(self, row: Dict[str, Any], table_config: TableConfig):
209+
"""
210+
Add and translate a row from a schema sheet to the current schema.
211+
212+
A row may represent an instance of a LinkML element, such as a class, slot, type,
213+
or enum. The row may also represent a setting, prefix, or schema-level annotation.
214+
215+
This is known as the "focal element"(s) of the row.
216+
217+
:param row:
218+
:param table_config:
219+
:return:
220+
"""
139221
for element in self.row_focal_element(row, table_config):
140222
if isinstance(element, Prefix):
141223
name = element.prefix_prefix
142224
elif isinstance(element, PermissibleValue):
143225
name = element.text
144226
elif isinstance(element, Setting):
145-
# print(f"\n{element = }")
146227
name = element.setting_key
147228
else:
148229
logging.debug(f'EL={element} in {row}')
149230
name = element.name
150231
logging.debug(f'ADDING: {row} // {name}')
151232
for k, v in row.items():
152-
# print(f"\n{k = }")
233+
# iterate through all column values in the row
153234
if k not in table_config.columns:
154235
raise ValueError(f'Expected to find {k} in {table_config.columns.keys()}')
155236
cc = table_config.columns[k]
156-
# print(f"{cc = }")
157237
v = self.normalize_value(v, cc)
158238
if v:
159-
# print(f"{v = }")
160239
# special case: class-context provided by settings
161240
if cc.settings.applies_to_class:
162241
actual_element = list(self.row_focal_element(row, table_config, column=k))[0]
163242
else:
164243
actual_element = element
165-
# print(f"{cc.maps_to = }")
166-
# print(f"{cc = }")
167-
logging.debug(f'SETTING {name} {cc.maps_to} = {v}')
244+
logging.debug(f'SETTING {name}.{cc.maps_to} = {v} // IK={cc.settings.inner_key}')
168245
if cc.maps_to == 'cardinality':
169246
self.set_cardinality(actual_element, v)
170247
elif cc.metaslot:
@@ -179,9 +256,10 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
179256
anns = yaml.safe_load(v[0])
180257
for ann_key, ann_val in anns.items():
181258
actual_element.annotations[ann_key] = ann_val
182-
elif isinstance(v, list):
259+
elif isinstance(v, list) and not cc.settings.inner_key:
260+
# append to existing list
183261
setattr(actual_element, cc.maps_to, getattr(actual_element, cc.maps_to, []) + v)
184-
elif isinstance(v, dict):
262+
elif isinstance(v, dict) and not cc.settings.inner_key:
185263
for v_k, v_v in v.items():
186264
curr_dict = getattr(actual_element, cc.maps_to)
187265
curr_dict[v_k] = v_v
@@ -196,15 +274,9 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
196274
# will later be converted to a metamodel object
197275
curr_obj = {}
198276
setattr(actual_element, cc.maps_to, curr_obj)
199-
if isinstance(curr_obj, dict):
200-
curr_val = curr_obj.get(cc.settings.inner_key, None)
201-
else:
202-
# https://github.com/linkml/linkml/issues/971
203-
curr_val = getattr(curr_obj, cc.settings.inner_key, None)
277+
curr_val = get_attr_via_path_accessor(curr_obj, cc.settings.inner_key)
204278
else:
205279
curr_val = getattr(actual_element, cc.maps_to)
206-
# print(f"{curr_val = }")
207-
# print(f"{v = }")
208280

209281
if curr_val and curr_val != 'TEMP' and curr_val != v and \
210282
not isinstance(actual_element, SchemaDefinition) and \
@@ -213,14 +285,18 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
213285
logging.warning(f'Overwriting value for {k}, was {curr_val}, now {v}')
214286
raise ValueError(f'Cannot reset value for {k}, was {curr_val}, now {v}')
215287
if cc.settings.inner_key:
288+
obj_to_set = getattr(actual_element, cc.maps_to)
216289
if isinstance(getattr(actual_element, cc.maps_to), list):
217290
if '|' in v:
218291
vs = v.split('|')
219292
else:
220293
vs = [v]
221-
setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
294+
for v1 in vs:
295+
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v1)
296+
# setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
222297
else:
223-
getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
298+
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v)
299+
# getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
224300
else:
225301
setattr(actual_element, cc.maps_to, v)
226302
elif cc.is_element_type:
@@ -317,7 +393,7 @@ def row_focal_element(self, row: Dict[str, Any], table_config: TableConfig,
317393
else:
318394
raise ValueError(f'Unknown metatype: {typ}')
319395
if table_config.column_by_element_type is None:
320-
raise ValueError(f'No table_config.column_by_element_type')
396+
raise ValueError(f"""No table_config.column_by_element_type in {row}""")
321397
for k, elt_cls in tmap.items():
322398
if k in table_config.column_by_element_type:
323399
col = table_config.column_by_element_type[k]
@@ -512,7 +588,13 @@ def normalize_value(self, v: str, column_config: ColumnConfig = None) -> Any:
512588
v = bmap[v.lower()]
513589
else:
514590
v = bool(v)
515-
if metaslot and metaslot.multivalued and not column_config.settings.inner_key:
591+
# TODO: use inner_key to look up the actual slot
592+
metaslot_is_multivalued = metaslot and metaslot.multivalued and not column_config.settings.inner_key
593+
if metaslot and column_config.settings.inner_key:
594+
if column_config.settings.internal_separator:
595+
# print(f"ASSUMING MV FOR {column_config.name}")
596+
metaslot_is_multivalued = True
597+
if metaslot_is_multivalued:
516598
if not isinstance(v, list):
517599
if v is None:
518600
v = []
@@ -646,10 +728,21 @@ def ensure_file(self, file_name: str) -> str:
646728
def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
647729
if self.gsheet_id:
648730
url = gsheets_download_url(self.gsheet_id, file_name)
731+
if self.gsheet_cache_dir:
732+
# cache a copy of the file
733+
dir_path = Path(self.gsheet_cache_dir)
734+
dir_path.mkdir(parents=True, exist_ok=True)
735+
path = dir_path / (file_name + '.csv')
736+
stream = urlopen(url)
737+
lines = [line for line in codecs.iterdecode(stream, 'utf-8')]
738+
with open(path, 'w') as f:
739+
f.write("".join(lines))
740+
stream.close()
649741
stream = urlopen(url)
650742
text_stream = codecs.iterdecode(stream, 'utf-8')
651743
reader = csv.DictReader(text_stream, delimiter=",")
652744
yield reader
745+
653746
else:
654747
with open(file_name) as file:
655748
reader = csv.DictReader(file, delimiter=delimiter)
@@ -683,11 +776,13 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
683776
help="Auto-repair schema")
684777
@click.option("--gsheet-id",
685778
help="Google sheets ID. If this is specified then the arguments MUST be sheet names")
779+
@click.option("--gsheet-cache-dir",
780+
help="Directory to cache google sheets")
686781
@click.option("--base-schema-path",
687782
help="Base schema yaml file, the base-schema will be merged with the generated schema")
688783
@click.option("-v", "--verbose", count=True)
689784
@click.argument('tsv_files', nargs=-1)
690-
def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
785+
def convert(tsv_files, gsheet_id, gsheet_cache_dir, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
691786
unique_slots: bool, verbose: int, sort_keys: bool, base_schema_path: str):
692787
"""
693788
Convert schemasheets to a LinkML schema
@@ -712,6 +807,7 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
712807
sm = SchemaMaker(use_attributes=use_attributes,
713808
unique_slots=unique_slots,
714809
gsheet_id=gsheet_id,
810+
gsheet_cache_dir=gsheet_cache_dir,
715811
default_name=name,
716812
table_config_path=table_config_path,
717813
base_schema_path=base_schema_path)
@@ -720,7 +816,6 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
720816
schema = sm.repair_schema(schema)
721817
schema_dict = schema_as_dict(schema)
722818
output.write(yaml.dump(schema_dict, sort_keys=sort_keys))
723-
# output.write(yaml_dumper.dumps(schema))
724819

725820

726821
if __name__ == '__main__':

schemasheets/schemasheet_datamodel.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,6 @@ def add_info(self, col: COL_NAME, info: Union[Dict, DESCRIPTOR]) -> None:
155155
"""
156156
if col not in self.columns:
157157
self.columns[col] = ColumnConfig(col)
158-
#print(f'ADDING: {col}')
159158
self.columns[col].add_info(info)
160159
if self.columns[col].maps_to == 'metatype':
161160
if self.metatype_column and self.metatype_column != col:
@@ -204,6 +203,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
204203
rows = []
205204
line_num = 1
206205
table_config_rows = []
206+
descriptor_line_count = 0
207207
for row in reader:
208208
logging.debug(f"ROW: {row}")
209209
# google sheets
@@ -213,6 +213,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
213213
if row[k0].startswith('>'):
214214
table_config_rows.append(row)
215215
line_num += 1
216+
descriptor_line_count += 1
216217
for k, v in row.items():
217218
if v is not None and v.startswith('>'):
218219
v = v.replace('>', '')
@@ -226,6 +227,8 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
226227
logging.debug(f'Empty val for {k} in line {line_num}')
227228
else:
228229
rows.append(row)
230+
if descriptor_line_count == 0:
231+
logging.warning(f"No descriptor line found in {line_num} lines. Start line_num = {line_num}")
229232
return SchemaSheet(table_config=table_config,
230233
table_config_rows=table_config_rows,
231234
rows=rows,

0 commit comments

Comments
 (0)