Skip to content

Commit e26393b

Browse files
Schema sync functionality inside FieldsInfo
Test passes, surprisingly. No special effort has been made to support the `header_case` option or "required" columns with `schema_sync`.
1 parent add8dac commit e26393b

File tree

3 files changed

+72
-43
lines changed

3 files changed

+72
-43
lines changed

frictionless/detector/detector.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -404,31 +404,30 @@ def detect_schema(
404404
schema.fields = fields # type: ignore
405405

406406
# Sync schema
407-
if self.schema_sync:
408-
if labels:
409-
case_sensitive = options["header_case"]
407+
if self.schema_sync and labels:
408+
case_sensitive = options["header_case"]
410409

411-
if not case_sensitive:
412-
labels = [label.lower() for label in labels]
410+
if not case_sensitive:
411+
labels = [label.lower() for label in labels]
413412

414-
if len(labels) != len(set(labels)):
415-
note = '"schema_sync" requires unique labels in the header'
416-
raise FrictionlessException(note)
413+
if len(labels) != len(set(labels)):
414+
note = '"schema_sync" requires unique labels in the header'
415+
raise FrictionlessException(note)
417416

418-
mapped_fields = self.map_schema_fields_by_name(
419-
schema.fields,
420-
case_sensitive,
421-
)
417+
mapped_fields = self.map_schema_fields_by_name(
418+
schema.fields,
419+
case_sensitive,
420+
)
422421

423-
self.rearrange_schema_fields_given_labels(
424-
mapped_fields,
425-
schema,
426-
labels,
427-
)
422+
self.rearrange_schema_fields_given_labels(
423+
mapped_fields,
424+
schema,
425+
labels,
426+
)
428427

429-
self.add_missing_required_labels_to_schema_fields(
430-
mapped_fields, schema, labels, case_sensitive
431-
)
428+
self.add_missing_required_labels_to_schema_fields(
429+
mapped_fields, schema, labels, case_sensitive
430+
)
432431

433432
# Patch schema
434433
if self.schema_patch:
@@ -460,8 +459,10 @@ def rearrange_schema_fields_given_labels(
460459
schema: Schema,
461460
labels: List[str],
462461
):
463-
"""Rearrange fields according to the order of labels. All fields
464-
missing from labels are dropped"""
462+
"""Rearrange fields according to the order of labels.
463+
All fields missing from labels are dropped.
464+
Any extra-field is filled in with a default `"type": "any"` field.
465+
"""
465466
schema.clear_fields()
466467

467468
for name in labels:

frictionless/resources/table.py

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,9 @@ def __open_lookup(self):
263263
self.__lookup[source_name][source_key].add(cells)
264264

265265
def __open_row_stream(self):
266-
field_info = FieldsInfo(self.schema.fields)
266+
fields_info = FieldsInfo(
267+
self.schema.fields, self.labels, self.detector.schema_sync
268+
)
267269

268270
# Create state
269271
memory_unique: Dict[str, Any] = {}
@@ -296,7 +298,7 @@ def row_stream():
296298

297299
row = Row(
298300
cells,
299-
field_info=field_info,
301+
fields_info=fields_info,
300302
row_number=row_number,
301303
)
302304

@@ -378,9 +380,9 @@ def row_stream():
378380

379381
if self.detector.schema_sync:
380382
# Missing required labels are not included in the
381-
# field_info parameter used for row creation
383+
# fields_info parameter used for row creation
382384
for field in self.schema.fields:
383-
self.remove_missing_required_label_from_field_info(field, field_info)
385+
self.remove_missing_required_label_from_field_info(field, fields_info)
384386

385387
self.__row_stream = row_stream()
386388

@@ -415,7 +417,9 @@ def label_is_missing(
415417
def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]:
416418
"""Create a tuple containg all cells from a given row associated to primary
417419
keys"""
418-
return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive))
420+
return tuple(
421+
row[label] for label in self.primary_key_labels(row, case_sensitive)
422+
)
419423

420424
def primary_key_labels(
421425
self,
@@ -689,39 +693,63 @@ def __init__(self, field: Field, field_number: int):
689693

690694

691695
class FieldsInfo:
692-
"""Helper class to store additional data to a collection of fields
696+
"""Helper class for linking columns to schema fields.
697+
698+
It abstracts away the different ways of making this link. In particular, the
699+
reference may be the schema (`detector.schema_sync = False`), or the labels
700+
(`detector.schema_sync = True`).
693701
694702
This class is not Public API, and should be used only in non-public
695703
interfaces.
696704
"""
697705

698-
def __init__(self, fields: List[Field]):
699-
self._fields: List[_FieldInfo] = [
700-
_FieldInfo(field, i + 1) for i, field in enumerate(fields)
701-
]
706+
def __init__(
707+
self, fields: List[Field], labels: Optional[List[str]], schema_sync: bool
708+
):
709+
if schema_sync and labels:
710+
self._expected_fields: List[_FieldInfo] = []
711+
if len(labels) != len(set(labels)):
712+
note = '"schema_sync" requires unique labels in the header'
713+
raise FrictionlessException(note)
714+
715+
for label_index, label in enumerate(labels):
716+
try:
717+
field = next(f for f in fields if f.name == label)
718+
except StopIteration:
719+
field = Field.from_descriptor({"name": label, "type": "any"})
720+
self._expected_fields.append(_FieldInfo(field, label_index + 1))
721+
else:
722+
self._expected_fields = [
723+
_FieldInfo(field, i + 1) for i, field in enumerate(fields)
724+
]
702725

703726
def ls(self) -> List[str]:
704-
"""List all field names"""
705-
return [fi.field.name for fi in self._fields]
727+
"""List all column names"""
728+
return [fi.field.name for fi in self._expected_fields]
706729

707730
def get(self, field_name: str) -> _FieldInfo:
708731
"""Get a Field by its name
709732
733+
In case no field with field_name exists, the behavior depends on
734+
the `detector.schema_sync` option:
735+
710736
Raises:
711-
ValueError: Field with name fieldname does not exist
737+
ValueError
712738
"""
713739
try:
714-
return next(fi for fi in self._fields if fi.field.name == field_name)
740+
return next(
741+
fi for fi in self._expected_fields if fi.field.name == field_name
742+
)
715743
except StopIteration:
716-
raise ValueError(f"'{field_name}' is not in fields data")
744+
raise ValueError(f"{field_name} is missing from expected fields")
717745

718746
def get_copies(self) -> List[Field]:
719747
"""Return field copies"""
720-
return [fi.field.to_copy() for fi in self._fields]
748+
return [fi.field.to_copy() for fi in self._expected_fields]
721749

722750
def rm(self, field_name: str):
723751
try:
724752
i = self.ls().index(field_name)
725-
del self._fields[i]
753+
del self._expected_fields[i]
726754
except ValueError:
727755
raise ValueError(f"'{field_name}' is not in fields data")

frictionless/table/row.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@ def __init__(
3939
self,
4040
cells: List[Any],
4141
*,
42-
field_info: FieldsInfo,
42+
fields_info: FieldsInfo,
4343
row_number: int,
4444
):
4545
self.__cells = cells
46-
self.__fields_info = field_info
46+
self.__fields_info = fields_info
4747
self.__row_number = row_number
4848
self.__processed: bool = False
4949
self.__blank_cells: Dict[str, Any] = {}
@@ -65,7 +65,7 @@ def __repr__(self):
6565
def __setitem__(self, key: str, value: Any):
6666
try:
6767
field_number = self.__fields_info.get(key).field_number
68-
except KeyError:
68+
except ValueError:
6969
raise KeyError(f"Row does not have a field {key}")
7070
if len(self.__cells) < field_number:
7171
self.__cells.extend([None] * (field_number - len(self.__cells)))
@@ -87,7 +87,7 @@ def __contains__(self, key: object):
8787
def __reversed__(self):
8888
return reversed(self.__fields_info.ls())
8989

90-
def keys(self):
90+
def keys(self): # type: ignore
9191
return iter(self.__fields_info.ls())
9292

9393
def values(self): # type: ignore

0 commit comments

Comments
 (0)