|
| 1 | +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. |
| 2 | +"""Name normalizer classes.""" |
| 3 | + |
| 4 | +from __future__ import annotations |
| 5 | + |
| 6 | +import abc |
| 7 | +from typing import TYPE_CHECKING, Any |
| 8 | + |
| 9 | + |
| 10 | +if TYPE_CHECKING: |
| 11 | + from collections.abc import Iterable, Iterator |
| 12 | + |
| 13 | + |
class NameNormalizerBase(abc.ABC):
    """Common interface for classes that map arbitrary names to a canonical form.

    Subclasses supply `normalize()`; every other helper on this class is
    derived from it.
    """

    @staticmethod
    @abc.abstractmethod
    def normalize(name: str) -> str:
        """Return the canonical (normalized) form of `name`."""
        ...

    @classmethod
    def normalize_set(cls, str_iter: Iterable[str]) -> set[str]:
        """Normalize each string in the iterable and collect the results in a set."""
        return set(map(cls.normalize, str_iter))

    @classmethod
    def normalize_list(cls, str_iter: Iterable[str]) -> list[str]:
        """Normalize each string in the iterable, preserving order and duplicates."""
        return list(map(cls.normalize, str_iter))

    @classmethod
    def check_matched(cls, name1: str, name2: str) -> bool:
        """Tell whether both names normalize to the same value."""
        first = cls.normalize(name1)
        second = cls.normalize(name2)
        return first == second

    @classmethod
    def check_normalized(cls, name: str) -> bool:
        """Tell whether normalizing `name` leaves it unchanged."""
        normalized = cls.normalize(name)
        return normalized == name
| 42 | + |
| 43 | + |
class LowerCaseNormalizer(NameNormalizerBase):
    """Normalizer that lower-cases names and turns spaces and hyphens into underscores."""

    @staticmethod
    def normalize(name: str) -> str:
        """Return `name` lower-cased, with each " " and "-" replaced by "_"."""
        normalized = name.lower()
        for separator in (" ", "-"):
            normalized = normalized.replace(separator, "_")
        return normalized
| 51 | + |
| 52 | + |
class CaseInsensitiveDict(dict[str, Any]):
    """A case-aware, case-insensitive dictionary implementation.

    It has these behaviors:
    - When a key is retrieved, set, deleted, or checked for existence through the item
      protocol (`d[key]`, `d[key] = v`, `del d[key]`, `key in d`), it is matched after
      normalization, i.e. in a case-insensitive manner.
    - The original case is stored in a separate lookup, so that the original case can be
      retrieved when needed.

    There are two ways to store keys internally:
    - If normalize_keys is True, the keys are normalized using the given normalizer.
    - If normalize_keys is False, the original case of the keys is stored.

    In regards to missing values, the dictionary accepts an 'expected_keys' input. When set, the
    dictionary will be initialized with the given keys. If a key is not found in the input data, it
    will be initialized with a value of None. When provided, the 'expected_keys' input will also
    determine the original case of the keys.

    NOTE: inherited `dict` helpers such as `get()` and `pop()` are not overridden by this
    class, so they perform exact-match lookups against the internally stored keys.
    """

    def _display_case(self, key: str) -> str:
        """Return the original ("pretty") case registered for the key.

        If no casing has been registered for the key yet, the key itself is returned. This
        keeps lookups of unknown keys from raising a spurious KeyError out of this helper.
        """
        return self._pretty_case_keys.get(self._normalizer.normalize(key), key)

    def _index_case(self, key: str) -> str:
        """Return the internal (storage) case of the key.

        If normalize_keys is True, return the normalized key.
        Otherwise, return the original case of the key.
        """
        if self._normalize_keys:
            return self._normalizer.normalize(key)

        return self._display_case(key)

    def __init__(
        self,
        from_dict: dict[str, Any],
        *,
        normalize_keys: bool = True,
        normalizer: type[NameNormalizerBase] | None = None,
        expected_keys: list[str] | None = None,
    ) -> None:
        """Initialize the dictionary with the given data.

        Args:
            from_dict: The source data to copy into this dictionary.
            normalize_keys: If True, keys are stored in normalized form. If False, keys are
                stored in their original ("pretty") case.
            normalizer: The normalizer class used to match keys. Defaults to
                LowerCaseNormalizer when omitted.
            expected_keys: Keys that must exist after initialization. Any expected key that
                is absent from `from_dict` is set to None. When omitted (or empty), the keys
                of `from_dict` are used instead.
        """
        self._normalize_keys = normalize_keys
        # If no normalizer is provided, use LowerCaseNormalizer.
        self._normalizer: type[NameNormalizerBase] = normalizer or LowerCaseNormalizer

        # If no expected keys are provided, use all keys from the input dictionary.
        if not expected_keys:
            expected_keys = list(from_dict.keys())

        # Store a lookup from normalized keys to pretty cased (originally cased) keys.
        # The key is normalized exactly the same way as in every later lookup, so custom
        # (possibly case-sensitive) normalizers stay consistent.
        self._pretty_case_keys: dict[str, str] = {
            self._normalizer.normalize(pretty_case): pretty_case
            for pretty_case in expected_keys
        }

        if normalize_keys:
            index_keys = [self._normalizer.normalize(key) for key in expected_keys]
        else:
            index_keys = expected_keys

        # Start by initializing all values to None. `dict.update` is used on purpose: the
        # keys are already in their storage case and need no further resolution.
        self.update(dict.fromkeys(index_keys))

        # Route every input item through __setitem__, which resolves the storage key and
        # registers the original casing of keys that were not listed in `expected_keys`.
        for k, v in from_dict.items():
            self[k] = v

    def __getitem__(self, key: str) -> Any:  # noqa: ANN401
        """Return the value for the key, matching case-insensitively.

        Raises:
            KeyError: If no stored key matches.
        """
        if super().__contains__(key):
            return super().__getitem__(key)

        index_key = self._index_case(key)
        if super().__contains__(index_key):
            return super().__getitem__(index_key)

        raise KeyError(key)

    def __setitem__(self, key: str, value: Any) -> None:  # noqa: ANN401
        """Set the value for the key, reusing an existing entry that matches case-insensitively."""
        if super().__contains__(key):
            super().__setitem__(key, value)
            return

        index_key = self._index_case(key)
        if super().__contains__(index_key):
            super().__setitem__(index_key, value)
            return

        # Store the pretty cased (originally cased) key:
        self._pretty_case_keys[self._normalizer.normalize(key)] = key

        # Store the data under the storage key. It is resolved again here because the
        # pretty-case registration above may have changed the answer.
        super().__setitem__(self._index_case(key), value)

    def __delitem__(self, key: str) -> None:
        """Delete the entry for the key, matching case-insensitively.

        Raises:
            KeyError: If no stored key matches.
        """
        if super().__contains__(key):
            super().__delitem__(key)
            return

        index_key = self._index_case(key)
        if super().__contains__(index_key):
            super().__delitem__(index_key)
            return

        raise KeyError(key)

    def __contains__(self, key: object) -> bool:
        """Return True if a stored key matches the given key case-insensitively."""
        # All stored keys are strings, so a non-string can never be a member. An explicit
        # check is used instead of `assert`, which is stripped when running with -O.
        if not isinstance(key, str):
            return False

        return super().__contains__(key) or super().__contains__(self._index_case(key))

    def __iter__(self) -> Iterator[str]:
        """Iterate over the keys in their storage case."""
        return super().__iter__()

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return super().__len__()

    def __eq__(self, other: object) -> bool:
        """Compare with another mapping.

        Two CaseInsensitiveDicts are equal when their stored items are equal. A plain dict
        is compared after lower-casing the keys on both sides. Anything else is unequal.
        """
        if isinstance(other, CaseInsensitiveDict):
            return dict(self) == dict(other)

        if isinstance(other, dict):
            # A dict holding a non-string key cannot match: it has no `lower()` to compare by.
            if not all(isinstance(k, str) for k in other):
                return False

            return {k.lower(): v for k, v in self.items()} == {
                k.lower(): v for k, v in other.items()
            }
        return False

    def __ne__(self, other: object) -> bool:
        """Return the inverse of `__eq__`.

        Without this override, `!=` would resolve to `dict.__ne__`, which compares the raw
        stored keys and can therefore disagree with the case-insensitive `__eq__` above.
        """
        return not self.__eq__(other)
| 177 | + |
| 178 | + |
def normalize_records(
    records: Iterable[dict[str, Any]],
    expected_keys: list[str],
) -> Iterator[CaseInsensitiveDict]:
    """Lazily wrap each record in a CaseInsensitiveDict built from `expected_keys`.

    Each yielded mapping is created with the record as its source data and
    `expected_keys` as its expected keys, so expected columns that are absent from
    a record are present with a value of None.

    Because the yielded objects support case-insensitive lookups, a column can be
    found even when its case in the record differs from the case of the
    corresponding expected key.
    """
    for raw_record in records:
        yield CaseInsensitiveDict(
            from_dict=raw_record,
            expected_keys=expected_keys,
        )
| 198 | + |
| 199 | + |
# Names exported by `from <module> import *`; this is the module's public API.
__all__ = [
    "NameNormalizerBase",
    "LowerCaseNormalizer",
    "CaseInsensitiveDict",
    "normalize_records",
]
0 commit comments