fix: make unflatten_dict symmetric to flatten_dict (#461)

mhordynski · web-flow · commit 454c18ac8f85 · 2025-04-01T10:32:35.000+02:00
diff --git a/packages/ragbits-core/CHANGELOG.md b/packages/ragbits-core/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Unreleased
 
 - Add router option to LiteLLMEmbedder (#440)
+- Fix: make unflatten_dict symmetric to flatten_dict (#461)
 
 ## 0.12.0 (2025-03-25)
 - Allow Prompt class to accept the asynchronous response_parser. Change the signature of parse_response method.
diff --git a/packages/ragbits-core/src/ragbits/core/utils/dict_transformations.py b/packages/ragbits-core/src/ragbits/core/utils/dict_transformations.py
@@ -43,21 +43,193 @@ def flatten_dict(input_dict: dict[str, Any], parent_key: str = "", sep: str = ".
     return items
 
 
-def unflatten_dict(input_dict: dict[str, Any]) -> dict[str, Any] | list:
+def _parse_key(key: str) -> list[tuple[str, bool]]:
+    """Parse a key into parts, each part being (name, is_array_index)."""
+    parts = []
+    current = ""
+    i = 0
+    while i < len(key):
+        if key[i] == "[":
+            if current:
+                parts.append((current, False))  # Parent is not an array
+                current = ""
+            i += 1  # Skip [
+            start = i
+            while i < len(key) and key[i] != "]":
+                i += 1
+            parts.append((key[start:i], True))
+            i += 1  # Skip ]
+            if i < len(key) and key[i] == ".":
+                i += 1  # Skip .
+        elif key[i] == ".":
+            if current:
+                parts.append((current, False))
+                current = ""
+            i += 1
+        else:
+            current += key[i]
+            i += 1
+    if current:
+        parts.append((current, False))
+    return parts
+
+
+def _ensure_array(obj: dict[str, Any] | list[Any], key: str) -> list[Any]:
+    """Ensure that obj[key] is a list, creating it if necessary."""
+    if isinstance(obj, list):
+        return obj
+    if key not in obj or not isinstance(obj[key], list):
+        obj[key] = []
+    return obj[key]
+
+
+def _ensure_dict(obj: dict[str, Any] | list[Any], key: str) -> dict[str, Any]:
+    """Ensure that obj[key] is a dict, creating it if necessary."""
+    if isinstance(obj, list):
+        # Lists should be handled by the caller
+        raise TypeError("Cannot ensure dict in a list")
+    if key not in obj or not isinstance(obj[key], dict):
+        obj[key] = {}
+    return obj[key]
+
+
+DictOrList = dict[str, Any] | list[Any]
+
+
+def _handle_array_part(
+    current: DictOrList,
+    part: str,
+    parent_key: str | None = None,
+) -> DictOrList:
+    """Handle an array part in the key."""
+    idx = int(part)
+    if isinstance(current, list):
+        while len(current) <= idx:
+            current.append({})
+        return current[idx]
+    if parent_key is None:
+        raise ValueError(f"Array part '{part}' without parent key")
+    current_list = _ensure_array(current, parent_key)
+    while len(current_list) <= idx:
+        current_list.append({})
+    return current_list[idx]
+
+
+def _handle_dict_part(
+    current: DictOrList,
+    part: str,
+    next_is_array: bool,
+    array_idx: int | None = None,
+) -> DictOrList:
+    """Handle a dictionary part in the key."""
+    if isinstance(current, list):
+        if array_idx is None:
+            raise ValueError("Array index is required when current is a list")
+        while len(current) <= array_idx:
+            current.append({})
+        current = current[array_idx]
+        if not isinstance(current, dict):
+            current = {}
+            current[str(array_idx)] = current
+    if next_is_array:
+        return _ensure_array(current, part)
+    return _ensure_dict(current, part)
+
+
+def _handle_single_part(
+    new_dict: dict[str, Any],
+    first_part: str,
+    is_array: bool,
+    value: SimpleTypes,
+) -> None:
+    """Handle a single-part key."""
+    if is_array:
+        idx = int(first_part)
+        current = _ensure_array(new_dict, first_part)
+        while len(current) <= idx:
+            current.append(None)
+        current[idx] = value
+    else:
+        new_dict[first_part] = value
+
+
+def _handle_last_array_part(
+    current_obj: DictOrList,
+    last_part: str,
+    value: SimpleTypes,
+    parts: list[tuple[str, bool]],
+) -> None:
+    """Handle the last part of the key when it's an array index."""
+    idx = int(last_part)
+    if len(parts) == 1:
+        # Direct array access like "users[0]"
+        parent_key = parts[0][0]
+        current_obj = _ensure_array(current_obj, parent_key)
+    if isinstance(current_obj, list):
+        while len(current_obj) <= idx:
+            current_obj.append(None)
+        current_obj[idx] = value
+    else:
+        raise TypeError("Expected list but got dict")
+
+
+def _handle_last_dict_part(
+    current_obj: DictOrList,
+    last_part: str,
+    value: SimpleTypes,
+    parts: list[tuple[str, bool]],
+) -> None:
+    """Handle the last part of the key when it's a dictionary key."""
+    if isinstance(current_obj, list):
+        # We're in a list, so we need to ensure the current index has a dict
+        idx = int(parts[-2][0])  # Get the index from the previous part
+        while len(current_obj) <= idx:
+            current_obj.append({})
+        current_obj = current_obj[idx]
+        if not isinstance(current_obj, dict):
+            current_obj = {}
+            current_obj[str(idx)] = current_obj
+    if isinstance(current_obj, dict):
+        current_obj[last_part] = value
+    else:
+        raise TypeError("Expected dict but got list")
+
+
+def _set_value(current: dict[str, Any], parts: list[tuple[str, bool]], value: SimpleTypes) -> None:
+    """Set a value in the dictionary based on the parsed key parts."""
+    current_obj: DictOrList = current
+
+    # Handle all parts except the last one
+    for i, (part, is_array) in enumerate(parts[:-1]):
+        if is_array:
+            current_obj = _handle_array_part(current_obj, part, parts[i - 1][0] if i > 0 else None)
+        else:
+            next_is_array = i + 1 < len(parts) and parts[i + 1][1]
+            array_idx = int(parts[i][0]) if isinstance(current_obj, list) else None
+            current_obj = _handle_dict_part(current_obj, part, next_is_array, array_idx)
+
+    # Handle the last part
+    last_part, is_array = parts[-1]
+    if is_array:
+        _handle_last_array_part(current_obj, last_part, value, parts)
+    else:
+        _handle_last_dict_part(current_obj, last_part, value, parts)
+
+
+def unflatten_dict(input_dict: dict[str, Any]) -> dict[str, Any]:
     """
     Converts a flattened dictionary with dot notation and array notation into a nested structure.
 
     This function transforms a dictionary with flattened keys (using dot notation for nested objects
-    and bracket notation for arrays) into a nested dictionary or list structure. It handles both
-    object-like nesting (using dots) and array-like nesting (using brackets).
+    and bracket notation for arrays) into a nested dictionary structure. It uses the notation to determine
+    whether a value should be a dictionary or list.
 
     Args:
-        input_dict (dict[Any, Any]): A dictionary with flattened keys. Keys can use dot notation
+        input_dict (dict[str, Any]): A dictionary with flattened keys. Keys can use dot notation
             (e.g., "person.name") or array notation (e.g., "addresses[0].street").
 
     Returns:
-        Union[dict[Any, Any], list]: A nested dictionary or list structure. Returns a list if all
-        top-level keys are consecutive integer strings starting from 0.
+        dict[str, Any]: A nested dictionary structure. Lists are created only when using array notation.
 
     Examples:
         >>> unflatten_dict({"person.name": "John", "person.age": 30})
@@ -67,82 +239,29 @@ def unflatten_dict(input_dict: dict[str, Any]) -> dict[str, Any] | list:
         {'addresses': [{'street': 'Main St'}, {'street': 'Broadway'}]}
 
         >>> unflatten_dict({"0": "first", "1": "second"})
-        ['first', 'second']
-
-    Notes:
-        - The function recursively processes nested structures
-        - If all keys at any level are consecutive integers starting from 0, that level will be
-          converted to a list
-        - The function preserves the original values for non-nested keys
-        - Keys are sorted before processing to ensure consistent results
-
-    Attribution:
-        - This function is based on the answer by user "djtubig-malicex" on Stack Overflow: https://stackoverflow.com/a/67905359/27947364
+        {'0': 'first', '1': 'second'}
     """
     if not input_dict:
         return {}
 
-    new_dict: dict[Any, Any] = {}
-    field_keys = sorted(input_dict.keys())
-
-    def _decompose_key(key: str) -> tuple[str | int | None, str | int | None]:
-        _key = str(key)
-        _current_key: str | int | None = None
-        _current_subkey: str | int | None = None
-
-        for idx, char in enumerate(_key):
-            if char == "[":
-                _current_key = _key[:idx]
-                start_subscript_index = idx + 1
-                end_subscript_index = _key.index("]")
-                _current_subkey = int(_key[start_subscript_index:end_subscript_index])
-
-                if len(_key[end_subscript_index:]) > 1:
-                    _current_subkey = f"{_current_subkey}.{_key[end_subscript_index + 2 :]}"
-                break
-            elif char == ".":
-                split_work = _key.split(".", 1)
-                if len(split_work) > 1:
-                    _current_key, _current_subkey = split_work
-                else:
-                    _current_key = split_work[0]
-                break
+    new_dict: dict[str, Any] = {}
 
-        return _current_key, _current_subkey
-
-    for each_key in field_keys:
-        field_value = input_dict[each_key]
-        current_key, current_subkey = _decompose_key(each_key)
-
-        if current_key is not None and current_subkey is not None:
-            if isinstance(current_key, str) and current_key.isdigit():
-                current_key = int(current_key)
-            if current_key not in new_dict:
-                new_dict[current_key] = {}
-            new_dict[current_key][current_subkey] = field_value
+    # Sort keys to ensure we process parents before children
+    field_keys = sorted(input_dict.keys())
+    for key in field_keys:
+        parts = _parse_key(key)
+        if not parts:
+            continue
+
+        # Handle the first part specially to ensure it's created in new_dict
+        first_part, is_array = parts[0]
+        if first_part not in new_dict:
+            new_dict[first_part] = {} if not is_array else []
+
+        # Set the value
+        if len(parts) == 1:
+            _handle_single_part(new_dict, first_part, is_array, input_dict[key])
         else:
-            new_dict[each_key] = field_value
-
-    all_digits = True
-    highest_digit = -1
-
-    for each_key, each_item in new_dict.items():
-        if isinstance(each_item, dict):
-            new_dict[each_key] = unflatten_dict(each_item)
-
-        all_digits &= str(each_key).isdigit()
-        if all_digits:
-            next_digit = int(each_key)
-            highest_digit = max(next_digit, highest_digit)
-
-    if all_digits and highest_digit == (len(new_dict) - 1):
-        digit_keys = sorted(new_dict.keys(), key=int)
-        new_list: list = [None] * (highest_digit + 1)
-
-        for k in digit_keys:
-            i = int(k)
-            new_list[i] = new_dict[k]
-
-        return new_list
+            _set_value(new_dict, parts, input_dict[key])
 
     return new_dict
diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py
@@ -184,8 +184,8 @@ async def retrieve(
 
             metadatas: Sequence = [dict(metadata) for batch in results.get("metadatas") or [] for metadata in batch]
 
-            # Remove the `# type: ignore` comment when https://github.com/deepsense-ai/ragbits/pull/379/files resolved
-            unflattened_metadatas: list[dict] = [unflatten_dict(metadata) if metadata else {} for metadata in metadatas]  # type: ignore[misc]
+            # Convert metadata back to nested structure
+            unflattened_metadatas: list[dict] = [unflatten_dict(metadata) if metadata else {} for metadata in metadatas]
 
             images: list[bytes | None] = [metadata.pop("__image", None) for metadata in unflattened_metadatas]
 
@@ -253,8 +253,8 @@ async def list(
             documents = results.get("documents") or []
             metadatas: Sequence = results.get("metadatas") or []
 
-            # Remove the `# type: ignore` comment when https://github.com/deepsense-ai/ragbits/pull/379/files resolved
-            unflattened_metadatas: list[dict] = [unflatten_dict(metadata) if metadata else {} for metadata in metadatas]  # type: ignore[misc]
+            # Convert metadata back to nested structure
+            unflattened_metadatas: list[dict] = [unflatten_dict(metadata) if metadata else {} for metadata in metadatas]
 
             images: list[bytes | None] = [metadata.pop("__image", None) for metadata in unflattened_metadatas]
 
diff --git a/packages/ragbits-core/tests/unit/utils/test_dict_transformations.py b/packages/ragbits-core/tests/unit/utils/test_dict_transformations.py