|
| 1 | +import dataclasses |
| 2 | +import hashlib |
| 3 | +import io |
| 4 | +import json |
| 5 | +import logging |
| 6 | +import PIL.GifImagePlugin |
| 7 | +import mlcroissant as mlc |
| 8 | +import PIL.Image |
| 9 | +import pandas as pd |
| 10 | + |
| 11 | +from typing import Any, List, Tuple |
| 12 | + |
| 13 | +from aperturedb.Subscriptable import Subscriptable |
| 14 | +from aperturedb.Query import QueryBuilder |
| 15 | +from aperturedb.DataModels import IdentityDataModel |
| 16 | +from aperturedb.Query import generate_add_query |
| 17 | + |
| 18 | + |
logger = logging.getLogger(__name__)


# Fixed "_ref" used to connect each ingested record to the RecordSetModel
# found in the same transaction (see MLCroissantRecordSet.getitem); chosen
# large so it cannot collide with the small refs used for sub-records.
MAX_REF_VALUE = 99999
# This is useful to identify the class of the record in ApertureDB.
CLASS_PROPERTY_NAME = "adb_class_name"
| 25 | + |
| 26 | + |
class RecordSetModel(IdentityDataModel):
    # Persisted representation of one croissant RecordSet.
    # uuid mirrors the croissant record set's uuid so records can later be
    # connected back to this entity (see MLCroissantRecordSet.getitem).
    name: str
    description: str = ""
    uuid: str = ""
| 32 | + |
class DatasetModel(IdentityDataModel):
    """Persisted representation of a croissant dataset and its record sets."""
    url: str = ""
    name: str = "Croissant Dataset automatically ingested into ApertureDB"
    # Plain string literal; the original used an f-string with no
    # placeholders (lint F541) — same value, no interpolation needed.
    description: str = "A dataset loaded from a croissant json-ld"
    version: str = "1.0.0"
    record_sets: List[RecordSetModel] = dataclasses.field(default_factory=list)
| 40 | + |
def deserialize_record(record):
    """Convert one raw croissant record value into a JSON/ApertureDB-safe type.

    Handles the value kinds croissant yields: ``None``, ``bytes``,
    ``pd.Timestamp`` / ``pd.NaT``, strings that embed JSON, and nested
    lists/dicts (which are converted recursively).

    Args:
        record: A single field value from a croissant record.

    Returns:
        A plain Python value (str, number, dict, or list) suitable for use
        as an ApertureDB property or for further processing.
    """
    deserialized = record
    if record is None:
        deserialized = "Not Available"
    if isinstance(record, bytes):
        # assumes text payloads are UTF-8 — TODO confirm against sources
        deserialized = record.decode('utf-8')
    if isinstance(record, pd.Timestamp):
        # ApertureDB date properties are expressed as {"_date": iso-string}.
        deserialized = {"_date": record.to_pydatetime().isoformat()}
    # BUG FIX: pd.NaT compares unequal to everything, so the original
    # `record == pd.NaT` was always False and this branch never fired.
    # An identity check is required.
    if record is pd.NaT:
        deserialized = "Not Available Time"
    if isinstance(deserialized, str):
        if deserialized.startswith(("[", "{")):
            # If it looks like a list or dict, try to parse it as JSON.
            try:
                deserialized = json.loads(deserialized)
            except json.JSONDecodeError:
                logger.info(f"Failed to parse JSON: {deserialized}")
                # Some sources single-quote their "JSON"; retry after a
                # naive quote substitution, keeping the string on failure.
                try:
                    deserialized = json.loads(deserialized.replace("'", "\""))
                except Exception as e:
                    logger.info(
                        f"Failed to parse JSON: {deserialized} with error {e}")

    # Recurse into containers so nested values get the same treatment.
    if isinstance(deserialized, list):
        deserialized = [deserialize_record(item) for item in deserialized]
    if isinstance(deserialized, dict):
        deserialized = {k: deserialize_record(v)
                        for k, v in deserialized.items()}

    return deserialized
| 81 | + |
| 82 | + |
def persist_metadata(dataset: mlc.Dataset, url: str) -> Tuple[List[dict], List[bytes]]:
    """Build the ApertureDB query and blobs that persist a croissant
    dataset's metadata (the dataset itself plus one entity per record set).

    Args:
        dataset: The loaded croissant dataset.
        url: Source URL of the croissant json-ld, stored on the entity.

    Returns:
        Tuple of (query commands, blobs) ready to be executed.
    """
    metadata = dataset.metadata
    record_sets = [
        RecordSetModel(name=rs.name, description=rs.description, uuid=rs.uuid)
        for rs in metadata.record_sets
    ]
    model = DatasetModel(
        url=url,
        name=metadata.name,
        description=metadata.description,
        version=metadata.version or "1.0.0",
        record_sets=record_sets,
    )
    query, blobs, _ = generate_add_query(model)
    return query, blobs
| 101 | + |
| 102 | + |
def try_parse(value: str) -> Any:
    """Attempt to parse a string value into a more appropriate type.

    A value that looks like an http(s) URL is downloaded and, when the
    payload is a decodable image, returned as a ``PIL.Image.Image``.
    Any other value is returned stripped of surrounding whitespace.
    """
    parsed = value.strip()

    # Only treat proper URLs as remote content: the original prefix test
    # ("http") also matched ordinary words such as "httpstatus".
    if parsed.startswith(("http://", "https://")):
        # Download the content from the URL (3 retries).
        from aperturedb.Sources import Sources
        sources = Sources(n_download_retries=3)
        result, buffer = sources.load_from_http_url(
            parsed, validator=lambda x: True)
        if result:
            try:
                parsed = PIL.Image.open(io.BytesIO(buffer))
            except PIL.UnidentifiedImageError:
                # Payload is not an image; keep the URL string as-is
                # instead of crashing (the function is best-effort).
                pass

    return parsed
| 117 | + |
| 118 | + |
def dict_to_query(row_dict, name: str, flatten_json: bool) -> Any:
    """Translate one (possibly nested) record into ApertureDB add-commands.

    Args:
        row_dict: Mapping of field name -> raw croissant value.
        name: Entity class name; only the last "/"-segment is used, and it
            is sanitized to comply with ApertureDB naming conventions.
        flatten_json: When True, list-valued fields become connected
            sub-entities and the record gets a content-hash "adb_uuid" so
            re-ingestion is idempotent.

    Returns:
        Tuple of (list of command dicts, list of blob bytes). Blob order
        matches the order of the blob-consuming commands in the list.
    """
    literals = {}           # scalar properties of the entity
    subitems = {}           # list-valued fields ingested as sub-entities
    known_image_blobs = {}  # fields recognized as images
    unknown_blobs = {}      # fields persisted as opaque blobs
    o_literals = {}         # original values, kept for debugging

    name = name.split("/")[-1]  # Use the last part of the name.
    # If name is not specified, or begins with _, this ensures that it
    # complies with the ApertureDB naming conventions.
    if not name or name.startswith("_"):
        safe_name = f"E_{name or 'Record'}"
        logger.warning(
            f"Entity Name '{name}' is not valid. Using {safe_name}.")
        name = safe_name

    for k, v in row_dict.items():
        k = k.split("/")[-1]  # Use the last part of the key.
        if not k or k.startswith("_"):
            safe_key = f"F_{k or 'Field'}"
            logger.warning(
                f"Property name '{k}' is not valid. Using {safe_key}.")
            k = safe_key
        item = v
        # Pre processed items from croissant.
        if isinstance(item, PIL.Image.Image):
            # NOTE(review): item.format is None for synthesized images and
            # would make save() raise — assumed decoded-from-file here.
            buffer = io.BytesIO()
            item.save(buffer, format=item.format)
            known_image_blobs[k] = buffer.getvalue()
            continue

        record = deserialize_record(item)
        if isinstance(record, str):
            record = try_parse(record)

        # Post processed items from SDK. GIFs are stored as opaque blobs,
        # other PIL images as ApertureDB images.
        if isinstance(record, PIL.GifImagePlugin.GifImageFile):
            buffer = io.BytesIO()
            record.save(buffer, format=record.format)
            unknown_blobs[k] = buffer.getvalue()
            continue

        if isinstance(record, PIL.Image.Image):
            buffer = io.BytesIO()
            record.save(buffer, format=record.format)
            known_image_blobs[k] = buffer.getvalue()
            continue

        if flatten_json and isinstance(record, list):
            subitems[k] = record
        else:
            literals[k] = record
        # Original value from croissant. This is useful for debugging.
        o_literals[k] = item

    if flatten_json:
        # Content hash over the scalar properties: identical records map to
        # the same adb_uuid, which backs the "if_not_found" clause below.
        str_rep = "".join([f"{str(k)}{str(v)}" for k, v in literals.items()])
        literals["adb_uuid"] = hashlib.sha256(
            str_rep.encode('utf-8')).hexdigest()

    literals[CLASS_PROPERTY_NAME] = name
    q = QueryBuilder.add_command(name, {
        "properties": literals,
        "connect": {
            # MAX_REF_VALUE is the _ref of the FindEntity(RecordSetModel)
            # command issued by the caller (MLCroissantRecordSet.getitem).
            "ref": MAX_REF_VALUE,
            "class": "hasRecord",
            "direction": "in",
        }
    })
    if flatten_json:
        q[list(q.keys())[-1]]["if_not_found"] = {
            "adb_uuid": ["==", literals["adb_uuid"]]
        }

    dependents = []
    blobs = []
    if len(subitems) > 0 or len(known_image_blobs) > 0 or len(unknown_blobs) > 0:
        # Dependent commands connect back to this record via _ref 1.
        q[list(q.keys())[-1]]["_ref"] = 1

    for key in subitems:
        for item in subitems[key]:
            subitem_query, subitem_blobs = dict_to_query(
                item, f"{name}.{key}", flatten_json)
            subitem_query[0][list(subitem_query[0].keys())[-1]]["connect"] = {
                "ref": 1,
                "class": key,
                "direction": "in",
            }
            dependents.extend(subitem_query)
            # BUG FIX: the original discarded the blobs returned by the
            # recursive call (and then rebound `blobs = []`), which
            # desynchronized blob payloads from their add-commands.
            blobs.extend(subitem_blobs)

    from aperturedb.Query import ObjectType
    for blob_key, blob_bytes in known_image_blobs.items():
        image_query = QueryBuilder.add_command(ObjectType.IMAGE, {
            "properties": {CLASS_PROPERTY_NAME: literals[CLASS_PROPERTY_NAME] + "." + "image"},
            "connect": {
                "ref": 1,
                "class": blob_key,
                "direction": "in"
            }
        })
        blobs.append(blob_bytes)
        dependents.append(image_query)

    for blob_key, blob_bytes in unknown_blobs.items():
        blob_query = QueryBuilder.add_command(ObjectType.BLOB, {
            "properties": {CLASS_PROPERTY_NAME: literals[CLASS_PROPERTY_NAME] + "." + "blob"},
            "connect": {
                "ref": 1,
                "class": blob_key,
                "direction": "in"
            }
        })
        blobs.append(blob_bytes)
        dependents.append(blob_query)

    return [q] + dependents, blobs
| 236 | + |
| 237 | + |
class MLCroissantRecordSet(Subscriptable):
    """Subscriptable view over a croissant RecordSet that yields, per record,
    the ApertureDB query + blobs needed to ingest it."""

    def __init__(
            self,
            record_set: mlc.Records,
            name: str,
            flatten_json: bool,
            sample_count: int = 0,
            uuid: str = None):
        """Materialize up to ``sample_count`` records from ``record_set``.

        Args:
            record_set: Iterable of croissant records.
            name: Entity class name used for each ingested record.
            flatten_json: Passed through to dict_to_query.
            sample_count: Number of records to materialize; 0 means all
                (the ``count == sample_count`` check never fires).
            uuid: uuid of the RecordSetModel persisted by persist_metadata,
                used to connect each record to it.
        """
        self.record_set = record_set
        self.uuid = uuid
        samples = []
        count = 0
        for record in record_set:
            samples.append(dict(record))
            count += 1
            if count == sample_count:
                break

        self.samples = samples
        self.sample_count = len(samples)
        self.name = name
        self.flatten_json = flatten_json
        # Entity classes for which a CreateIndex has already been emitted.
        self.indexed_entities = set()

    def getitem(self, subscript):
        row_dict = self.samples[subscript]

        # Locate the RecordSetModel so each record can connect to it via
        # MAX_REF_VALUE (see the "connect" clause built in dict_to_query).
        find_recordset_query = QueryBuilder.find_command(
            "RecordSetModel", {
                "_ref": MAX_REF_VALUE,
                "constraints": {
                    "uuid": ["==", self.uuid]
                }
            })

        q, blobs = dict_to_query(row_dict, self.name, self.flatten_json)
        indexes_to_create = []
        for command in q:
            cmd = list(command.keys())[-1]
            # Blob-like objects are not indexed on adb_uuid.
            if cmd in ["AddImage", "AddBlob", "AddVideo"]:
                continue
            indexable_entity = command[cmd]["class"]
            if indexable_entity not in self.indexed_entities:
                index_command = {
                    "CreateIndex": {
                        "class": indexable_entity,
                        "index_type": "entity",
                        "property_key": "adb_uuid",
                    }
                }
                indexes_to_create.append(index_command)
                # BUG FIX: record the class; the original never added to
                # indexed_entities, so a CreateIndex command was emitted
                # for every record instead of only the first one.
                self.indexed_entities.add(indexable_entity)
        return indexes_to_create + [find_recordset_query] + q, blobs

    def __len__(self):
        return len(self.samples)
0 commit comments