import hashlib
import io
import json
from typing import List, Optional, Tuple

import mlcroissant as mlc
import pandas as pd
import PIL.Image
from pydantic import Field

from aperturedb.CommonLibrary import execute_query
from aperturedb.DataModels import IdentityDataModel
from aperturedb.Query import ObjectType, QueryBuilder, generate_add_query
from aperturedb.Subscriptable import Subscriptable

# Sentinel _ref used to link each record back to the record set found at the
# start of the same transaction.
MAX_REF_VALUE = 99999


class RecordSetModel(IdentityDataModel):
    name: str
    description: str = ""
    uuid: str = ""


class DatasetModel(IdentityDataModel):
    name: str = "Croissant Dataset automatically ingested into ApertureDB"
    description: str = "A dataset loaded from a Croissant JSON-LD file"
    version: str = "1.0.0"
    record_sets: List[RecordSetModel] = Field(default_factory=list)


def deserialize_record(record):
    """Convert a single value from a Croissant record into something that can
    be stored as an ApertureDB property.

    Handles the value types Croissant records are expected to contain:
    ``None``, UTF-8 encoded ``bytes``, ``pd.Timestamp``/``pd.NaT``,
    JSON-encoded strings, and (possibly nested) lists and dicts of the above.

    Args:
        record: A value taken from a Croissant record.

    Returns:
        The deserialized value: a JSON-compatible scalar, list, or dict, with
        timestamps converted to ApertureDB's ``{"_date": ...}`` format.
    """
    deserialized = record
    if record is None:
        deserialized = "Not Available"
    if isinstance(record, bytes):
        deserialized = record.decode('utf-8')
    if isinstance(record, pd.Timestamp):
        deserialized = {"_date": record.to_pydatetime().isoformat()}
    if record is pd.NaT:
        deserialized = "Not Available Time"
    if isinstance(deserialized, str):
        # Strings may carry embedded JSON; fall back to the raw string if not.
        try:
            deserialized = json.loads(deserialized)
        except json.JSONDecodeError:
            pass
    if isinstance(deserialized, list):
        deserialized = [deserialize_record(item) for item in deserialized]
    if isinstance(deserialized, dict):
        deserialized = {k: deserialize_record(v)
                        for k, v in deserialized.items()}

    return deserialized
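

# A quick illustration of the conversions above (a sketch; the inputs are
# invented for demonstration, not taken from any particular dataset):
#
#   deserialize_record(b'{"answer": 42}')           -> {"answer": 42}
#   deserialize_record(pd.Timestamp("2024-01-02"))  -> {"_date": "2024-01-02T00:00:00"}
#   deserialize_record([None, b"text"])             -> ["Not Available", "text"]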


def persist_metadata(dataset: mlc.Dataset) -> Tuple[List[dict], List[bytes]]:
    """Build the query that stores the dataset and its record sets as
    connected entities in ApertureDB."""
    ds = DatasetModel(
        name=dataset.metadata.name,
        description=dataset.metadata.description,
        version=dataset.metadata.version or "1.0.0",
        record_sets=[RecordSetModel(
            name=rs.name,
            description=rs.description,
            uuid=rs.uuid,
        ) for rs in dataset.metadata.record_sets]
    )
    q, b, _ = generate_add_query(ds)

    return q, b


def dict_to_query(row_dict, name: str,
                  flatten_json: bool) -> Tuple[List[dict], List[bytes]]:
    """Turn one flattened record into an ApertureDB query: an AddEntity for
    the record itself, plus dependent commands for nested lists and images."""
    literals = {}
    subitems = {}
    blobs = {}

    # Prefixing the class name ensures it complies with ApertureDB naming
    # conventions even when `name` is empty or begins with an underscore.
    name = f"E_{name or 'Record'}"

    for k, v in row_dict.items():
        k = f"F_{k}"
        item = v
        if isinstance(item, PIL.Image.Image):
            # Re-encode the image so it can be sent as a blob. Fall back to
            # PNG for images created in memory, which have no format.
            buffer = io.BytesIO()
            item.save(buffer, format=item.format or "PNG")
            blobs[k] = buffer.getvalue()
            continue

        record = deserialize_record(item)
        if flatten_json and isinstance(record, list):
            subitems[k] = record
        else:
            literals[k] = record

    if flatten_json:
        # Derive a stable identifier from the record's contents so that
        # re-ingesting the same record does not create a duplicate entity.
        str_rep = "".join([f"{str(k)}{str(v)}" for k, v in literals.items()])
        literals["adb_uuid"] = hashlib.sha256(
            str_rep.encode('utf-8')).hexdigest()

    literals["adb_class_name"] = name
    q = QueryBuilder.add_command(name, {
        "properties": literals,
        "connect": {
            "ref": MAX_REF_VALUE,
            "class": "hasRecord",
            "direction": "in",
        }
    })
    if flatten_json:
        q[next(iter(q))]["if_not_found"] = {
            "adb_uuid": ["==", literals["adb_uuid"]]
        }

    dependents = []
    image_blobs = []
    if len(subitems) > 0 or len(blobs) > 0:
        # Note: this assumes nested records are leaves; deeper nesting would
        # need a unique _ref allocator to avoid colliding on _ref 1.
        q[next(iter(q))]["_ref"] = 1

    for key in subitems:
        for item in subitems[key]:
            sub_commands, sub_blobs = dict_to_query(
                item, f"{name}.{key}", flatten_json)
            sub_commands[0][next(iter(sub_commands[0]))]["connect"] = {
                "ref": 1,
                "class": key,
                "direction": "out",
            }
            dependents.extend(sub_commands)
            image_blobs.extend(sub_blobs)

    for key, blob_bytes in blobs.items():
        # The record's literal properties are duplicated onto the image so
        # that it remains searchable on its own.
        image_query = QueryBuilder.add_command(ObjectType.IMAGE, {
            "properties": literals,
            "connect": {
                "ref": 1,
                "class": key,
                "direction": "out"
            }
        })
        image_blobs.append(blob_bytes)
        dependents.append(image_query)

    return [q] + dependents, image_blobs
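

# For orientation, a record like {"label": "cat", "image": <PIL.Image>} with
# flatten_json=True produces roughly the following (a sketch; property values
# abridged):
#
#   [{"AddEntity": {"class": "E_...", "_ref": 1, "if_not_found": {...},
#                   "properties": {"F_label": "cat", "adb_uuid": "...", ...},
#                   "connect": {"ref": 99999, "class": "hasRecord",
#                               "direction": "in"}}},
#    {"AddImage": {"properties": {...},
#                  "connect": {"ref": 1, "class": "F_image",
#                              "direction": "out"}}}],
#   [<PNG bytes>]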


class MLCroissantRecordSet(Subscriptable):
    """Wraps an mlcroissant record set so that each item yields the
    ApertureDB query (and blobs) that ingests the corresponding record."""

    def __init__(
            self,
            record_set: mlc.Records,
            name: str,
            flatten_json: bool,
            sample_count: int = 0,
            uuid: Optional[str] = None):
        self.record_set = record_set
        self.uuid = uuid
        samples = []
        count = 0
        # sample_count == 0 means take every record.
        for record in record_set:
            samples.append(dict(record))
            count += 1
            if count == sample_count:
                break

        self.df = pd.json_normalize(samples)
        self.sample_count = len(samples)
        self.name = name
        self.flatten_json = flatten_json
        self.indexed_entities = set()

    def getitem(self, subscript):
        row = self.df.iloc[subscript]
        # Convert the row to a dictionary
        row_dict = row.to_dict()

        # Locate the parent RecordSetModel so the record can connect to it.
        find_recordset_query = QueryBuilder.find_command(
            "RecordSetModel", {
                "_ref": MAX_REF_VALUE,
                "constraints": {
                    "uuid": ["==", self.uuid]
                }
            })

        q, blobs = dict_to_query(row_dict, self.name, self.flatten_json)

        # Create an index on adb_uuid for every entity class not seen before,
        # so the if_not_found lookups stay fast.
        indexes_to_create = []
        for command in q:
            cmd = next(iter(command))
            if cmd == "AddImage":
                continue
            indexable_entity = command[cmd]["class"]
            if indexable_entity not in self.indexed_entities:
                self.indexed_entities.add(indexable_entity)
                index_command = {
                    "CreateIndex": {
                        "class": indexable_entity,
                        "index_type": "entity",
                        "property_key": "adb_uuid",
                    }
                }
                indexes_to_create.append(index_command)
        return indexes_to_create + [find_recordset_query] + q, blobs

    def __len__(self):
        return len(self.df)
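

if __name__ == "__main__":
    # Minimal end-to-end sketch of how the pieces above fit together.
    # Assumptions (not part of the code above): an ApertureDB instance
    # reachable via create_connector(), and a hosted Croissant JSON-LD;
    # the URL below is a placeholder.
    from aperturedb.CommonLibrary import create_connector
    from aperturedb.ParallelLoader import ParallelLoader

    dataset = mlc.Dataset(jsonld="https://example.com/croissant.json")

    # Persist the dataset/record-set metadata first so that the records
    # ingested below can connect to their RecordSetModel.
    client = create_connector()
    query, query_blobs = persist_metadata(dataset)
    execute_query(client, query, query_blobs)

    # Ingest up to 100 sample records from each record set.
    for rs in dataset.metadata.record_sets:
        generator = MLCroissantRecordSet(
            dataset.records(record_set=rs.uuid),
            name=rs.name,
            flatten_json=True,
            sample_count=100,
            uuid=rs.uuid,
        )
        ParallelLoader(client).ingest(generator)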