|
| 1 | +import dataclasses |
| 2 | +import hashlib |
| 3 | +import io |
| 4 | +import json |
| 5 | +import logging |
| 6 | +import PIL.GifImagePlugin |
| 7 | +import mlcroissant as mlc |
| 8 | +import PIL.Image |
| 9 | +import pandas as pd |
| 10 | + |
| 11 | +from typing import Any, List, Tuple |
| 12 | + |
| 13 | +from aperturedb.Subscriptable import Subscriptable |
| 14 | +from aperturedb.Query import QueryBuilder |
| 15 | +from aperturedb.DataModels import IdentityDataModel |
| 16 | +from aperturedb.Query import generate_add_query |
| 17 | + |
| 18 | + |
logger = logging.getLogger(__name__)


# Fixed "_ref" used to connect each ingested record to the RecordSetModel
# found in the same transaction (see MLCroissantRecordSet.getitem); chosen
# large so it cannot collide with the small refs used for sub-records.
MAX_REF_VALUE = 99999
# This is useful to identify the class of the record in ApertureDB.
CLASS_PROPERTY_NAME = "adb_class_name"
| 25 | + |
| 26 | + |
class RecordSetModel(IdentityDataModel):
    # Persisted representation of one croissant RecordSet.
    # uuid mirrors the croissant record set's uuid so records can later be
    # connected back to this entity (see MLCroissantRecordSet.getitem).
    name: str
    description: str = ""
    uuid: str = ""
| 32 | + |
class DatasetModel(IdentityDataModel):
    """Persisted representation of a croissant dataset and its record sets."""
    url: str = ""
    name: str = "Croissant Dataset automatically ingested into ApertureDB"
    # Plain string literal; the original used an f-string with no
    # placeholders (lint F541) — same value, no interpolation needed.
    description: str = "A dataset loaded from a croissant json-ld"
    version: str = "1.0.0"
    record_sets: List[RecordSetModel] = dataclasses.field(default_factory=list)
| 40 | + |
def deserialize_record(record):
    """Convert one raw croissant record value into a JSON/ApertureDB-safe type.

    Handles the value kinds croissant yields: ``None``, ``bytes``,
    ``pd.Timestamp`` / ``pd.NaT``, strings that embed JSON, and nested
    lists/dicts (which are converted recursively).

    Args:
        record: A single field value from a croissant record.

    Returns:
        A plain Python value (str, number, dict, or list) suitable for use
        as an ApertureDB property or for further processing.
    """
    deserialized = record
    if record is None:
        deserialized = "Not Available"
    if isinstance(record, bytes):
        # assumes text payloads are UTF-8 — TODO confirm against sources
        deserialized = record.decode('utf-8')
    if isinstance(record, pd.Timestamp):
        # ApertureDB date properties are expressed as {"_date": iso-string}.
        deserialized = {"_date": record.to_pydatetime().isoformat()}
    # BUG FIX: pd.NaT compares unequal to everything, so the original
    # `record == pd.NaT` was always False and this branch never fired.
    # An identity check is required.
    if record is pd.NaT:
        deserialized = "Not Available Time"
    if isinstance(deserialized, str):
        if deserialized.startswith(("[", "{")):
            # If it looks like a list or dict, try to parse it as JSON.
            try:
                deserialized = json.loads(deserialized)
            except json.JSONDecodeError:
                logger.info(f"Failed to parse JSON: {deserialized}")
                # Some sources single-quote their "JSON"; retry after a
                # naive quote substitution, keeping the string on failure.
                try:
                    deserialized = json.loads(deserialized.replace("'", "\""))
                except Exception as e:
                    logger.info(
                        f"Failed to parse JSON: {deserialized} with error {e}")

    # Recurse into containers so nested values get the same treatment.
    if isinstance(deserialized, list):
        deserialized = [deserialize_record(item) for item in deserialized]
    if isinstance(deserialized, dict):
        deserialized = {k: deserialize_record(v)
                        for k, v in deserialized.items()}

    return deserialized
| 81 | + |
| 82 | + |
def persist_metadata(dataset: mlc.Dataset, url: str) -> Tuple[List[dict], List[bytes]]:
    """Build the ApertureDB query and blobs that persist a croissant
    dataset's metadata (the dataset itself plus one entity per record set).

    Args:
        dataset: The loaded croissant dataset.
        url: Source URL of the croissant json-ld, stored on the entity.

    Returns:
        Tuple of (query commands, blobs) ready to be executed.
    """
    metadata = dataset.metadata
    record_sets = [
        RecordSetModel(name=rs.name, description=rs.description, uuid=rs.uuid)
        for rs in metadata.record_sets
    ]
    model = DatasetModel(
        url=url,
        name=metadata.name,
        description=metadata.description,
        version=metadata.version or "1.0.0",
        record_sets=record_sets,
    )
    query, blobs, _ = generate_add_query(model)
    return query, blobs
| 101 | + |
| 102 | + |
def try_parse(value: str) -> Any:
    """Attempt to parse a string value into a more appropriate type.

    A value that looks like an http(s) URL is downloaded and, when the
    payload is a decodable image, returned as a ``PIL.Image.Image``.
    Any other value is returned stripped of surrounding whitespace.
    """
    parsed = value.strip()

    # Only treat proper URLs as remote content: the original prefix test
    # ("http") also matched ordinary words such as "httpstatus".
    if parsed.startswith(("http://", "https://")):
        # Download the content from the URL (3 retries).
        from aperturedb.Sources import Sources
        sources = Sources(n_download_retries=3)
        result, buffer = sources.load_from_http_url(
            parsed, validator=lambda x: True)
        if result:
            try:
                parsed = PIL.Image.open(io.BytesIO(buffer))
            except PIL.UnidentifiedImageError:
                # Payload is not an image; keep the URL string as-is
                # instead of crashing (the function is best-effort).
                pass

    return parsed
| 117 | + |
| 118 | + |
def dict_to_query(row_dict, name: str, flatten_json: bool) -> Any:
    """Translate one (possibly nested) record into ApertureDB add-commands.

    Args:
        row_dict: Mapping of field name -> raw croissant value.
        name: Entity class name; only the last "/"-segment is used, and it
            is sanitized to comply with ApertureDB naming conventions.
        flatten_json: When True, list-valued fields become connected
            sub-entities and the record gets a content-hash "adb_uuid" so
            re-ingestion is idempotent.

    Returns:
        Tuple of (list of command dicts, list of blob bytes). Blob order
        matches the order of the blob-consuming commands in the list.
    """
    literals = {}           # scalar properties of the entity
    subitems = {}           # list-valued fields ingested as sub-entities
    known_image_blobs = {}  # fields recognized as images
    unknown_blobs = {}      # fields persisted as opaque blobs
    o_literals = {}         # original values, kept for debugging

    name = name.split("/")[-1]  # Use the last part of the name.
    # If name is not specified, or begins with _, this ensures that it
    # complies with the ApertureDB naming conventions.
    if not name or name.startswith("_"):
        safe_name = f"E_{name or 'Record'}"
        logger.warning(
            f"Entity Name '{name}' is not valid. Using {safe_name}.")
        name = safe_name

    for k, v in row_dict.items():
        k = k.split("/")[-1]  # Use the last part of the key.
        if not k or k.startswith("_"):
            safe_key = f"F_{k or 'Field'}"
            logger.warning(
                f"Property name '{k}' is not valid. Using {safe_key}.")
            k = safe_key
        item = v
        # Pre processed items from croissant.
        if isinstance(item, PIL.Image.Image):
            # NOTE(review): item.format is None for synthesized images and
            # would make save() raise — assumed decoded-from-file here.
            buffer = io.BytesIO()
            item.save(buffer, format=item.format)
            known_image_blobs[k] = buffer.getvalue()
            continue

        record = deserialize_record(item)
        if isinstance(record, str):
            record = try_parse(record)

        # Post processed items from SDK. GIFs are stored as opaque blobs,
        # other PIL images as ApertureDB images.
        if isinstance(record, PIL.GifImagePlugin.GifImageFile):
            buffer = io.BytesIO()
            record.save(buffer, format=record.format)
            unknown_blobs[k] = buffer.getvalue()
            continue

        if isinstance(record, PIL.Image.Image):
            buffer = io.BytesIO()
            record.save(buffer, format=record.format)
            known_image_blobs[k] = buffer.getvalue()
            continue

        if flatten_json and isinstance(record, list):
            subitems[k] = record
        else:
            literals[k] = record
        # Original value from croissant. This is useful for debugging.
        o_literals[k] = item

    if flatten_json:
        # Content hash over the scalar properties: identical records map to
        # the same adb_uuid, which backs the "if_not_found" clause below.
        str_rep = "".join([f"{str(k)}{str(v)}" for k, v in literals.items()])
        literals["adb_uuid"] = hashlib.sha256(
            str_rep.encode('utf-8')).hexdigest()

    literals[CLASS_PROPERTY_NAME] = name
    q = QueryBuilder.add_command(name, {
        "properties": literals,
        "connect": {
            # MAX_REF_VALUE is the _ref of the FindEntity(RecordSetModel)
            # command issued by the caller (MLCroissantRecordSet.getitem).
            "ref": MAX_REF_VALUE,
            "class": "hasRecord",
            "direction": "in",
        }
    })
    if flatten_json:
        q[list(q.keys())[-1]]["if_not_found"] = {
            "adb_uuid": ["==", literals["adb_uuid"]]
        }

    dependents = []
    blobs = []
    if len(subitems) > 0 or len(known_image_blobs) > 0 or len(unknown_blobs) > 0:
        # Dependent commands connect back to this record via _ref 1.
        q[list(q.keys())[-1]]["_ref"] = 1

    for key in subitems:
        for item in subitems[key]:
            subitem_query, subitem_blobs = dict_to_query(
                item, f"{name}.{key}", flatten_json)
            subitem_query[0][list(subitem_query[0].keys())[-1]]["connect"] = {
                "ref": 1,
                "class": key,
                "direction": "in",
            }
            dependents.extend(subitem_query)
            # BUG FIX: the original discarded the blobs returned by the
            # recursive call (and then rebound `blobs = []`), which
            # desynchronized blob payloads from their add-commands.
            blobs.extend(subitem_blobs)

    from aperturedb.Query import ObjectType
    for blob_key, blob_bytes in known_image_blobs.items():
        image_query = QueryBuilder.add_command(ObjectType.IMAGE, {
            "properties": {CLASS_PROPERTY_NAME: literals[CLASS_PROPERTY_NAME] + "." + "image"},
            "connect": {
                "ref": 1,
                "class": blob_key,
                "direction": "in"
            }
        })
        blobs.append(blob_bytes)
        dependents.append(image_query)

    for blob_key, blob_bytes in unknown_blobs.items():
        blob_query = QueryBuilder.add_command(ObjectType.BLOB, {
            "properties": {CLASS_PROPERTY_NAME: literals[CLASS_PROPERTY_NAME] + "." + "blob"},
            "connect": {
                "ref": 1,
                "class": blob_key,
                "direction": "in"
            }
        })
        blobs.append(blob_bytes)
        dependents.append(blob_query)

    return [q] + dependents, blobs
| 236 | + |
| 237 | + |
class MLCroissantRecordSet(Subscriptable):
    """Subscriptable view over a croissant RecordSet that yields, per record,
    the ApertureDB query + blobs needed to ingest it."""

    def __init__(
            self,
            record_set: mlc.Records,
            name: str,
            flatten_json: bool,
            sample_count: int = 0,
            uuid: str = None):
        """Materialize up to ``sample_count`` records from ``record_set``.

        Args:
            record_set: Iterable of croissant records.
            name: Entity class name used for each ingested record.
            flatten_json: Passed through to dict_to_query.
            sample_count: Number of records to materialize; 0 means all
                (the ``count == sample_count`` check never fires).
            uuid: uuid of the RecordSetModel persisted by persist_metadata,
                used to connect each record to it.
        """
        self.record_set = record_set
        self.uuid = uuid
        samples = []
        count = 0
        for record in record_set:
            samples.append(dict(record))
            count += 1
            if count == sample_count:
                break

        self.samples = samples
        self.sample_count = len(samples)
        self.name = name
        self.flatten_json = flatten_json
        # Entity classes for which a CreateIndex has already been emitted.
        self.indexed_entities = set()

    def getitem(self, subscript):
        row_dict = self.samples[subscript]

        # Locate the RecordSetModel so each record can connect to it via
        # MAX_REF_VALUE (see the "connect" clause built in dict_to_query).
        find_recordset_query = QueryBuilder.find_command(
            "RecordSetModel", {
                "_ref": MAX_REF_VALUE,
                "constraints": {
                    "uuid": ["==", self.uuid]
                }
            })

        q, blobs = dict_to_query(row_dict, self.name, self.flatten_json)
        indexes_to_create = []
        for command in q:
            cmd = list(command.keys())[-1]
            # Blob-like objects are not indexed on adb_uuid.
            if cmd in ["AddImage", "AddBlob", "AddVideo"]:
                continue
            indexable_entity = command[cmd]["class"]
            if indexable_entity not in self.indexed_entities:
                index_command = {
                    "CreateIndex": {
                        "class": indexable_entity,
                        "index_type": "entity",
                        "property_key": "adb_uuid",
                    }
                }
                indexes_to_create.append(index_command)
                # BUG FIX: record the class; the original never added to
                # indexed_entities, so a CreateIndex command was emitted
                # for every record instead of only the first one.
                self.indexed_entities.add(indexable_entity)
        return indexes_to_create + [find_recordset_query] + q, blobs

    def __len__(self):
        return len(self.samples)
0 commit comments