from __future__ import annotations
import logging
from threading import Lock
import time
from types import SimpleNamespace
import dask
from dask.distributed import Client, LocalCluster, progress
from aperturedb.Connector import Connector

import multiprocessing as mp

from aperturedb.Stats import Stats

logger = logging.getLogger(__name__)


class DaskManager:
    """Distributes ingestion of a dask DataFrame over a local dask cluster.

    Spins up a :class:`dask.distributed.LocalCluster`, maps an ingestion
    function over the partitions of ``generator.df``, and aggregates the
    per-partition :class:`~aperturedb.Stats.Stats` results.
    """

    def __init__(self, num_workers: int = -1):
        """
        Args:
            num_workers: number of dask workers to start. The default of -1
                means "auto": use 90% of the available CPU cores
                (one worker per core).
        """
        self.__num_workers = num_workers

    def run(self, db: Connector, generator, batchsize: int, stats: bool):
        """Ingest ``generator.df`` in parallel across dask workers.

        Args:
            db: connected Connector; only its host/port/session are shipped
                to workers (a live Connector is not serializable by dask).
            generator: data generator whose ``df`` is a dask DataFrame; its
                class is re-instantiated per batch on the workers.
            batchsize: number of rows per ingestion batch within a partition.
            stats: when True, display a dask progress bar.

        Returns:
            Tuple of (per-partition results, elapsed wall-clock seconds).
        """
        def process(df, host, port, session):
            """Ingest one DataFrame partition; returns Stats for the partition."""
            metrics = Stats()
            # Dask probes partitions with a 2-row sample whose values are all
            # 'foo' to infer column names/types. Skip it, but still return an
            # empty Stats so every partition yields a uniform result type.
            if len(df) == 2 and df.iloc[0, 0] == "foo":
                return metrics
            try:
                shared_data = SimpleNamespace()
                shared_data.session = session
                shared_data.lock = Lock()
                # Use a distinct name: rebinding `db` here would shadow the
                # closure's Connector and be unbound if construction fails.
                connection = Connector(
                    host=host, port=port, shared_data=shared_data)
            except Exception as e:
                # Without a connection the partition cannot be ingested;
                # log and propagate rather than continuing into a
                # confusing UnboundLocalError.
                logger.exception(e)
                raise
            # Imported lazily to avoid a circular import at module load time.
            from aperturedb.ParallelLoader import ParallelLoader
            loader = ParallelLoader(connection)
            for start in range(0, len(df), batchsize):
                end = min(start + batchsize, len(df))
                batch = df[start:end]
                data = generator.__class__(filename="", df=batch)
                loader.ingest(generator=data, batchsize=len(batch),
                              numthreads=1, stats=False)
            # The loader accumulates timing/error counters across ingest
            # calls, so fold them into the partition metrics once at the end.
            metrics.times_arr.extend(loader.times_arr)
            metrics.error_counter += loader.error_counter
            return metrics

        # -1 is the "auto" sentinel: use 90% of the cores, 1 worker per core.
        # TODO: see if the same pool can be reused for multiple tasks.
        workers = self.__num_workers if self.__num_workers != \
            -1 else int(0.9 * mp.cpu_count())
        with LocalCluster(n_workers=workers) as cluster, Client(cluster) as client:
            dask.config.set(scheduler="distributed")
            start_time = time.time()
            # A live Connector cannot be pickled and shipped to workers,
            # so pass host/port and the session instead and let each
            # partition build its own connection.
            computation = generator.df.map_partitions(
                process,
                db.host,
                db.port,
                db.shared_data.session)
            computation = computation.persist()
            if stats:
                progress(computation)
            results = computation.compute()
            return results, time.time() - start_time