Skip to content

Commit a01ebfb

Browse files
authored
Merge pull request #161 from aperture-data/release-0.3.4
Release 0.3.4
2 parents 02f1ff2 + 4f10026 commit a01ebfb

21 files changed

+413
-140
lines changed

aperturedb/BBoxDataCSV.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,19 +57,20 @@ class BBoxDataCSV(CSVParser.CSVParser):
5757
5858
"""
5959

60-
def __init__(self, filename):
60+
def __init__(self, filename, df=None, use_dask=False):
6161

62-
super().__init__(filename)
62+
super().__init__(filename, df=df, use_dask=use_dask)
63+
if not use_dask:
64+
self.props_keys = [x for x in self.header[5:]
65+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
66+
self.constraints_keys = [x for x in self.header[5:]
67+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
6368

64-
self.props_keys = [x for x in self.header[5:]
65-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
66-
self.constraints_keys = [x for x in self.header[5:]
67-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
68-
69-
self.img_key = self.header[0]
70-
self.command = "AddBoundingBox"
69+
self.img_key = self.header[0]
70+
self.command = "AddBoundingBox"
7171

7272
def getitem(self, idx):
73+
idx = self.df.index.start + idx
7374

7475
q = []
7576

aperturedb/BlobDataCSV.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,19 +47,19 @@ class BlobDataCSV(CSVParser.CSVParser):
4747
id would be only inserted if it does not already exist in the database.
4848
"""
4949

50-
def __init__(self, filename):
50+
def __init__(self, filename, df=None, use_dask=False):
5151

52-
super().__init__(filename)
53-
54-
self.props_keys = [x for x in self.header[1:]
55-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX) and x != BLOB_PATH]
56-
self.constraints_keys = [x for x in self.header[1:]
57-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
58-
self.command = "AddBlob"
52+
super().__init__(filename, df=df, use_dask=use_dask)
53+
if not use_dask:
54+
self.props_keys = [x for x in self.header[1:]
55+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX) and x != BLOB_PATH]
56+
self.constraints_keys = [x for x in self.header[1:]
57+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
58+
self.command = "AddBlob"
5959

6060
def getitem(self, idx):
61+
idx = self.df.index.start + idx
6162
filename = self.df.loc[idx, BLOB_PATH]
62-
6363
blob_ok, blob = self.load_blob(filename)
6464
if not blob_ok:
6565
logger.error("Error loading blob: " + filename)

aperturedb/CSVParser.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import pandas as pd
22
import logging
33
from aperturedb.Subscriptable import Subscriptable
4+
from dask import dataframe
5+
import os
6+
import multiprocessing as mp
7+
48

59
logger = logging.getLogger(__name__)
610

@@ -9,24 +13,48 @@
913
PROPERTIES = "properties"
1014
CONSTRAINTS = "constraints"
1115

16+
# This number is based on the partitions one wants to use per core.
17+
PARTITIONS_PER_CORE = 10
1218

13-
class CSVParser(Subscriptable):
14-
"""**ApertureDB General CSV Parser for Loaders.**
15-
...
16-
"""
19+
# Use 90% of the CPU cores by default.
20+
CORES_USED_FOR_PARALLELIZATION = 0.9
1721

18-
def __init__(self, filename):
1922

20-
self.df = pd.read_csv(filename)
23+
class CSVParser(Subscriptable):
24+
"""
25+
**ApertureDB General CSV Parser for Loaders.**
26+
This operates in 2 modes:
27+
- **Normal Mode**: This is the default mode. It reads the CSV file into a Pandas DataFrame.
28+
- **Dask Mode**: This mode is used when the CSV file is too big to fit in memory, or multiprocessing is desired.
29+
It reads the CSV file into a Dask DataFrame.
30+
In Dask mode the CSV file is read in chunks, and the operations are performed on each chunk.
31+
The tricky bit is that the chunk size is not known till the loader is created, so the processing happens when ingest is called.
32+
So the Data CSV has another signature, where the df is passed explicitly.
33+
"""
2134

22-
self.validate()
35+
def __init__(self, filename, df=None, use_dask=False):
36+
self.use_dask = use_dask
37+
self.filename = filename
38+
39+
if not use_dask:
40+
if df is None:
41+
self.df = pd.read_csv(filename)
42+
else:
43+
self.df = df
44+
else:
45+
# It'll impact the number of partitions, and memory usage.
46+
# TODO: tune this for the best performance.
47+
cores_used = int(CORES_USED_FOR_PARALLELIZATION * mp.cpu_count())
48+
self.df = dataframe.read_csv(
49+
self.filename,
50+
blocksize = os.path.getsize(self.filename) // (cores_used * PARTITIONS_PER_CORE))
2351

2452
if len(self.df) == 0:
2553
logger.error("Dataframe empty. Is the CSV file ok?")
2654

2755
self.df = self.df.astype('object')
28-
2956
self.header = list(self.df.columns.values)
57+
self.validate()
3058

3159
def __len__(self):
3260

aperturedb/ConnectionDataCSV.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -53,32 +53,32 @@ class ConnectionDataCSV(CSVParser.CSVParser):
5353
5454
"""
5555

56-
def __init__(self, filename):
57-
58-
super().__init__(filename)
59-
60-
self.props_keys = [x for x in self.header[3:]
61-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
62-
63-
self.constraints_keys = [x for x in self.header[3:]
64-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
65-
66-
self.src_class = self.header[1].split("@")[0]
67-
self.src_key = self.header[1].split("@")[1]
68-
self.dst_class = self.header[2].split("@")[0]
69-
# Pandas appends a .n to the column name if there is a duplicate
70-
self.dst_key = self.header[2].split("@")[1].split(".")[0]
71-
self.command = "AddConnection"
56+
def __init__(self, filename, df=None, use_dask=False):
57+
super().__init__(filename, df=df, use_dask=use_dask)
58+
if not use_dask:
59+
self.props_keys = [x for x in self.header[3:]
60+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
61+
62+
self.constraints_keys = [x for x in self.header[3:]
63+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
64+
65+
self.src_class = self.header[1].split("@")[0]
66+
self.src_key = self.header[1].split("@")[1]
67+
self.dst_class = self.header[2].split("@")[0]
68+
# Pandas appends a .n to the column name if there is a duplicate
69+
self.dst_key = self.header[2].split("@")[1].split(".")[0]
70+
self.command = "AddConnection"
7271

7372
def getitem(self, idx):
73+
idx = self.df.index.start + idx
7474
src_value = self.df.loc[idx, self.header[1]]
7575
dst_value = self.df.loc[idx, self.header[2]]
7676
connection_class = self.df.loc[idx, CONNECTION_CLASS]
7777
q = []
7878

7979
try:
8080

81-
ref_src = (2 * idx) % 10000 + 1
81+
ref_src = (2 * idx) % 100000 + 1
8282
fe_a = {
8383
"FindEntity": {
8484
"_ref": ref_src,

aperturedb/DaskManager.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from __future__ import annotations
2+
import logging
3+
from threading import Lock
4+
import time
5+
from types import SimpleNamespace
6+
import dask
7+
from dask.distributed import Client, LocalCluster, progress
8+
from aperturedb.Connector import Connector
9+
10+
import multiprocessing as mp
11+
12+
from aperturedb.Stats import Stats
13+
14+
logger = logging.getLogger(__name__)
15+
16+
17+
class DaskManager:
18+
def __init__(self, num_workers: int = -1):
19+
self.__num_workers = num_workers
20+
21+
def run(self, db: Connector, generator, batchsize, stats):
22+
def process(df, host, port, session):
23+
metrics = Stats()
24+
# Dask reads data in partitions, and the first partition is of 2 rows, with all
25+
# values as 'foo'. This is for sampling the column names and types. Should not process
26+
# those rows.
27+
if len(df) == 2:
28+
if df.iloc[0, 0] == "foo":
29+
return
30+
count = 0
31+
try:
32+
shared_data = SimpleNamespace()
33+
shared_data.session = session
34+
shared_data.lock = Lock()
35+
db = Connector(host=host, port=port, shared_data=shared_data)
36+
except Exception as e:
37+
logger.exception(e)
38+
from aperturedb.ParallelLoader import ParallelLoader
39+
loader = ParallelLoader(db)
40+
for i in range(0, len(df), batchsize):
41+
end = min(i + batchsize, len(df))
42+
slice = df[i:end]
43+
data = generator.__class__(filename="", df=slice)
44+
loader.ingest(generator=data, batchsize=len(
45+
slice), numthreads=1, stats=False)
46+
count += 1
47+
metrics.times_arr.extend(loader.times_arr)
48+
metrics.error_counter += loader.error_counter
49+
50+
return metrics
51+
52+
# The -1 magic number means: use 90% of the cores (1 worker per core).
53+
# This can be overridden by the user.
54+
# Create a pool of workers.
55+
# TODO: see if the same pool can be reused for multiple tasks.
56+
workers = self.__num_workers if self.__num_workers != \
57+
-1 else int(0.9 * mp.cpu_count())
58+
with LocalCluster(n_workers=workers) as cluster, Client(cluster) as client:
59+
dask.config.set(scheduler="distributed")
60+
start_time = time.time()
61+
# Passing DB as an argument to function is not supported by Dask,
62+
# so we pass session and host/port instead.
63+
computation = generator.df.map_partitions(
64+
process,
65+
db.host,
66+
db.port,
67+
db.shared_data.session)
68+
computation = computation.persist()
69+
if stats:
70+
progress(computation)
71+
results = computation.compute()
72+
return results, time.time() - start_time

aperturedb/DescriptorDataCSV.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,21 +68,20 @@ class DescriptorDataCSV(CSVParser.CSVParser):
6868
6969
"""
7070

71-
def __init__(self, filename):
72-
73-
super().__init__(filename)
71+
def __init__(self, filename, df=None, use_dask=False):
7472

73+
super().__init__(filename, df=df, use_dask=use_dask)
7574
self.npy_arrays = {}
7675
self.has_label = False
77-
78-
self.props_keys = [x for x in self.header[3:]
79-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
80-
self.constraints_keys = [x for x in self.header[3:]
81-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
82-
self.command = "AddDescriptor"
76+
if not use_dask:
77+
self.props_keys = [x for x in self.header[3:]
78+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
79+
self.constraints_keys = [x for x in self.header[3:]
80+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
81+
self.command = "AddDescriptor"
8382

8483
def getitem(self, idx):
85-
84+
idx = self.df.index.start + idx
8685
filename = self.df.loc[idx, HEADER_PATH]
8786
index = self.df.loc[idx, HEADER_INDEX]
8887
desc_set = self.df.loc[idx, HEADER_SET]

aperturedb/DescriptorSetDataCSV.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,22 +45,22 @@ class DescriptorSetDataCSV(CSVParser.CSVParser):
4545
the distance would be L2.
4646
"""
4747

48-
def __init__(self, filename):
48+
def __init__(self, filename, df=None, use_dask=False):
4949

50-
super().__init__(filename)
51-
52-
self.props_keys = [x for x in self.header[4:]
53-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
54-
self.constraints_keys = [x for x in self.header[4:]
55-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
56-
self.command = "AddDescriptorSet"
50+
super().__init__(filename, df=df, use_dask=use_dask)
51+
if not use_dask:
52+
self.props_keys = [x for x in self.header[4:]
53+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
54+
self.constraints_keys = [x for x in self.header[4:]
55+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
56+
self.command = "AddDescriptorSet"
5757

5858
def getitem(self, idx):
5959

6060
# Metrics/Engine can be of the form:
6161
# "IP", or
6262
# ["IP" ...]
63-
63+
idx = self.df.index.start + idx
6464
metrics = self.df.loc[idx, HEADER_METRIC]
6565
metrics = metrics if "[" not in metrics else ast.literal_eval(metrics)
6666
engines = self.df.loc[idx, HEADER_ENGINE]

aperturedb/EntityDataCSV.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,17 @@ class EntityDataCSV(CSVParser.CSVParser):
3939
4040
"""
4141

42-
def __init__(self, filename):
43-
44-
super().__init__(filename)
45-
46-
self.props_keys = [x for x in self.header[1:]
47-
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
48-
self.constraints_keys = [x for x in self.header[1:]
49-
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
50-
self.command = "AddEntity"
42+
def __init__(self, filename, df=None, use_dask=False):
43+
super().__init__(filename, df=df, use_dask=use_dask)
44+
if not use_dask:
45+
self.props_keys = [x for x in self.header[1:]
46+
if not x.startswith(CSVParser.CONTRAINTS_PREFIX)]
47+
self.constraints_keys = [x for x in self.header[1:]
48+
if x.startswith(CSVParser.CONTRAINTS_PREFIX)]
49+
self.command = "AddEntity"
5150

5251
def getitem(self, idx):
52+
idx = self.df.index.start + idx
5353
eclass = self.df.loc[idx, ENTITY_CLASS]
5454
q = []
5555
ae = self._basic_command(idx,
@@ -61,8 +61,5 @@ def getitem(self, idx):
6161
return q, []
6262

6363
def validate(self):
64-
65-
self.header = list(self.df.columns.values)
66-
6764
if self.header[0] != ENTITY_CLASS:
6865
raise Exception("Error with CSV file field: " + ENTITY_CLASS)

0 commit comments

Comments
 (0)