Skip to content

Commit 5f360ef

Browse files
Nakroma authored and Baunsgaard committed
[SYSTEMDS-3548] load python parallel
This commit:
- fixes the load_numpy string performance test case. It keeps the CLI usage consistent with the other test cases, but converts the dtype to the correct one internally.
- fixes the boolean array conversion, which broke for row counts above 64. It also adds a bit more error handling to prevent cases like this in the future.
- parallelizes the column processing in the pandas DataFrame to FrameBlock conversion.
- moves the assignment of column data to the FrameBlock into the parallel column processing.

Closes #2154
1 parent c504549 commit 5f360ef

File tree

5 files changed

+76
-49
lines changed

5 files changed

+76
-49
lines changed

scripts/perftest/python/io/load_native.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,30 +21,30 @@
2121

2222
import argparse
2323
import timeit
24+
from systemds.context import SystemDSContext
2425

2526

2627
setup = "\n".join(
2728
[
28-
"from systemds.context import SystemDSContext",
2929
"from systemds.script_building.script import DMLScript",
3030
]
3131
)
3232

3333

3434
run = "\n".join(
3535
[
36-
"with SystemDSContext(logging_level=10, py4j_logging_level=50) as ctx:",
37-
" node = ctx.read(src)",
38-
" script = DMLScript(ctx)",
39-
" script.build_code(node)",
40-
" script.execute()",
36+
"node = ctx.read(src)",
37+
"script = DMLScript(ctx)",
38+
"script.build_code(node)",
39+
"script.execute()",
4140
]
4241
)
4342

4443

4544
def main(args):
46-
gvars = {"src": args.src}
45+
gvars = {"src": args.src, "ctx": SystemDSContext(logging_level=10, py4j_logging_level=50)}
4746
print(timeit.timeit(run, setup, globals=gvars, number=args.number))
47+
gvars["ctx"].close()
4848

4949

5050
if __name__ == "__main__":

scripts/perftest/python/io/load_numpy.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,10 @@
2222

2323
import argparse
2424
import timeit
25+
from systemds.context import SystemDSContext
2526

2627
setup = "\n".join(
2728
[
28-
"from systemds.context import SystemDSContext",
2929
"from systemds.script_building.script import DMLScript",
3030
"import numpy as np",
3131
"array = np.loadtxt(src, delimiter=',')",
@@ -37,11 +37,10 @@
3737

3838
run = "\n".join(
3939
[
40-
"with SystemDSContext(logging_level=10, py4j_logging_level=50) as ctx:",
41-
" matrix_from_np = ctx.from_numpy(array)",
42-
" script = DMLScript(ctx)",
43-
" script.add_input_from_python('test', matrix_from_np)",
44-
" script.execute()",
40+
"matrix_from_np = ctx.from_numpy(array)",
41+
"script = DMLScript(ctx)",
42+
"script.add_input_from_python('test', matrix_from_np)",
43+
"script.execute()",
4544
]
4645
)
4746

@@ -66,8 +65,9 @@
6665

6766

6867
def main(args):
69-
gvars = {"src": args.src, "dtype": args.dtype}
68+
gvars = {"src": args.src, "dtype": args.dtype, "ctx": SystemDSContext(logging_level=10, py4j_logging_level=50)}
7069
print(timeit.timeit(run, setup, globals=gvars, number=args.number))
70+
gvars["ctx"].close()
7171

7272

7373
if __name__ == "__main__":
@@ -86,4 +86,8 @@ def main(args):
8686
help=help_force_dtype,
8787
)
8888
args = parser.parse_args()
89+
90+
if args.dtype == "string": # numpy has no "string" dtype, convert to "str"
91+
args.dtype = "str"
92+
8993
main(args)

scripts/perftest/python/io/load_pandas.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,10 @@
2121

2222
import argparse
2323
import timeit
24+
from systemds.context import SystemDSContext
2425

2526
setup = "\n".join(
2627
[
27-
"from systemds.context import SystemDSContext",
2828
"from systemds.script_building.script import DMLScript",
2929
"import pandas as pd",
3030
"df = pd.read_csv(src, header=None)",
@@ -36,11 +36,10 @@
3636

3737
run = "\n".join(
3838
[
39-
"with SystemDSContext(logging_level=10, py4j_logging_level=50) as ctx:",
40-
" frame_from_pandas = ctx.from_pandas(df)",
41-
" script = DMLScript(ctx)",
42-
" script.add_input_from_python('test', frame_from_pandas)",
43-
" script.execute()",
39+
"frame_from_pandas = ctx.from_pandas(df)",
40+
"script = DMLScript(ctx)",
41+
"script.add_input_from_python('test', frame_from_pandas)",
42+
"script.execute()",
4443
]
4544
)
4645

@@ -64,8 +63,9 @@
6463

6564

6665
def main(args):
67-
gvars = {"src": args.src, "dtype": args.dtype}
66+
gvars = {"src": args.src, "dtype": args.dtype, "ctx": SystemDSContext(logging_level=10, py4j_logging_level=50)}
6867
print(timeit.timeit(run, setup, globals=gvars, number=args.number))
68+
gvars["ctx"].close()
6969

7070

7171
if __name__ == "__main__":

src/main/java/org/apache/sysds/runtime/util/Py4jConverterUtils.java

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import org.apache.sysds.runtime.DMLRuntimeException;
2828
import org.apache.sysds.runtime.frame.data.columns.Array;
2929
import org.apache.sysds.runtime.frame.data.columns.ArrayFactory;
30+
import org.apache.sysds.runtime.frame.data.columns.BitSetArray;
3031
import org.apache.sysds.runtime.frame.data.columns.BooleanArray;
3132
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
3233

@@ -157,7 +158,13 @@ public static Array<?> convert(byte[] data, int numElements, Types.ValueType val
157158
break;
158159
case BOOLEAN:
159160
for(int i = 0; i < numElements; i++) {
160-
((BooleanArray) array).set(i, buffer.get() != 0);
161+
if (array instanceof BooleanArray) {
162+
((BooleanArray) array).set(i, buffer.get() != 0);
163+
} else if (array instanceof BitSetArray) {
164+
((BitSetArray) array).set(i, buffer.get() != 0);
165+
} else {
166+
throw new DMLRuntimeException("Array factory returned invalid array type for boolean values.");
167+
}
161168
}
162169
break;
163170
case STRING:

src/main/python/systemds/utils/converters.py

Lines changed: 43 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323

2424
import numpy as np
2525
import pandas as pd
26+
import concurrent.futures
2627
from py4j.java_gateway import JavaClass, JavaGateway, JavaObject, JVMView
2728

2829

@@ -81,6 +82,33 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject):
8182
)
8283

8384

85+
def convert_column(jvm, rows, j, col_type, pd_col, fb, col_name):
    """Serialize one pandas column and store it in the given FrameBlock.

    For a STRING column every value is cast to ``str``, UTF-8 encoded and
    written as a 4-byte big-endian unsigned length followed by the encoded
    bytes. For any other value type, missing values are replaced with ``""``
    and the raw bytes of the resulting numpy array are used. The byte buffer
    is passed to ``Py4jConverterUtils.convert`` and the result is set as
    column ``j`` of ``fb``, together with the column name.

    :param jvm: The JVMView of the current SystemDS context.
    :param rows: The number of rows in the pandas DataFrame.
    :param j: The current column index.
    :param col_type: The ValueType of the column.
    :param pd_col: The pandas column to convert.
    :param fb: The FrameBlock that receives the column name and data.
    :param col_name: The column name; stored as ``str(col_name)``.
    """
    string_type = jvm.org.apache.sysds.common.Types.ValueType.STRING

    if col_type == string_type:
        # Length-prefixed encoding: ">I" is a big-endian unsigned 32-bit int.
        payload = bytearray()
        for text in pd_col.astype(str):
            raw = text.encode("utf-8")
            payload += struct.pack(">I", len(raw))
            payload += raw
    else:
        # Non-string columns are shipped as the numpy buffer's raw bytes.
        filled = pd_col.fillna("")
        payload = bytearray(filled.to_numpy().tobytes())

    converter = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils
    java_array = converter.convert(payload, rows, col_type)

    fb.setColumnName(j, str(col_name))
    fb.setColumn(j, java_array)
111+
84112
def pandas_to_frame_block(sds, pd_df: pd.DataFrame):
85113
"""Converts a given pandas DataFrame to an internal FrameBlock representation.
86114
@@ -120,49 +148,37 @@ def pandas_to_frame_block(sds, pd_df: pd.DataFrame):
120148
jc_String = jvm.java.lang.String
121149
jc_FrameBlock = jvm.org.apache.sysds.runtime.frame.data.FrameBlock
122150
j_valueTypeArray = java_gate.new_array(jc_ValueType, len(schema))
123-
j_colNameArray = java_gate.new_array(jc_String, len(col_names))
124151

125152
# execution speed increases with optimized code when the number of rows exceeds 4
126153
if rows > 4:
127154
for i in range(len(schema)):
128155
j_valueTypeArray[i] = schema[i]
129-
for i in range(len(col_names)):
130-
j_colNameArray[i] = str(col_names[i])
131156

132-
fb = jc_FrameBlock(j_valueTypeArray, j_colNameArray, rows)
157+
fb = jc_FrameBlock(j_valueTypeArray, rows)
133158

134-
# convert and set data for each column
135-
for j, col_name in enumerate(col_names):
136-
col_type = schema[j]
137-
if col_type == jvm.org.apache.sysds.common.Types.ValueType.STRING:
138-
byte_data = bytearray()
139-
for value in pd_df[col_name].astype(str):
140-
encoded_value = value.encode("utf-8")
141-
byte_data.extend(struct.pack(">I", len(encoded_value)))
142-
byte_data.extend(encoded_value)
143-
else:
144-
col_data = pd_df[col_name].fillna("").to_numpy()
145-
byte_data = bytearray(col_data.tobytes())
146-
147-
converted_array = (
148-
jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convert(
149-
byte_data, rows, col_type
150-
)
159+
with concurrent.futures.ThreadPoolExecutor() as executor:
160+
executor.map(
161+
lambda j, col_name: convert_column(
162+
jvm, rows, j, schema[j], pd_df[col_name], fb, col_name
163+
),
164+
range(len(col_names)),
165+
col_names,
151166
)
152-
fb.setColumn(j, converted_array)
167+
153168
return fb
154169
else:
155170
j_dataArray = java_gate.new_array(jc_String, rows, cols)
156-
for i in range(len(schema)):
157-
j_valueTypeArray[i] = schema[i]
158-
for i in range(len(col_names)):
159-
j_colNameArray[i] = str(col_names[i])
160-
j = 0
171+
j_colNameArray = java_gate.new_array(jc_String, len(col_names))
172+
161173
for j, col_name in enumerate(col_names):
174+
j_valueTypeArray[j] = schema[j]
175+
j_colNameArray[j] = str(col_names[j])
162176
col_data = pd_df[col_name].fillna("").to_numpy(dtype=str)
177+
163178
for i in range(col_data.shape[0]):
164179
if col_data[i]:
165180
j_dataArray[i][j] = col_data[i]
181+
166182
fb = jc_FrameBlock(j_valueTypeArray, j_colNameArray, j_dataArray)
167183
return fb
168184

0 commit comments

Comments
 (0)