Skip to content

Commit 20a7b67

Browse files
ghafekmboehm7
authored and committed
[SYSTEMDS-3862] Initial SSB Benchmark Implementation
Closes #2280.
1 parent 61bb660 commit 20a7b67

File tree

16 files changed

+5768
-0
lines changed

16 files changed

+5768
-0
lines changed

scripts/staging/ssb/README.md

Lines changed: 525 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#-------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
#-------------------------------------------------------------
21+
22+
/* DML script implementing the SSB (Star Schema Benchmark) query Q1.1 in SystemDS.
23+
SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
24+
FROM lineorder, dates
25+
WHERE
26+
lo_orderdate = d_datekey
27+
AND d_year = 1993
28+
AND lo_discount BETWEEN 1 AND 3
29+
AND lo_quantity < 25;
30+
31+
Usage:
32+
./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
33+
./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="./data"
34+
or with explicit -f flag:
35+
./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
36+
37+
Parameters:
38+
input_dir - Path to input directory containing the table files (e.g., ./data)
39+
*/
40+
# -- SOURCING THE RA-FUNCTIONS --
41+
source("./scripts/builtin/raSelection.dml") as raSel
42+
source("./scripts/builtin/raJoin.dml") as raJoin
43+
44+
# -- PARAMETER HANDLING --
# The data directory is taken from -nvargs input_dir=...; "./data" is used
# when the argument is not supplied.
input_dir = ifdef($input_dir, "./data");
print("Loading tables from directory: " + input_dir);

# -- READING INPUT FILES --
# Both tables are header-less, pipe-separated files that are read as frames.
date_csv = read(input_dir + "/date.tbl", format="csv", sep="|", header=FALSE, data_type="frame");
lineorder_csv = read(input_dir + "/lineorder.tbl", format="csv", sep="|", header=FALSE, data_type="frame");

# -- PREPARING --
# Project each table onto the columns this query needs and convert the
# projection to a numeric matrix in one step.
#   date      : COL-1 D_DATEKEY | COL-5 D_YEAR
date_matrix_min = as.matrix(cbind(date_csv[, 1], date_csv[, 5]));

#   lineorder : COL-6 LO_ORDERDATE | COL-9 LO_QUANTITY |
#               COL-10 LO_EXTPRICE | COL-12 LO_DISCOUNT
lineorder_matrix_min = as.matrix(cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]));
63+
64+
65+
# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
# Date side: D_YEAR = 1993 (column 2 of the reduced date matrix).
dates_1993 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1993);

# Lineorder side: LO_QUANTITY < 25 (column 2), followed by
# LO_DISCOUNT BETWEEN 1 AND 3 (column 4) expressed as two range bounds.
lo_selected = raSel::m_raSelection(lineorder_matrix_min, col=2, op="<", val=25);
lo_selected = raSel::m_raSelection(lo_selected, col=4, op=">=", val=1);
lo_selected = raSel::m_raSelection(lo_selected, col=4, op="<=", val=3);

# -- JOIN TABLES WITH RA-JOIN FUNCTION --
# LO_ORDERDATE (column 1 of the lineorder side) = D_DATEKEY (column 1 of the date side).
lo_date_joined = raJoin::m_raJoin(A=lo_selected, colA=1, B=dates_1993, colB=1, method="sort-merge");

# -- AGGREGATION --
# Column 3 of the joined matrix is LO_EXTPRICE and column 4 is LO_DISCOUNT.
revenue = sum(lo_date_joined[, 3] * lo_date_joined[, 4]);

print("REVENUE: " + as.integer(revenue));
88+
89+
#print("Q1.1 finished.\n");
90+
91+
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#-------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
#-------------------------------------------------------------
21+
22+
23+
/* DML script implementing the SSB (Star Schema Benchmark) query Q1.2 in SystemDS.
24+
SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
25+
FROM lineorder, dates
26+
WHERE
27+
lo_orderdate = d_datekey
28+
AND d_yearmonth = 'Jan1994'
29+
AND lo_discount BETWEEN 4 AND 6
30+
AND lo_quantity BETWEEN 26 AND 35;
31+
32+
Usage:
33+
./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
34+
./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="./data"
35+
or with explicit -f flag:
36+
./bin/systemds -f scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
37+
38+
Parameters:
39+
input_dir - Path to input directory containing the table files (e.g., ./data)
40+
*/
41+
42+
# -- SOURCING THE RA-FUNCTIONS --
43+
source("./scripts/builtin/raSelection.dml") as raSel
44+
source("./scripts/builtin/raJoin.dml") as raJoin
45+
46+
# -- PARAMETER HANDLING --
# The data directory is taken from -nvargs input_dir=...; "./data" is used
# when the argument is not supplied.
input_dir = ifdef($input_dir, "./data");
print("Loading tables from directory: " + input_dir);

# -- READING INPUT FILES --
# Both tables are header-less, pipe-separated files that are read as frames.
date_csv = read(input_dir + "/date.tbl", format="csv", sep="|", header=FALSE, data_type="frame");
lineorder_csv = read(input_dir + "/lineorder.tbl", format="csv", sep="|", header=FALSE, data_type="frame");
54+
55+
# -- PREPARING --
# Build the filtered date matrix for D_YEARMONTH = 'Jan1994'.
# Result `date_filtered`: column 1 = D_DATEKEY, column 2 = 1 for every kept row.
#
# D_YEARMONTH (COL-7) is a string column of the frame, so it is compared row
# by row. The frame is scanned only ONCE: the loop records a 0/1 indicator,
# and the matching rows are then extracted in a single vectorized step.
# This replaces the previous count-then-fill approach, which walked the frame twice.

# COL-1 : D_DATEKEY as numeric matrix
date_keys_matrix = as.matrix(date_csv[, 1]);

# 0/1 indicator per date row: 1 where D_YEARMONTH equals "Jan1994"
date_nrows = nrow(date_csv);
is_jan1994 = matrix(0, date_nrows, 1);
for (i in 1:date_nrows) {
  yearmonth_val = as.scalar(date_csv[i, 7]);
  if (yearmonth_val == "Jan1994") {
    is_jan1994[i, 1] = 1;
  }
}

# Keep exactly the rows flagged in the indicator. empty.return=FALSE yields a
# zero-row result when nothing matches, like the pre-allocated matrix did before.
date_filtered = removeEmpty(target=cbind(date_keys_matrix, is_jan1994), margin="rows", select=is_jan1994, empty.return=FALSE);
81+
82+
# Project lineorder onto the columns this query needs and convert to a matrix:
#   COL-6 LO_ORDERDATE | COL-9 LO_QUANTITY | COL-10 LO_EXTPRICE | COL-12 LO_DISCOUNT
lo_projected = as.matrix(cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]));

# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
# The date side is already restricted to D_YEARMONTH = 'Jan1994' (date_filtered),
# so only the lineorder predicates remain.

# LO_QUANTITY BETWEEN 26 AND 35 (column 2)
lo_selected = raSel::m_raSelection(lo_projected, col=2, op=">=", val=26);
lo_selected = raSel::m_raSelection(lo_selected, col=2, op="<=", val=35);

# LO_DISCOUNT BETWEEN 4 AND 6 (column 4)
lo_selected = raSel::m_raSelection(lo_selected, col=4, op=">=", val=4);
lo_selected = raSel::m_raSelection(lo_selected, col=4, op="<=", val=6);

# -- JOIN TABLES WITH RA-JOIN FUNCTION --
# LO_ORDERDATE (column 1 of the lineorder side) = D_DATEKEY (column 1 of the date side).
lo_date_joined = raJoin::m_raJoin(A=lo_selected, colA=1, B=date_filtered, colB=1, method="sort-merge");

# -- AGGREGATION --
# Column 3 of the joined matrix is LO_EXTPRICE and column 4 is LO_DISCOUNT.
revenue = sum(lo_date_joined[, 3] * lo_date_joined[, 4]);

print("REVENUE: " + as.integer(revenue));
113+
114+
#print("Q1.2 finished.\n");
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#-------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
#-------------------------------------------------------------
21+
22+
23+
/* DML script implementing the SSB (Star Schema Benchmark) query Q1.3 in SystemDS.
24+
SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
25+
FROM lineorder, dates
26+
WHERE
27+
lo_orderdate = d_datekey
28+
AND d_weeknuminyear = 6
29+
AND d_year = 1994
30+
AND lo_discount BETWEEN 5 AND 7
31+
AND lo_quantity BETWEEN 26 AND 35;
32+
33+
Usage:
34+
./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
35+
./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="./data"
36+
or with explicit -f flag:
37+
./bin/systemds -f scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
38+
39+
Parameters:
40+
input_dir - Path to input directory containing the table files (e.g., ./data)
41+
*/
42+
43+
44+
# -- SOURCING THE RA-FUNCTIONS --
45+
source("./scripts/builtin/raSelection.dml") as raSel
46+
source("./scripts/builtin/raJoin.dml") as raJoin
47+
48+
# -- PARAMETER HANDLING --
# The data directory is taken from -nvargs input_dir=...; "./data" is used
# when the argument is not supplied.
input_dir = ifdef($input_dir, "./data");

# -- READING INPUT FILES --
# Both tables are header-less, pipe-separated files that are read as frames.
date_csv = read(input_dir + "/date.tbl", format="csv", sep="|", header=FALSE, data_type="frame");
lineorder_csv = read(input_dir + "/lineorder.tbl", format="csv", sep="|", header=FALSE, data_type="frame");
55+
56+
# -- PREPARING --
# Build the filtered date matrix for D_YEAR = 1994 AND D_WEEKNUMINYEAR = 6.
# Result `date_filtered`: column 1 = D_DATEKEY, column 2 = 1 for every kept row.
#
# Both predicate columns are numeric, so the filter is evaluated with
# vectorized matrix operations instead of row-wise scalar loops over the
# date table (the previous version looped over all rows twice).

# Convert the needed date columns to numeric matrices
date_keys_matrix = as.matrix(date_csv[, 1]);      # COL-1  : D_DATEKEY
date_year_matrix = as.matrix(date_csv[, 5]);      # COL-5  : D_YEAR
date_weeknum_matrix = as.matrix(date_csv[, 12]);  # COL-12 : D_WEEKNUMINYEAR

# 0/1 indicator per date row; the product of the two 0/1 comparison results
# is 1 only where both conditions hold (logical AND).
date_match = (date_year_matrix == 1994) * (date_weeknum_matrix == 6);

# Keep exactly the rows flagged in the indicator. empty.return=FALSE yields a
# zero-row result when nothing matches, like the pre-allocated matrix did before.
date_filtered = removeEmpty(target=cbind(date_keys_matrix, date_match), margin="rows", select=date_match, empty.return=FALSE);
86+
87+
# Project lineorder onto the columns this query needs and convert to a matrix:
#   COL-6 LO_ORDERDATE | COL-9 LO_QUANTITY | COL-10 LO_EXTPRICE | COL-12 LO_DISCOUNT
lo_projected = as.matrix(cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]));

# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
# The date side is already restricted to D_YEAR = 1994 AND D_WEEKNUMINYEAR = 6
# (date_filtered), so only the lineorder predicates remain.

# LO_QUANTITY BETWEEN 26 AND 35 (column 2)
lo_selected = raSel::m_raSelection(lo_projected, col=2, op=">=", val=26);
lo_selected = raSel::m_raSelection(lo_selected, col=2, op="<=", val=35);

# LO_DISCOUNT BETWEEN 5 AND 7 (column 4)
lo_selected = raSel::m_raSelection(lo_selected, col=4, op=">=", val=5);
lo_selected = raSel::m_raSelection(lo_selected, col=4, op="<=", val=7);

# -- JOIN TABLES WITH RA-JOIN FUNCTION --
# LO_ORDERDATE (column 1 of the lineorder side) = D_DATEKEY (column 1 of the date side).
lo_date_joined = raJoin::m_raJoin(A=lo_selected, colA=1, B=date_filtered, colB=1, method="sort-merge");

# -- AGGREGATION --
# Column 3 of the joined matrix is LO_EXTPRICE and column 4 is LO_DISCOUNT.
revenue = sum(lo_date_joined[, 3] * lo_date_joined[, 4]);

print("REVENUE: " + as.integer(revenue));

0 commit comments

Comments
 (0)