Skip to content

Commit 8bed176

Browse files
maxrankl authored and mboehm7 committed
[SYSTEMDS-3859] Improved Relational Algebra Builtin Functions
Closes #2284.
1 parent 4eb1273 commit 8bed176

File tree

1 file changed

+199
-87
lines changed

1 file changed

+199
-87
lines changed

scripts/builtin/raGroupby.dml

Lines changed: 199 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -37,117 +37,229 @@
3737
m_raGroupby = function (Matrix[Double] X, Integer col, String method)
3838
return (Matrix[Double] Y)
3939
{
40-
if (method == "nested-loop") {
41-
# Extract and sort unique values from the specified column (1-based index)
42-
uniqueValues = unique(X[, col])
43-
order_uniqueValues = order(target = uniqueValues, by = 1);
40+
if (method == "nested-loop") {
41+
# Extract and sort unique group values from the specified column (1-based index)
42+
groupsUnique = unique(X[, col])
43+
groupsUniqueOrdered = order(target = groupsUnique, by = 1)
44+
numGroups = nrow(groupsUnique)
45+
maxRowsInGroup = max(table(X[,col],1));
4446

45-
# Calcute the number of groups
46-
numGroups = nrow(uniqueValues)
47+
# Define a zero output matrix, save the initial order of the groups, and sort increasingly
48+
Y = matrix(0, numGroups, maxRowsInGroup*(ncol(X) - 1) + 1)
49+
Y[,1] = groupsUnique
50+
indicesY = order(target = Y, by = 1, index.return = TRUE)
51+
Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE)
4752

48-
# Determine the maximum number of rows in any group
49-
maxRowsInGroup = max(table(X[,col],1));
53+
# Order the input matrix by the grouping column
54+
indicesX = order(target = X, by = col, index.return = TRUE)
55+
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
56+
57+
currentGroupX = 1
58+
currentGroupY = 1
59+
i = 1
60+
61+
# Iterate over the input matrix
62+
while (numGroups > 0) {
63+
currentGroup = as.scalar(Y[currentGroupX,1])
64+
nRowsToCopy = 0
65+
66+
# Find the rows for the current group
67+
group = 1
68+
while (group > 0) {
69+
# Break if there are no more rows left in X
70+
if (i > nrow(X)) {
71+
group = 0
72+
}
73+
# Check if the row belongs to the current group
74+
else if (as.scalar(X[i, col]) == currentGroup) {
75+
nRowsToCopy = nRowsToCopy + 1
76+
i = i + 1
77+
}
78+
# Break if the row does not belong to the current group
79+
else {
80+
group = 0
81+
}
82+
}
83+
84+
# Copy the values into the output matrix
85+
if (nRowsToCopy > 0) {
86+
nRowsCurrentGroup = currentGroupY + nRowsToCopy - 1
87+
88+
# 1. Grouping column is the first column
89+
if (col == 1) {
90+
newMatrix = X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)]
91+
}
92+
# 2. Grouping column is the last column
93+
else if (col == ncol(X)) {
94+
newMatrix = X [currentGroupY:nRowsCurrentGroup, 1:col-1]
95+
}
96+
# 3. Grouping column has an intermediate position
97+
else {
98+
newMatrix = cbind(X[currentGroupY:nRowsCurrentGroup, 1:(col-1)], X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)])
99+
}
100+
101+
# Flatten the new row
102+
newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix))
103+
newRowColIdx = nRowsToCopy * (ncol(X)-1)
50104

51-
# Define a zero matrix to put the group data into
52-
Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1)
53-
54-
# Put the ordered uniqueValues into first column of Y as group_id
55-
#Y[,1] = order_uniqueValues
56-
Y[,1] = uniqueValues
57-
58-
# Loop for each group
59-
for(i in 1:numGroups){
60-
index = 0
61-
62-
# Iterate each row in matrix X to deal with group data
63-
for ( j in 1:nrow(X) ) {
64-
if ( as.scalar( X[j,col] == uniqueValues[i,1] )) {
65-
# Define the formula of the start and end column position
66-
startCol = index*(ncol(X)-1) +2
67-
endCol = startCol + (ncol(X)-2)
68-
69-
if (col == 1) {
70-
# Case when the selected column is the first column
71-
Y[i,startCol:endCol] = X[j,2:ncol(X)]
72-
}
73-
else if (col == ncol(X)) {
74-
# Case when the selected column is the last column
75-
Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)]
76-
}
77-
else {
78-
# General case
79-
newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)])
80-
Y[i,startCol:endCol] = newRow
81-
}
82-
index = index +1
105+
# Add the new row into Y at the current group
106+
Y[currentGroupX, 2: (newRowColIdx + 1)] = newRow
83107
}
84-
}
108+
109+
# Continue with the next group
110+
currentGroupX = currentGroupX + 1
111+
currentGroupY = currentGroupY + nRowsToCopy
112+
numGroups = numGroups - 1
85113
}
114+
115+
# Restore the initial order of X
116+
X = cbind(X, indicesX)
117+
nColX = ncol(X)
118+
X = order(target = X, by= nColX)
119+
X = X[, 1:nColX-1]
120+
121+
# Restore the initial order of Y
122+
Y = cbind(Y, indicesY)
123+
nColY = ncol(Y)
124+
Y = order(target = Y, by= nColY)
125+
Y = Y[, 1:nColY-1]
86126
}
127+
87128
else if (method == "permutation-matrix") {
88129
# Extract the grouping column and create unique groups
89130
key = X[,col]
90-
key_unique = unique(X[, col])
91-
numGroups = nrow(key_unique)
131+
keyUnique = unique(X[, col])
132+
numGroups = nrow(keyUnique)
133+
maxRowsInGroup = max(table(X[,col],1))
92134

93-
# Matrix for comparison
94-
key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X))
95-
key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key)
135+
# Calculate the frequency of each group
136+
freqPerKey = table(key, 1)
137+
freqPerKey = removeEmpty(target = freqPerKey, margin = "rows")
138+
freqPerKeyIndices = order(target = keyUnique, by = 1, index.return = TRUE)
96139

97-
# Find group index
98-
groupIndex = rowIndexMax(t(key_compare == key_matrix))
140+
# Match the length of freqPerKey to keyUnique and sort it accordingly
141+
freqPerKey = cbind(freqPerKey, freqPerKeyIndices)
142+
nColFpk = ncol(freqPerKey)
143+
freqPerKey = order(target = freqPerKey, by= nColFpk)
144+
freqPerKey = freqPerKey[, 1:nColFpk-1]
145+
freqPerKey = t(freqPerKey)
99146

100-
# Determine the maximum number of rows in any group
101-
maxRowsInGroup = max(table(X[,col],1))
102-
totalCells = (maxRowsInGroup) * (ncol(X)-1) +1
147+
# Find the group with the most values
148+
groupMaxVal = maxRowsInGroup*(ncol(X)-1)+1
149+
groupMaxValKey = max(freqPerKey)
150+
151+
# Calculate the amount of rows that need padding and the amount of padding per key
152+
groupMaxValKeySeq = matrix(groupMaxValKey, nrow(freqPerKey), ncol(freqPerKey))
153+
missingPadding = groupMaxValKeySeq - freqPerKey
154+
amountOfZeroRows = sum(missingPadding)
155+
156+
# 1. Padding is required
157+
if (amountOfZeroRows > 0) {
158+
missingPadding = t(missingPadding)
103159

104-
# Create permutation matrix P copy relevant tuples with a single matrix multiplication
105-
P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup)
106-
# Create offsets to store the first column of each group
107-
offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1)
160+
# Remove the keys that dont need padding
161+
removeMask = (missingPadding != 0)
162+
missingPadding = cbind(keyUnique, missingPadding)
163+
missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removeMask)
108164

109-
# Create row and column index for the permutation matrix
110-
rowIndex = seq(1, nrow(X))
111-
indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X))))
112-
selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex)
113-
colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix)
165+
# Keys that need padding and padding length per group
166+
keysPadding = missingPadding[,1]
167+
missingPadding = missingPadding[,2]
168+
repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1)
114169

115-
# Set values in P
116-
P = table(seq(1, nrow(X)), colIndex)
170+
# Generate the repeating keys
171+
repeatKeysIdxS = 1
117172

118-
# Perform matrix multiplication
119-
Y_temp = t(P) %*% X
173+
for (i in 1:nrow(missingPadding)) {
174+
repeat_count = as.scalar(missingPadding[i,1])
175+
if (repeat_count > 0) {
176+
temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1)
177+
repeatKeysIdxE = repeatKeysIdxS + repeat_count - 1
178+
repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp
179+
repeatKeysIdxS = repeatKeysIdxE + 1
180+
}
181+
}
182+
183+
# Combine the keys that need padding with the actual padding
184+
padding = matrix(0, rows = nrow(repeatKeys), cols = 1)
185+
padding = cbind(repeatKeys, padding)
186+
187+
# Extend the existing keys to a second column to match the padded keys
188+
key = key %*% matrix(1, rows = 1, cols = 2)
189+
190+
# Combine the keys with the padded keys and sort them increasingly
191+
tempY = rbind(key, padding)
192+
tempY = order(target = tempY, by = 1, decreasing = FALSE, index.return = FALSE)
193+
194+
# Remove the padded rows and save the Indices of the combined keys for the permutation matrix
195+
paddedRows = tempY[, 2]
196+
tempIndicesY = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE)
197+
tempIndicesY = removeEmpty(target = tempIndicesY, margin = "rows", select = (paddedRows!=0))
120198

121-
# Remove the selected column from Y_temp
122-
if( col == 1 ) {
123-
Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)]
199+
# Create the permutation matrix by using the Indices of the combined keys
200+
P = table(seq(1, nrow(X)), tempIndicesY)
201+
202+
# Order the initial matrix to match the sorted keys with padding
203+
indicesX = order(target = X, by = col, index.return = TRUE)
204+
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
205+
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
206+
207+
# Perform the matrix multiplication
208+
tempY = t(P) %*% X
124209
}
125-
else if( col == ncol(X) ) {
126-
Y_temp_reduce = Y_temp[, 1:col-1]
210+
211+
# 2. Padding is not required
212+
else {
213+
tempY = X
214+
tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE)
127215
}
128-
else{
129-
Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)])
216+
217+
# Remove the selected column from tempY
218+
if (col == 1) {
219+
tempY = tempY[, col+1:ncol(tempY)]
220+
}
221+
else if (col == ncol(X)) {
222+
tempY = tempY[, 1:col-1]
223+
}
224+
else {
225+
tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)])
130226
}
131227

132-
# Set value of final output
133-
Y = matrix(0, rows=numGroups, cols=totalCells)
134-
Y[,1] = key_unique
228+
# Set the value of the final output
229+
Y = matrix(0, rows=numGroups, cols=groupMaxVal)
230+
Y[,1] = keyUnique
135231

136-
# The permutation matrix creates a structure where each group's data
137-
# may not fill exactly maxRowsInGroup rows.
138-
# If needed, we need to pad to the expected size first.
232+
# Each group's data may not fill exactly maxRowsInGroup rows
233+
# If needed, we need to pad to the expected size first
139234
expectedRows = numGroups * maxRowsInGroup
140-
actualRows = nrow(Y_temp_reduce)
141-
142-
if(actualRows < expectedRows) {
143-
# Pad Y_temp_reduce with zeros to match expected structure
144-
Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp_reduce))
145-
Y_tmp_padded[1:actualRows,] = Y_temp_reduce
146-
} else {
147-
Y_tmp_padded = Y_temp_reduce
235+
actualRows = nrow(tempY)
236+
237+
if (actualRows < expectedRows) {
238+
# Pad tempY with zeros to match expected structure
239+
tempYPadded = matrix(0, rows=expectedRows, cols=ncol(tempY))
240+
tempYPadded[1:actualRows,] = tempY
241+
}
242+
else {
243+
tempYPadded = tempY
148244
}
149245

150-
Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1)
246+
# Save the initial order of the groups in Y and order Y to match the sorted tempYPadded
247+
indicesY = order(target = Y, by = 1, index.return = TRUE)
248+
Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE)
249+
250+
# Copy the values into Y
251+
Y[,2:ncol(Y)] = matrix(tempYPadded, rows=numGroups, cols=groupMaxVal-1)
252+
253+
# Restore the initial order of X
254+
X = cbind(X, indicesX)
255+
nColX = ncol(X)
256+
X = order(target = X, by= nColX)
257+
X = X[, 1:nColX-1]
258+
259+
# Restore the initial order of Y
260+
Y = cbind(Y, indicesY)
261+
nColY = ncol(Y)
262+
Y = order(target = Y, by= nColY)
263+
Y = Y[, 1:nColY-1]
151264
}
152265
}
153-

0 commit comments

Comments
 (0)