|
37 | 37 | m_raGroupby = function (Matrix[Double] X, Integer col, String method) |
38 | 38 | return (Matrix[Double] Y) |
39 | 39 | { |
40 | | - if (method == "nested-loop") { |
41 | | - # Extract and sort unique values from the specified column (1-based index) |
42 | | - uniqueValues = unique(X[, col]) |
43 | | - order_uniqueValues = order(target = uniqueValues, by = 1); |
| 40 | + if (method == "nested-loop") { |
| 41 | + # Extract and sort unique group values from the specified column (1-based index) |
| 42 | + groupsUnique = unique(X[, col]) |
| 43 | + groupsUniqueOrdered = order(target = groupsUnique, by = 1) |
| 44 | + numGroups = nrow(groupsUnique) |
| 45 | + maxRowsInGroup = max(table(X[,col],1)); |
44 | 46 |
|
45 | | - # Calcute the number of groups |
46 | | - numGroups = nrow(uniqueValues) |
| 47 | + # Define a zero output matrix, save the initial order of the groups, and sort the groups in ascending order |
| 48 | + Y = matrix(0, numGroups, maxRowsInGroup*(ncol(X) - 1) + 1) |
| 49 | + Y[,1] = groupsUnique |
| 50 | + indicesY = order(target = Y, by = 1, index.return = TRUE) |
| 51 | + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) |
47 | 52 |
|
48 | | - # Determine the maximum number of rows in any group |
49 | | - maxRowsInGroup = max(table(X[,col],1)); |
| 53 | + # Order the input matrix by the grouping column |
| 54 | + indicesX = order(target = X, by = col, index.return = TRUE) |
| 55 | + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) |
| 56 | + |
| 57 | + currentGroupX = 1 |
| 58 | + currentGroupY = 1 |
| 59 | + i = 1 |
| 60 | + |
| 61 | + # Iterate over the groups in sorted order; i walks the rows of the sorted input matrix |
| 62 | + while (numGroups > 0) { |
| 63 | + currentGroup = as.scalar(Y[currentGroupX,1]) |
| 64 | + nRowsToCopy = 0 |
| 65 | + |
| 66 | + # Find the rows for the current group |
| 67 | + group = 1 |
| 68 | + while (group > 0) { |
| 69 | + # Break if there are no more rows left in X |
| 70 | + if (i > nrow(X)) { |
| 71 | + group = 0 |
| 72 | + } |
| 73 | + # Check if the row belongs to the current group |
| 74 | + else if (as.scalar(X[i, col]) == currentGroup) { |
| 75 | + nRowsToCopy = nRowsToCopy + 1 |
| 76 | + i = i + 1 |
| 77 | + } |
| 78 | + # Break if the row does not belong to the current group |
| 79 | + else { |
| 80 | + group = 0 |
| 81 | + } |
| 82 | + } |
| 83 | + |
| 84 | + # Copy the values into the output matrix |
| 85 | + if (nRowsToCopy > 0) { |
| 86 | + nRowsCurrentGroup = currentGroupY + nRowsToCopy - 1 |
| 87 | + |
| 88 | + # 1. Grouping column is the first column |
| 89 | + if (col == 1) { |
| 90 | + newMatrix = X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)] |
| 91 | + } |
| 92 | + # 2. Grouping column is the last column |
| 93 | + else if (col == ncol(X)) { |
| 94 | + newMatrix = X [currentGroupY:nRowsCurrentGroup, 1:col-1] |
| 95 | + } |
| 96 | + # 3. Grouping column has an intermediate position |
| 97 | + else { |
| 98 | + newMatrix = cbind(X[currentGroupY:nRowsCurrentGroup, 1:(col-1)], X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)]) |
| 99 | + } |
| 100 | + |
| 101 | + # Flatten the rows of the current group into a single row |
| 102 | + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) |
| 103 | + newRowColIdx = nRowsToCopy * (ncol(X)-1) |
50 | 104 |
|
51 | | - # Define a zero matrix to put the group data into |
52 | | - Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) |
53 | | - |
54 | | - # Put the ordered uniqueValues into first column of Y as group_id |
55 | | - #Y[,1] = order_uniqueValues |
56 | | - Y[,1] = uniqueValues |
57 | | - |
58 | | - # Loop for each group |
59 | | - for(i in 1:numGroups){ |
60 | | - index = 0 |
61 | | - |
62 | | - # Iterate each row in matrix X to deal with group data |
63 | | - for ( j in 1:nrow(X) ) { |
64 | | - if ( as.scalar( X[j,col] == uniqueValues[i,1] )) { |
65 | | - # Define the formula of the start and end column position |
66 | | - startCol = index*(ncol(X)-1) +2 |
67 | | - endCol = startCol + (ncol(X)-2) |
68 | | - |
69 | | - if (col == 1) { |
70 | | - # Case when the selected column is the first column |
71 | | - Y[i,startCol:endCol] = X[j,2:ncol(X)] |
72 | | - } |
73 | | - else if (col == ncol(X)) { |
74 | | - # Case when the selected column is the last column |
75 | | - Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)] |
76 | | - } |
77 | | - else { |
78 | | - # General case |
79 | | - newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)]) |
80 | | - Y[i,startCol:endCol] = newRow |
81 | | - } |
82 | | - index = index +1 |
| 105 | + # Add the new row into Y at the current group |
| 106 | + Y[currentGroupX, 2: (newRowColIdx + 1)] = newRow |
83 | 107 | } |
84 | | - } |
| 108 | + |
| 109 | + # Continue with the next group |
| 110 | + currentGroupX = currentGroupX + 1 |
| 111 | + currentGroupY = currentGroupY + nRowsToCopy |
| 112 | + numGroups = numGroups - 1 |
85 | 113 | } |
| 114 | + |
| 115 | + # Restore the initial order of X |
| 116 | + X = cbind(X, indicesX) |
| 117 | + nColX = ncol(X) |
| 118 | + X = order(target = X, by= nColX) |
| 119 | + X = X[, 1:nColX-1] |
| 120 | + |
| 121 | + # Restore the initial order of Y |
| 122 | + Y = cbind(Y, indicesY) |
| 123 | + nColY = ncol(Y) |
| 124 | + Y = order(target = Y, by= nColY) |
| 125 | + Y = Y[, 1:nColY-1] |
86 | 126 | } |
| 127 | + |
87 | 128 | else if (method == "permutation-matrix") { |
88 | 129 | # Extract the grouping column and create unique groups |
89 | 130 | key = X[,col] |
90 | | - key_unique = unique(X[, col]) |
91 | | - numGroups = nrow(key_unique) |
| 131 | + keyUnique = unique(X[, col]) |
| 132 | + numGroups = nrow(keyUnique) |
| 133 | + maxRowsInGroup = max(table(X[,col],1)) |
92 | 134 |
|
93 | | - # Matrix for comparison |
94 | | - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) |
95 | | - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) |
| 135 | + # Calculate the frequency of each group |
| 136 | + freqPerKey = table(key, 1) |
| 137 | + freqPerKey = removeEmpty(target = freqPerKey, margin = "rows") |
| 138 | + freqPerKeyIndices = order(target = keyUnique, by = 1, index.return = TRUE) |
96 | 139 |
|
97 | | - # Find group index |
98 | | - groupIndex = rowIndexMax(t(key_compare == key_matrix)) |
| 140 | + # Match the length of freqPerKey to keyUnique and sort it accordingly |
| 141 | + freqPerKey = cbind(freqPerKey, freqPerKeyIndices) |
| 142 | + nColFpk = ncol(freqPerKey) |
| 143 | + freqPerKey = order(target = freqPerKey, by= nColFpk) |
| 144 | + freqPerKey = freqPerKey[, 1:nColFpk-1] |
| 145 | + freqPerKey = t(freqPerKey) |
99 | 146 |
|
100 | | - # Determine the maximum number of rows in any group |
101 | | - maxRowsInGroup = max(table(X[,col],1)) |
102 | | - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 |
| 147 | + # Compute the width of the output matrix and the size of the largest group |
| 148 | + groupMaxVal = maxRowsInGroup*(ncol(X)-1)+1 |
| 149 | + groupMaxValKey = max(freqPerKey) |
| 150 | + |
| 151 | + # Calculate the number of padding rows required per key and in total |
| 152 | + groupMaxValKeySeq = matrix(groupMaxValKey, nrow(freqPerKey), ncol(freqPerKey)) |
| 153 | + missingPadding = groupMaxValKeySeq - freqPerKey |
| 154 | + amountOfZeroRows = sum(missingPadding) |
| 155 | + |
| 156 | + # 1. Padding is required |
| 157 | + if (amountOfZeroRows > 0) { |
| 158 | + missingPadding = t(missingPadding) |
103 | 159 |
|
104 | | - # Create permutation matrix P copy relevant tuples with a single matrix multiplication |
105 | | - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) |
106 | | - # Create offsets to store the first column of each group |
107 | | - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) |
| 160 | + # Remove the keys that don't need padding |
| 161 | + removeMask = (missingPadding != 0) |
| 162 | + missingPadding = cbind(keyUnique, missingPadding) |
| 163 | + missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removeMask) |
108 | 164 |
|
109 | | - # Create row and column index for the permutation matrix |
110 | | - rowIndex = seq(1, nrow(X)) |
111 | | - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) |
112 | | - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) |
113 | | - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) |
| 165 | + # Keys that need padding and padding length per group |
| 166 | + keysPadding = missingPadding[,1] |
| 167 | + missingPadding = missingPadding[,2] |
| 168 | + repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1) |
114 | 169 |
|
115 | | - # Set values in P |
116 | | - P = table(seq(1, nrow(X)), colIndex) |
| 170 | + # Generate the repeating keys |
| 171 | + repeatKeysIdxS = 1 |
117 | 172 |
|
118 | | - # Perform matrix multiplication |
119 | | - Y_temp = t(P) %*% X |
| 173 | + for (i in 1:nrow(missingPadding)) { |
| 174 | + repeat_count = as.scalar(missingPadding[i,1]) |
| 175 | + if (repeat_count > 0) { |
| 176 | + temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1) |
| 177 | + repeatKeysIdxE = repeatKeysIdxS + repeat_count - 1 |
| 178 | + repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp |
| 179 | + repeatKeysIdxS = repeatKeysIdxE + 1 |
| 180 | + } |
| 181 | + } |
| 182 | + |
| 183 | + # Combine the keys that need padding with the actual padding |
| 184 | + padding = matrix(0, rows = nrow(repeatKeys), cols = 1) |
| 185 | + padding = cbind(repeatKeys, padding) |
| 186 | + |
| 187 | + # Extend the existing keys to a second column to match the padded keys |
| 188 | + key = key %*% matrix(1, rows = 1, cols = 2) |
| 189 | + |
| 190 | + # Combine the keys with the padded keys and sort them in ascending order |
| 191 | + tempY = rbind(key, padding) |
| 192 | + tempY = order(target = tempY, by = 1, decreasing = FALSE, index.return = FALSE) |
| 193 | + |
| 194 | + # Remove the padded rows and save the indices of the combined keys for the permutation matrix |
| 195 | + paddedRows = tempY[, 2] |
| 196 | + tempIndicesY = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE) |
| 197 | + tempIndicesY = removeEmpty(target = tempIndicesY, margin = "rows", select = (paddedRows!=0)) |
120 | 198 |
|
121 | | - # Remove the selected column from Y_temp |
122 | | - if( col == 1 ) { |
123 | | - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] |
| 199 | + # Create the permutation matrix by using the indices of the combined keys |
| 200 | + P = table(seq(1, nrow(X)), tempIndicesY) |
| 201 | + |
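| 202 | + # Order the initial matrix to match the sorted keys with padding (NOTE(review): the sort of X below is issued twice with identical arguments; the second call looks redundant) |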
| 203 | + indicesX = order(target = X, by = col, index.return = TRUE) |
| 204 | + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) |
| 205 | + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) |
| 206 | + |
| 207 | + # Perform the matrix multiplication |
| 208 | + tempY = t(P) %*% X |
124 | 209 | } |
125 | | - else if( col == ncol(X) ) { |
126 | | - Y_temp_reduce = Y_temp[, 1:col-1] |
| 210 | + |
| 211 | + # 2. Padding is not required |
| 212 | + else { |
| 213 | + tempY = X |
| 214 | + tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE) |
127 | 215 | } |
128 | | - else{ |
129 | | - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) |
| 216 | + |
| 217 | + # Remove the selected column from tempY |
| 218 | + if (col == 1) { |
| 219 | + tempY = tempY[, col+1:ncol(tempY)] |
| 220 | + } |
| 221 | + else if (col == ncol(X)) { |
| 222 | + tempY = tempY[, 1:col-1] |
| 223 | + } |
| 224 | + else { |
| 225 | + tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)]) |
130 | 226 | } |
131 | 227 |
|
132 | | - # Set value of final output |
133 | | - Y = matrix(0, rows=numGroups, cols=totalCells) |
134 | | - Y[,1] = key_unique |
| 228 | + # Set the value of the final output |
| 229 | + Y = matrix(0, rows=numGroups, cols=groupMaxVal) |
| 230 | + Y[,1] = keyUnique |
135 | 231 |
|
136 | | - # The permutation matrix creates a structure where each group's data |
137 | | - # may not fill exactly maxRowsInGroup rows. |
138 | | - # If needed, we need to pad to the expected size first. |
| 232 | + # Each group's data may not fill exactly maxRowsInGroup rows |
| 233 | + # If needed, we need to pad to the expected size first |
139 | 234 | expectedRows = numGroups * maxRowsInGroup |
140 | | - actualRows = nrow(Y_temp_reduce) |
141 | | - |
142 | | - if(actualRows < expectedRows) { |
143 | | - # Pad Y_temp_reduce with zeros to match expected structure |
144 | | - Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp_reduce)) |
145 | | - Y_tmp_padded[1:actualRows,] = Y_temp_reduce |
146 | | - } else { |
147 | | - Y_tmp_padded = Y_temp_reduce |
| 235 | + actualRows = nrow(tempY) |
| 236 | + |
| 237 | + if (actualRows < expectedRows) { |
| 238 | + # Pad tempY with zeros to match expected structure |
| 239 | + tempYPadded = matrix(0, rows=expectedRows, cols=ncol(tempY)) |
| 240 | + tempYPadded[1:actualRows,] = tempY |
| 241 | + } |
| 242 | + else { |
| 243 | + tempYPadded = tempY |
148 | 244 | } |
149 | 245 |
|
150 | | - Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1) |
| 246 | + # Save the initial order of the groups in Y and order Y to match the sorted tempYPadded |
| 247 | + indicesY = order(target = Y, by = 1, index.return = TRUE) |
| 248 | + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) |
| 249 | + |
| 250 | + # Copy the values into Y |
| 251 | + Y[,2:ncol(Y)] = matrix(tempYPadded, rows=numGroups, cols=groupMaxVal-1) |
| 252 | + |
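| 253 | + # Restore the initial order of X (NOTE(review): indicesX is only assigned inside the padding branch above; confirm it is defined when no padding is required) |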
| 254 | + X = cbind(X, indicesX) |
| 255 | + nColX = ncol(X) |
| 256 | + X = order(target = X, by= nColX) |
| 257 | + X = X[, 1:nColX-1] |
| 258 | + |
| 259 | + # Restore the initial order of Y |
| 260 | + Y = cbind(Y, indicesY) |
| 261 | + nColY = ncol(Y) |
| 262 | + Y = order(target = Y, by= nColY) |
| 263 | + Y = Y[, 1:nColY-1] |
151 | 264 | } |
152 | 265 | } |
153 | | - |
|
0 commit comments