|
24 | 24 | # |
25 | 25 | # INPUT: |
26 | 26 | # -------------------------------------------------------------------------------------- |
27 | | -# minority Matrix of minority class samples |
28 | | -# majority Matrix of majority class samples |
29 | | -# k Number of nearest neighbors |
30 | | -# beta Desired balance level after generation of synthetic data [0, 1] |
| 27 | +# X Feature matrix [shape: n-by-m] |
| 28 | +# Y Class labels [shape: n-by-1] |
| 29 | +# k Number of nearest neighbors |
| 30 | +# beta Desired balance level after generation of synthetic data [0, 1] |
| 31 | +# dth Distribution threshold |
31 | 32 | # -------------------------------------------------------------------------------------- |
32 | 33 | # |
33 | 34 | # OUTPUT: |
34 | 35 | # ------------------------------------------------------------------------------------- |
35 | | -# Z Matrix of G synthetic minority class samples, with G = (ml-ms)*beta |
| 36 | +# Xp Feature matrix of the n original rows followed by approximately G = (ml-ms)*beta synthetic rows
| 37 | +# Yp Class labels aligned with the rows of Xp
36 | 38 | # ------------------------------------------------------------------------------------- |
37 | 39 |
|
38 | | -m_adasyn = function(Matrix[Double] minority, Matrix[Double] majority, Integer k = 1, Double beta = 0.8) |
39 | | - return (Matrix[Double] Z) |
| 40 | +m_adasyn = function(Matrix[Double] X, Matrix[Double] Y, Integer k = 2, |
| 41 | + Double beta = 1.0, Double dth = 0.9) |
| 42 | + return (Matrix[Double] Xp, Matrix[Double] Yp) |
40 | 43 | { |
41 | 44 | if(k < 1) { |
42 | 45 | print("ADASYN: k should not be less than 1. Setting k = 1.")
43 | 46 | k = 1 |
44 | 47 | } |
45 | 48 |
|
46 | 49 | # Preprocessing |
47 | | - dth = 0.9 |
48 | | - ms = nrow(minority) |
49 | | - ml = nrow(majority) |
50 | | - combined = rbind(minority, majority) |
| 50 | + freq = t(table(Y, 1)); |
| 51 | + minorIdx = as.scalar(rowIndexMin(freq)) |
| 52 | + majorIdx = as.scalar(rowIndexMax(freq)) |
51 | 53 |
|
52 | 54 | # (Step 1) |
53 | 55 | # Calculate the degree of class imbalance, where d in (0, 1] |
54 | | - d = ms/ml |
| 56 | + d = as.scalar(freq[1,minorIdx])/sum(freq) |
55 | 57 |
|
56 | 58 | # (Step 2) |
57 | 59 | # Check if imbalance is lower than predefined threshold |
58 | | - if(d >= dth){ |
| 60 | + print("ADASYN: class imbalance: " + d) |
| 61 | + |
| 62 | + if(d >= dth) { |
59 | 63 | stop("ADASYN: Class imbalance not large enough.") |
60 | 64 | } |
61 | 65 |
|
62 | 66 | # (Step 2a) |
63 | 67 | # Calculate number of synthetic data examples |
64 | | - G = (ml-ms)*beta |
| 68 | + G = as.scalar(freq[1,majorIdx]-freq[1,minorIdx])*beta |
65 | 69 |
|
66 | 70 | # (Step 2b) |
67 | | - # For each x_i in minority class, find k nearest neighbors. |
68 | | - # Then, compute ratio r of neighbors belonging to majority class to total number of neighbors k |
69 | | - NNR = knnbf(combined, minority, k+1) |
70 | | - NNR = NNR[,2:ncol(NNR)] |
71 | | - delta = rowSums(NNR>ms) |
72 | | - r = delta/k |
73 | | - r = r + 0 #only to force materialization, caught by compiler rewrites |
74 | | - |
75 | | - # (Step 2c) |
76 | | - # Normalize ratio vector r |
77 | | - rSum = sum(r) |
78 | | - r = r/rSum |
79 | | - |
80 | | - # (Step 2d) |
81 | | - # Calculate the number of synthetic data examples that need to be |
82 | | - # generated for each minority example x_i |
83 | | - # Then, pre-allocate the result matrix Z |
84 | | - g = round(r * G) |
85 | | - gSum = sum(g) |
86 | | - Z = matrix(0, rows=gSum, cols=ncol(minority)) # output matrix, slightly overallocated |
87 | | - |
88 | | - # (Step 2e) |
89 | | - # For each minority class data example x_i, generate g_i synthetic data examples by |
90 | | - # looping from 1 to g_i and randomly choosing one minority data example x_j from |
91 | | - # the k-nearest neighbors. Then, compute the synthetic sample s_i as |
92 | | - # s_i = x_i + (x_j - x_i) * lambda, with lambda being a random number in [0, 1]. |
93 | | - minNNR = NNR * (NNR <= ms) # set every index from majority class to zero |
94 | | - zeroCount = 0 |
95 | | - for(i in 1:nrow(minority)){ |
96 | | - row = minNNR[i, ] # slice a row |
97 | | - minRow = removeEmpty(target=row, margin="cols") # remove all zero values from that row |
98 | | - hasSynthetic = as.scalar(g[i])>0 |
99 | | - hasMinorityNN = (as.scalar(minRow[1, 1]) > 0) & (hasSynthetic) |
100 | | - if(hasMinorityNN){ |
101 | | - for(j in 1:as.scalar(g[i])){ |
102 | | - randomIndex = as.scalar(sample(ncol(minRow), 1)) |
103 | | - lambda = as.scalar(rand(rows=1, cols=1, min=0, max=1)) |
104 | | - randomMinIndex = as.scalar(minRow[ , randomIndex]) |
105 | | - randomMinNN = minority[randomMinIndex, ] |
106 | | - insIdx = i+j-1-zeroCount |
107 | | - Z[insIdx, ] = minority[i, ] + (randomMinNN - minority[i, ]) * lambda |
108 | | - } |
109 | | - } else { |
110 | | - zeroCount = zeroCount + 1 |
111 | | - } |
112 | | - } |
113 | | - |
114 | | - diff = nrow(minority) - gSum |
115 | | - numTrailZeros = zeroCount - diff |
116 | | - Z = Z[1:gSum-numTrailZeros, ] |
| 71 | + # For each x_i in the non-majority classes, find the k nearest neighbors.
| 72 | + # Sample approximately G random points from the resulting kNN index pool via a 0/1 selection-matrix multiply
| 73 | + Xnonmajor = removeEmpty(target=X, margin="rows", select=(Y!=majorIdx)) |
| 74 | + Ynonmajor = removeEmpty(target=Y, margin="rows", select=(Y!=majorIdx)) |
| 75 | + NNR = knnbf(Xnonmajor, Xnonmajor, k+1) |
| 76 | + NNR = matrix(NNR, rows=length(NNR), cols=1) |
| 77 | + I = rand(rows=nrow(NNR), cols=1) < (G/nrow(NNR)) |
| 78 | + NNRg = removeEmpty(target=NNR, margin="rows", select=I); |
| 79 | + P = table(seq(1, nrow(NNRg)), NNRg, nrow(NNRg), nrow(Xnonmajor)); |
| 80 | + Xp = rbind(X, P %*% Xnonmajor); |
| 81 | + Yp = rbind(Y, P %*% Ynonmajor); # multi-class |
117 | 82 | } |
118 | 83 |
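The new Step 2b gathers the sampled rows with a single matrix multiply: table() builds a 0/1 selection matrix P with exactly one 1 per row, so P %*% Xnonmajor pulls out the selected (possibly repeated) rows. A minimal standalone DML sketch of that trick, using a made-up toy matrix A and index vector idx purely for illustration:

  A   = matrix("10 20 30 40 50 60", rows=3, cols=2)        # toy feature matrix
  idx = matrix("3 1 3", rows=3, cols=1)                     # row indices to pick (repeats allowed)
  P   = table(seq(1, nrow(idx)), idx, nrow(idx), nrow(A))   # P[i, idx[i]] = 1, all else 0
  B   = P %*% A                                             # B contains rows 3, 1, 3 of A
  print(toString(B))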
|
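For reference, a hedged usage sketch of the new interface on synthetic data (assuming the script is wired up as the adasyn() builtin; inside a plain DML script the function would be called as m_adasyn instead):

  X = rand(rows=100, cols=4, seed=7)                                  # 100 samples, 4 features
  Y = rbind(matrix(1, rows=90, cols=1), matrix(2, rows=10, cols=1))   # 90:10 class imbalance
  [Xp, Yp] = adasyn(X=X, Y=Y, k=2, beta=1.0, dth=0.9)                 # oversample the minority class
  print("rows before: " + nrow(X) + ", rows after: " + nrow(Xp))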