|
4 | 4 | from niaarm.dataset import Dataset |
5 | 5 |
|
6 | 6 |
|
7 | | -def _euclidean(u, v): |
8 | | - return 1 - np.linalg.norm(u - v) |
| 7 | +def _euclidean(u, v, features): |
| 8 | + dist = 0 |
| 9 | + for f in features: |
| 10 | + if f.dtype == 'cat': |
| 11 | + weight = 1 / len(f.categories) |
| 12 | + if u[f.name] != v[f.name]: |
| 13 | + dist += weight * weight |
| 14 | + else: |
| 15 | + weight = 1 / (f.max_val - f.min_val) |
| 16 | + dist += (u[f.name] - v[f.name]) * (u[f.name] - v[f.name]) * weight * weight |
| 17 | + |
| 18 | + return 1 - (dist ** 0.5) |
9 | 19 |
|
10 | 20 |
|
11 | 21 | def _cosine_similarity(u, v): |
@@ -37,41 +47,30 @@ def squash(dataset, threshold, similarity='euclidean'): |
37 | 47 | transactions_dummies = pd.get_dummies(dataset.transactions).to_numpy() |
38 | 48 | num_transactions = len(transactions) |
39 | 49 |
|
40 | | - if similarity == 'euclidean': |
41 | | - features_min = np.min(transactions_dummies, axis=0) |
42 | | - features_max = np.max(transactions_dummies, axis=0) |
43 | | - transactions_dummies = transactions_dummies / (features_max - features_min) |
44 | | - |
45 | | - distance = _euclidean if similarity == 'euclidean' else _cosine_similarity |
46 | 50 | squashed = np.zeros(num_transactions, dtype=bool) |
47 | 51 | squashed_transactions = pd.DataFrame(columns=transactions.columns, dtype=int) |
48 | 52 |
|
49 | | - pos = 0 |
50 | | - while pos < num_transactions: |
51 | | - squashed_set = pd.DataFrame(columns=transactions.columns, dtype=int) |
52 | | - while pos < num_transactions and squashed[pos]: |
53 | | - pos += 1 |
54 | | - if pos + 1 < num_transactions: |
55 | | - transaction = pd.DataFrame(transactions.iloc[pos].to_dict(), index=[0]) |
56 | | - squashed_set = pd.concat([squashed_set, transaction], ignore_index=True) |
57 | | - squashed[pos] = True |
58 | | - |
59 | | - i = pos + 1 |
60 | | - while i < num_transactions: |
61 | | - while i < num_transactions and squashed[i]: |
62 | | - i += 1 |
63 | | - |
64 | | - if i < num_transactions: |
65 | | - if distance(transactions_dummies[pos], transactions_dummies[i]) >= threshold: |
66 | | - transaction = pd.DataFrame(transactions.iloc[i].to_dict(), index=[0]) |
67 | | - squashed_set = pd.concat([squashed_set, transaction], ignore_index=True) |
68 | | - squashed[i] = True |
69 | | - i += 1 |
| 53 | + for pos in range(num_transactions): |
| 54 | + if squashed[pos]: |
| 55 | + continue |
| 56 | + |
| 57 | + squashed_set = transactions.iloc[pos:pos + 1] |
| 58 | + squashed[pos] = True |
| 59 | + |
| 60 | + for i in range(pos + 1, num_transactions): |
| 61 | + if squashed[i]: |
| 62 | + continue |
| 63 | + if similarity == 'euclidean': |
| 64 | + distance = _euclidean(transactions.iloc[pos], transactions.iloc[i], dataset.features) |
| 65 | + else: |
| 66 | + distance = _cosine_similarity(transactions_dummies[pos], transactions_dummies[i]) |
| 67 | + |
| 68 | + if distance >= threshold: |
| 69 | + squashed_set = pd.concat([squashed_set, transactions.iloc[i:i + 1]], ignore_index=True) |
| 70 | + squashed[i] = True |
70 | 71 |
|
71 | 72 | if not squashed_set.empty: |
72 | 73 | squashed_transaction = squashed_set.agg(_mean_or_mode) |
73 | 74 | squashed_transactions = pd.concat([squashed_transactions, squashed_transaction], ignore_index=True) |
74 | 75 |
|
75 | | - pos += 1 |
76 | | - |
77 | 76 | return Dataset(squashed_transactions) |
0 commit comments