Skip to content

Commit ef6f9c7

Browse files
authored
Merge pull request #86 from zStupan/main
squashing refactor
2 parents 4d92e19 + f0a3d11 commit ef6f9c7

File tree

1 file changed

+30
-31
lines changed

1 file changed

+30
-31
lines changed

niaarm/preprocessing.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,18 @@
44
from niaarm.dataset import Dataset
55

66

7-
def _euclidean(u, v):
8-
return 1 - np.linalg.norm(u - v)
7+
def _euclidean(u, v, features):
8+
dist = 0
9+
for f in features:
10+
if f.dtype == 'cat':
11+
weight = 1 / len(f.categories)
12+
if u[f.name] != v[f.name]:
13+
dist += weight * weight
14+
else:
15+
weight = 1 / (f.max_val - f.min_val)
16+
dist += (u[f.name] - v[f.name]) * (u[f.name] - v[f.name]) * weight * weight
17+
18+
return 1 - (dist ** 0.5)
919

1020

1121
def _cosine_similarity(u, v):
@@ -37,41 +47,30 @@ def squash(dataset, threshold, similarity='euclidean'):
3747
transactions_dummies = pd.get_dummies(dataset.transactions).to_numpy()
3848
num_transactions = len(transactions)
3949

40-
if similarity == 'euclidean':
41-
features_min = np.min(transactions_dummies, axis=0)
42-
features_max = np.max(transactions_dummies, axis=0)
43-
transactions_dummies = transactions_dummies / (features_max - features_min)
44-
45-
distance = _euclidean if similarity == 'euclidean' else _cosine_similarity
4650
squashed = np.zeros(num_transactions, dtype=bool)
4751
squashed_transactions = pd.DataFrame(columns=transactions.columns, dtype=int)
4852

49-
pos = 0
50-
while pos < num_transactions:
51-
squashed_set = pd.DataFrame(columns=transactions.columns, dtype=int)
52-
while pos < num_transactions and squashed[pos]:
53-
pos += 1
54-
if pos + 1 < num_transactions:
55-
transaction = pd.DataFrame(transactions.iloc[pos].to_dict(), index=[0])
56-
squashed_set = pd.concat([squashed_set, transaction], ignore_index=True)
57-
squashed[pos] = True
58-
59-
i = pos + 1
60-
while i < num_transactions:
61-
while i < num_transactions and squashed[i]:
62-
i += 1
63-
64-
if i < num_transactions:
65-
if distance(transactions_dummies[pos], transactions_dummies[i]) >= threshold:
66-
transaction = pd.DataFrame(transactions.iloc[i].to_dict(), index=[0])
67-
squashed_set = pd.concat([squashed_set, transaction], ignore_index=True)
68-
squashed[i] = True
69-
i += 1
53+
for pos in range(num_transactions):
54+
if squashed[pos]:
55+
continue
56+
57+
squashed_set = transactions.iloc[pos:pos + 1]
58+
squashed[pos] = True
59+
60+
for i in range(pos + 1, num_transactions):
61+
if squashed[i]:
62+
continue
63+
if similarity == 'euclidean':
64+
distance = _euclidean(transactions.iloc[pos], transactions.iloc[i], dataset.features)
65+
else:
66+
distance = _cosine_similarity(transactions_dummies[pos], transactions_dummies[i])
67+
68+
if distance >= threshold:
69+
squashed_set = pd.concat([squashed_set, transactions.iloc[i:i + 1]], ignore_index=True)
70+
squashed[i] = True
7071

7172
if not squashed_set.empty:
7273
squashed_transaction = squashed_set.agg(_mean_or_mode)
7374
squashed_transactions = pd.concat([squashed_transactions, squashed_transaction], ignore_index=True)
7475

75-
pos += 1
76-
7776
return Dataset(squashed_transactions)

0 commit comments

Comments
 (0)