
Commit edb1a38

committed
✨🔧 Adapt code to comply with Ruff linter (D205) by adding blank lines after docstring summaries.
1 parent 80f82e4 commit edb1a38

File tree

1 file changed: +388 -0 lines changed
Lines changed: 388 additions & 0 deletions
@@ -0,0 +1,388 @@
import numpy as np  # noqa: D100
import pandas as pd


def moy_p(V, weights):
    """Compute the weighted mean of a vector, ignoring NaNs.

    Parameters
    ----------
    V : array-like
        Input vector with possible NaN values.
    weights : array-like
        Weights corresponding to each element in V.

    Returns
    -------
    float
        Weighted mean of non-NaN elements.

    """
    mask = ~np.isnan(V)
    total_weight = np.sum(weights[mask])
    if total_weight == 0:
        return 0.0  # or use np.finfo(float).eps for a small positive value
    return np.sum(V[mask] * weights[mask]) / total_weight
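
# Illustrative check (not part of the commit): the NaN is dropped and the
# remaining values are averaged with their weights, (1*1 + 1*3) / 2 = 2.0:
#   >>> moy_p(np.array([1.0, np.nan, 3.0]), np.array([1.0, 1.0, 1.0]))
#   2.0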


def tab_disjonctif_NA(df):
    """Create a disjunctive (one-hot encoded) table with NaNs preserved.

    Rows with a missing categorical value get NaN in every dummy column of
    that variable (as in R's tab.disjonctif.NA), so callers can locate the
    cells that still need imputation.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame with categorical and numeric variables.

    Returns
    -------
    DataFrame
        Disjunctive table with one-hot encoding and NaNs where values were
        missing.

    """
    df = df.copy()  # do not mutate the caller's DataFrame
    df_encoded_list = []
    for col in df.columns:
        if df[col].dtype.name == "category" or df[col].dtype == object:
            df[col] = df[col].astype("category")
            missing_mask = df[col].isna()
            # One-hot encode the categorical variable
            encoded = pd.get_dummies(
                df[col],
                prefix=col,
                prefix_sep="_",
                dummy_na=False,
                dtype=float,
            )
            # Propagate missingness across the variable's dummy columns
            encoded.loc[missing_mask, :] = np.nan
            df_encoded_list.append(encoded)
        else:
            # Numeric column; keep as is
            df_encoded_list.append(df[[col]])
    # Concatenate all encoded columns
    df_encoded = pd.concat(df_encoded_list, axis=1)
    return df_encoded
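
# Illustrative sketch (not part of the commit): one categorical column with
# a missing third row encodes to two dummy columns, NaN in the missing row:
#   >>> tab_disjonctif_NA(pd.DataFrame({"c": ["a", "b", None]}))
#      c_a  c_b
#   0  1.0  0.0
#   1  0.0  1.0
#   2  NaN  NaN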


def tab_disjonctif_prop(df, seed=None):
    """Perform probabilistic imputation for categorical columns using observed
    value distributions, without creating a separate missing category.

    Parameters
    ----------
    df : DataFrame
        DataFrame with categorical columns to impute.
    seed : int, optional
        Random seed for reproducibility. Default is None.

    Returns
    -------
    DataFrame
        Disjunctive coded DataFrame with missing values probabilistically
        imputed.

    """  # noqa: D205
    if seed is not None:
        np.random.seed(seed)
    df = df.copy()
    df_encoded_list = []
    for col in df.columns:
        if df[col].dtype.name == "category" or df[col].dtype == object:
            # Ensure the column is categorical with string category names
            df[col] = df[col].astype("category")
            df[col] = df[col].cat.rename_categories(
                df[col].cat.categories.astype(str)
            )
            observed = df[col][df[col].notna()]
            categories = df[col].cat.categories.tolist()
            # Get observed frequencies
            freqs = observed.value_counts(normalize=True)
            # Impute missing values based on observed frequencies
            missing_indices = df[col][df[col].isna()].index
            if len(missing_indices) > 0:
                imputed_values = np.random.choice(
                    freqs.index, size=len(missing_indices), p=freqs.values
                )
                df.loc[missing_indices, col] = imputed_values
            # One-hot encode without creating a missing category
            encoded = pd.get_dummies(
                df[col],
                prefix=col,
                prefix_sep="_",
                dummy_na=False,
                dtype=float,
            )
            # Keep one column per category, in category order
            col_names = [f"{col}_{cat}" for cat in categories]
            encoded = encoded.reindex(columns=col_names, fill_value=0.0)
            df_encoded_list.append(encoded)
        else:
            df_encoded_list.append(df[[col]])
    df_encoded = pd.concat(df_encoded_list, axis=1)
    return df_encoded
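
# Illustrative sketch (not part of the commit): missing entries are drawn
# from the observed frequencies ('a' with prob. 2/3, 'b' with 1/3 below),
# then one-hot encoded, so every row has exactly one 1.0 per variable:
#   >>> tab_disjonctif_prop(pd.DataFrame({"c": ["a", "a", "b", None]}), seed=0)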


def find_category(df_original, tab_disj):
    """Reconstruct original categorical variables from the disjunctive table.

    Parameters
    ----------
    df_original : DataFrame
        Original DataFrame with categorical variables.
    tab_disj : DataFrame
        Disjunctive table after imputation.

    Returns
    -------
    DataFrame
        Reconstructed DataFrame with imputed categorical variables.

    """
    df_reconstructed = df_original.copy()
    start_idx = 0
    for col in df_original.columns:
        if (
            df_original[col].dtype.name == "category"
            or df_original[col].dtype == object
        ):  # noqa: E501
            categories = (
                df_original[col].astype("category").cat.categories.tolist()
            )
            if "__MISSING__" in categories:
                missing_cat_index = categories.index("__MISSING__")
            else:
                missing_cat_index = None
            num_categories = len(categories)
            sub_tab = tab_disj.iloc[
                :, start_idx : start_idx + num_categories
            ].copy()
            if missing_cat_index is not None:
                # Never let argmax pick the '__MISSING__' column
                sub_tab.iloc[:, missing_cat_index] = -np.inf
            # Find the category with the maximum value for each row
            max_indices = sub_tab.values.argmax(axis=1)
            df_reconstructed[col] = [categories[idx] for idx in max_indices]
            # Map '__MISSING__' back to NaN
            df_reconstructed[col] = df_reconstructed[col].replace(
                "__MISSING__", np.nan
            )
            start_idx += num_categories
        else:
            # For numeric variables, keep as is
            start_idx += 1  # Increment start_idx by 1 for numeric columns
    return df_reconstructed
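
# Illustrative note (not part of the commit): reconstruction is a per-row
# argmax over each variable's block of dummy columns, e.g. imputed dummies
# [0.2, 0.8] over categories ['a', 'b'] decode to 'b'.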


def svdtriplet(X, row_w=None, ncp=np.inf):
    """Perform weighted SVD on matrix X with row weights.

    Parameters
    ----------
    X : ndarray
        Data matrix of shape (n_samples, n_features).
    row_w : array-like, optional
        Row weights. If None, uniform weights are assumed. Default is None.
    ncp : int, optional
        Number of principal components to retain. Default is infinity.

    Returns
    -------
    s : ndarray
        All singular values (not truncated, so callers can inspect the
        discarded ones, as FactoMineR's svd.triplet does).
    U : ndarray
        Left singular vectors, truncated to ncp columns.
    V : ndarray
        Right singular vectors, truncated to ncp columns.

    """
    if not isinstance(X, np.ndarray):
        X = np.array(X, dtype=float)
    else:
        X = X.astype(float)
    if row_w is None:
        row_w = np.ones(X.shape[0]) / X.shape[0]
    else:
        row_w = np.array(row_w, dtype=float)
        row_w /= row_w.sum()
    ncp = int(min(ncp, X.shape[0] - 1, X.shape[1]))
    # Apply weights to rows
    X_weighted = X * np.sqrt(row_w[:, None])
    # Perform SVD
    U, s, Vt = np.linalg.svd(X_weighted, full_matrices=False)
    V = Vt.T
    # Truncate U and V only; keep every singular value so that callers
    # (e.g. the regularization step in imputeMCA) can see the tail
    U = U[:, :ncp]
    V = V[:, :ncp]
    # Adjust signs to ensure consistency
    mult = np.sign(np.sum(V, axis=0))
    mult[mult == 0] = 1
    U *= mult
    V *= mult
    # Rescale U by the square root of row weights
    U /= np.sqrt(row_w[:, None])
    return s, U, V
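
# Illustrative check (not part of the commit): with uniform row weights the
# triplet reproduces X, since U is rescaled back by 1/sqrt(row_w):
#   >>> X = np.array([[1.0, 0.0], [0.0, 2.0], [1.0, 1.0]])
#   >>> s, U, V = svdtriplet(X)
#   >>> np.allclose((U * s[: U.shape[1]]) @ V.T, X)
#   True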


def imputeMCA(
    don,
    ncp=2,
    method="Regularized",
    row_w=None,
    coeff_ridge=1,
    threshold=1e-6,
    seed=None,
    maxiter=1000,
):
    """Impute missing values using Multiple Correspondence Analysis (MCA).

    Parameters
    ----------
    don : DataFrame
        Input dataset with missing values.
    ncp : int, optional
        Number of principal components for MCA. Default is 2.
    method : str, optional
        Imputation method ('Regularized' or 'EM'). Default is 'Regularized'.
    row_w : array-like, optional
        Row weights. If None, uniform weights are applied. Default is None.
    coeff_ridge : float, optional
        Regularization coefficient for 'Regularized' MCA. Default is 1.
    threshold : float, optional
        Convergence threshold. Default is 1e-6.
    seed : int, optional
        Random seed for reproducibility. Default is None.
    maxiter : int, optional
        Maximum number of iterations for the imputation process.
        Default is 1000.

    Returns
    -------
    dict
        Dictionary containing:
        - "tab_disj": Disjunctive coded table after imputation.
        - "completeObs": Complete dataset with missing values imputed.

    """
    # Ensure the data is a DataFrame and work on a copy
    don = pd.DataFrame(don).copy()

    for col in don.columns:
        if (
            not pd.api.types.is_numeric_dtype(don[col])
            or don[col].dtype == "bool"
        ):  # noqa: E501
            don[col] = don[col].astype("category")
            # Convert categories to strings and rename them
            new_categories = don[col].cat.categories.astype(str)
            don[col] = don[col].cat.rename_categories(new_categories)
        else:
            # Treat numeric 0/1 columns as categorical as well
            unique_values = don[col].dropna().unique()
            if set(unique_values).issubset({0, 1}):
                don[col] = don[col].astype("category")
                new_categories = don[col].cat.categories.astype(str)
                don[col] = don[col].cat.rename_categories(new_categories)  # noqa: E501

    print("Data types after conversion:")
    print(don.dtypes)

    # Handle row weights
    if row_w is None:
        row_w = np.ones(len(don)) / len(don)
    else:
        row_w = np.array(row_w, dtype=float)
        row_w /= row_w.sum()

    # Initial imputation and creation of disjunctive tables
    tab_disj_NA = tab_disjonctif_NA(don)
    tab_disj_comp = tab_disjonctif_prop(don, seed=seed)
    hidden = tab_disj_NA.isna()
    tab_disj_rec_old = tab_disj_comp.copy()

    # Initialize iteration parameters
    nbiter = 0
    continue_flag = True

    while continue_flag:
        nbiter += 1

        # Step 1: Compute weighted means M
        M = (
            tab_disj_comp.apply(lambda col: moy_p(col.values, row_w))
            / don.shape[1]
        )  # noqa: E501
        M = M.replace({0: np.finfo(float).eps})
        M = M.fillna(np.finfo(float).eps)

        if (M < 0).any():
            raise ValueError(
                "Negative values encountered in M. Check data preprocessing."
            )  # noqa: E501

        print(f"Iteration {nbiter}:")
        print("Weighted means (M):")
        print(M.head())

        # Step 2: Center and scale the data
        tab_disj_comp_mean = tab_disj_comp.apply(
            lambda col: moy_p(col.values, row_w)
        )  # noqa: E501
        tab_disj_comp_mean = tab_disj_comp_mean.replace(
            {0: np.finfo(float).eps}
        )  # noqa: E501
        Z = tab_disj_comp.div(tab_disj_comp_mean, axis=1)
        Z_mean = Z.apply(lambda col: moy_p(col.values, row_w))
        Z = Z.subtract(Z_mean, axis=1)
        Zscale = Z.multiply(np.sqrt(M), axis=1)

        print("Centered and scaled data (Zscale):")
        print(Zscale.head())

        # Step 3: Perform weighted SVD; s holds every singular value,
        # while U and V are truncated to ncp components
        s, U, V = svdtriplet(Zscale.values, row_w=row_w, ncp=ncp)
        print("Singular values (s):")
        print(s)
        print("Left singular vectors (U):")
        print(U)
        print("Right singular vectors (V):")
        print(V)

        # Step 4: Regularization (shrinking eigenvalues)
        if method.lower() == "em":
            moyeig = 0
        else:
            # Calculate moyeig based on R's imputeMCA logic
            if len(s) > ncp:
                moyeig = np.mean(s[ncp:] ** 2)
                moyeig = min(moyeig * coeff_ridge, s[ncp] ** 2)
            else:
                # Set to 0 when there are no additional singular values
                moyeig = 0
        eig_shrunk = (s[:ncp] ** 2 - moyeig) / s[:ncp]
        eig_shrunk = np.maximum(eig_shrunk, 0)  # Ensure non-negative
        print("Shrunk eigenvalues (eig_shrunk):")
        print(eig_shrunk)

        # Step 5: Reconstruct the data
        rec = U @ np.diag(eig_shrunk) @ V.T
        tab_disj_rec = pd.DataFrame(
            rec, columns=tab_disj_comp.columns, index=tab_disj_comp.index
        )  # noqa: E501
        tab_disj_rec = tab_disj_rec.div(np.sqrt(M), axis=1) + 1
        tab_disj_rec = tab_disj_rec.multiply(tab_disj_comp_mean, axis=1)
        print("Reconstructed disjunctive table (tab_disj_rec):")
        print(tab_disj_rec.head())

        # Step 6: Compute difference and relative change
        diff = tab_disj_rec - tab_disj_rec_old
        diff_values = diff.values
        hidden_values = hidden.values
        # Zero out observed positions
        diff_values[~hidden_values] = 0
        relch = np.sum((diff_values**2) * row_w[:, None])
        print(f"Relative Change: {relch}\n")

        # Step 7: Update for next iteration; overwrite only the cells that
        # were originally missing (mask writes from tab_disj_rec where
        # hidden is True)
        tab_disj_rec_old = tab_disj_rec.copy()
        tab_disj_comp = tab_disj_comp.mask(hidden, tab_disj_rec)

        # Step 8: Check convergence
        continue_flag = (relch > threshold) and (nbiter < maxiter)

    # Step 9: Reconstruct categorical data
    completeObs = find_category(don, tab_disj_comp)

    return {"tab_disj": tab_disj_comp, "completeObs": completeObs}
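
# Illustrative usage (not part of the commit; column names are made up):
#   >>> df = pd.DataFrame(
#   ...     {
#   ...         "color": ["red", "blue", None, "red"],
#   ...         "size": ["S", None, "M", "M"],
#   ...     }
#   ... )
#   >>> res = imputeMCA(df, ncp=1, seed=42)
#   >>> res["completeObs"].isna().sum().sum()  # every cell imputed
#   0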

0 commit comments