Skip to content

Commit 9b12e5d

Browse files
Merge pull request #101 from AnotherSamWilson/organizational
Removed old poetry sections in pyproject, updated license specification.
2 parents d9145b3 + 262b966 commit 9b12e5d

File tree

15 files changed

+2535
-1748
lines changed

15 files changed

+2535
-1748
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,3 +25,4 @@ pyproject.toml
2525
.devcontainer
2626
Dockerfile
2727
dev_guide.md
28+
.pypirc

LICENSE

Lines changed: 0 additions & 21 deletions
This file was deleted.

README.md

Lines changed: 26 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ with lightgbm. The R version of this package may be found
2929
- Has efficient mean matching solutions.
3030
- Can utilize GPU training
3131
- **Flexible**
32-
- Can impute pandas dataframes
32+
- Can impute pandas dataframes and numpy arrays
3333
- Handles categorical data automatically
3434
- Fits into a sklearn pipeline
3535
- User can customize every aspect of the imputation process
@@ -39,6 +39,7 @@ with lightgbm. The R version of this package may be found
3939
- Data can be imputed in place to save memory
4040
- Can build models on non-missing data
4141

42+
4243
This document contains a thorough walkthrough of the package,
4344
benchmarks, and an introduction to multiple imputation. More information
4445
on MICE can be found in Stef van Buuren’s excellent online book, which
@@ -338,7 +339,7 @@ new_data_imputed = cust_kernel.impute_new_data(new_data=new_data)
338339
print(f"New Data imputed in {(datetime.now() - start_t).total_seconds()} seconds")
339340
```
340341

341-
New Data imputed in 0.040396 seconds
342+
New Data imputed in 0.035129 seconds
342343

343344

344345
## Saving and Loading Kernels
@@ -506,6 +507,19 @@ pd.DataFrame(optimal_params)
506507

507508

508509
<div>
510+
<style scoped>
511+
.dataframe tbody tr th:only-of-type {
512+
vertical-align: middle;
513+
}
514+
515+
.dataframe tbody tr th {
516+
vertical-align: top;
517+
}
518+
519+
.dataframe thead th {
520+
text-align: right;
521+
}
522+
</style>
509523
<table border="1" class="dataframe">
510524
<thead>
511525
<tr style="text-align: right;">
@@ -561,10 +575,10 @@ pd.DataFrame(optimal_params)
561575
</tr>
562576
<tr>
563577
<th>min_sum_hessian_in_leaf</th>
564-
<td>0.1</td>
565-
<td>0.1</td>
566-
<td>0.1</td>
567-
<td>0.1</td>
578+
<td>0.01</td>
579+
<td>0.01</td>
580+
<td>0.01</td>
581+
<td>0.01</td>
568582
</tr>
569583
<tr>
570584
<th>min_gain_to_split</th>
@@ -811,7 +825,7 @@ kernel.plot_feature_importance(dataset=0)
811825

812826

813827

814-
![png](README_files/README_48_0.png)
828+
![png](README_files/README_49_0.png)
815829

816830

817831

@@ -824,7 +838,7 @@ kernel.plot_imputed_distributions()
824838

825839

826840

827-
![png](README_files/README_50_0.png)
841+
![png](README_files/README_51_0.png)
828842

829843

830844

@@ -871,7 +885,7 @@ acclist
871885
0 0.35
872886
1 0.81
873887
2 0.81
874-
3 0.78
888+
3 0.84
875889
Name: Species Imputation Accuracy, dtype: float64
876890

877891

@@ -1021,7 +1035,7 @@ plot_matrix(dat, dat.columns)
10211035

10221036

10231037

1024-
![png](README_files/README_60_0.png)
1038+
![png](README_files/README_61_0.png)
10251039

10261040

10271041

@@ -1054,7 +1068,7 @@ kernel_mean_match.plot_imputed_distributions()
10541068

10551069

10561070

1057-
![png](README_files/README_63_0.png)
1071+
![png](README_files/README_64_0.png)
10581072

10591073

10601074

@@ -1065,7 +1079,7 @@ kernel_no_mean_match.plot_imputed_distributions()
10651079

10661080

10671081

1068-
![png](README_files/README_64_0.png)
1082+
![png](README_files/README_65_0.png)
10691083

10701084

10711085

README_files/README_49_0.png

107 KB
Loading

README_files/README_51_0.png

121 KB
Loading

README_files/README_61_0.png

387 KB
Loading

README_files/README_64_0.png

2.11 KB
Loading

README_files/README_65_0.png

188 KB
Loading

README_gen.ipynb

Lines changed: 14 additions & 14 deletions
Large diffs are not rendered by default.

miceforest/utils.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -50,11 +50,11 @@ def ampute_data(
5050
amputed_data = data.copy()
5151
num_rows = amputed_data.shape[0]
5252
amp_rows = int(perc * num_rows)
53-
random_state = ensure_rng(random_state)
53+
rs = ensure_rng(random_state)
5454
variables = list(data.columns) if variables is None else variables
5555

5656
for col in variables:
57-
ind = random_state.choice(amputed_data.index, size=amp_rows, replace=False)
57+
ind = rs.choice(amputed_data.index, size=amp_rows, replace=False)
5858
amputed_data.loc[ind, col] = np.nan
5959

6060
return amputed_data
@@ -91,7 +91,7 @@ def stratified_subset(
9191
9292
"""
9393

94-
random_state = ensure_rng(random_state=random_state)
94+
rs = ensure_rng(random_state=random_state)
9595

9696
cat = False
9797
if y.dtype.name == "category":
@@ -112,9 +112,7 @@ def stratified_subset(
112112
digits_s = (digits_p * size).round(0).astype("int32")
113113
diff = size - digits_s.sum()
114114
if diff != 0:
115-
digits_fix = random_state.choice(
116-
digits_i, size=abs(diff), p=digits_p, replace=False
117-
)
115+
digits_fix = rs.choice(digits_i, size=abs(diff), p=digits_p, replace=False)
118116
if diff < 0:
119117
for d in digits_fix:
120118
digits_s[d] -= 1
@@ -128,7 +126,7 @@ def stratified_subset(
128126
d_v = digits_v[d_i]
129127
n = digits_s[d_i]
130128
ind = np.where(digits == d_v)[0]
131-
choice = random_state.choice(ind, size=n, replace=False)
129+
choice = rs.choice(ind, size=n, replace=False)
132130
sub[added : (added + n)] = choice
133131
added += n
134132

0 commit comments

Comments
 (0)