Skip to content

Commit 9fb4c27

Browse files
committed
🦙 Missingness encoder is here
1 parent eff5da6 commit 9fb4c27

File tree

7 files changed

+460
-3
lines changed

7 files changed

+460
-3
lines changed

Project.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
99
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
1010
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1111
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
12+
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1213
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
1314
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
1415
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
@@ -20,7 +21,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2021

2122
[compat]
2223
CategoricalArrays = "0.10"
23-
MLJModelInterface = "1.10"
24+
MLJModelInterface = "1.11"
2425
ScientificTypes = "3.0"
2526
StatsBase = "0.34"
2627
TableOperations = "1.2"
@@ -29,10 +30,11 @@ julia = "1.6.7"
2930

3031
[extras]
3132
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
33+
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
3234
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
3335
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
36+
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
3437
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
35-
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
3638

3739
[targets]
38-
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs"]
40+
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]

src/MLJTransforms.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,27 @@ include("encoders/target_encoding/errors.jl")
2424
include("encoders/target_encoding/target_encoding.jl")
2525
include("encoders/target_encoding/interface_mlj.jl")
2626
export target_encoder_fit, target_encoder_transform, TargetEncoder
27+
export TargetEncoder
2728

2829
# Ordinal encoding
2930
include("encoders/ordinal_encoding/ordinal_encoding.jl")
3031
include("encoders/ordinal_encoding/interface_mlj.jl")
3132
export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
33+
export OrdinalEncoder
3234

3335
# Frequency encoding
3436
include("encoders/frequency_encoding/frequency_encoding.jl")
3537
include("encoders/frequency_encoding/interface_mlj.jl")
3638
export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
39+
export FrequencyEncoder
3740

3841
# Cardinality reduction
3942
include("transformers/cardinality_reducer/cardinality_reducer.jl")
4043
include("transformers/cardinality_reducer/interface_mlj.jl")
4144
export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
45+
export CardinalityReducer
46+
include("encoders/missingness_encoding/missingness_encoding.jl")
47+
include("encoders/missingness_encoding/interface_mlj.jl")
48+
export MissingnessEncoder
4249

4350
end
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Error-message builders for the missingness encoder. Each returns the text that is
# wrapped in an `ArgumentError` by the caller.

# Message for a column whose raw element type `col_type` is not a supported type.
function UNSUPPORTED_COL_TYPE_ME(col_type)
    return "In MissingnessEncoder, elements have type $(col_type). The supported types are `Union{Char, AbstractString, Number}`"
end

# Message for a `label_for_missing` key `possible_col_type` that is not a supported type.
function VALID_TYPES_NEW_VAL_ME(possible_col_type)
    return "In MissingnessEncoder, label_for_missing keys have type $(possible_col_type). The supported types are `Union{Char, AbstractString, Number}`"
end

# Message for a replacement `value` that already occurs among a feature's levels.
function COLLISION_NEW_VAL_ME(value)
    return "In MissingnessEncoder, label_for_missing specifies new feature name $(value). However, this name already exists in one of the features. Please respecify label_for_missing."
end

# Message for a column type that has no matching entry in `label_for_missing`.
function UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)
    return "In MissingnessEncoder, $(col_type) does not appear in label_for_missing which only has keys $(keys(label_for_missing))"
end
9+
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
### MissingnessEncoder with MLJ Interface
2+
3+
# 1. Interface Struct
4+
# Hyper-parameter container for the missingness encoder; subtypes `Unsupervised`.
mutable struct MissingnessEncoder{
    AS <: AbstractVector{Symbol},
    T <: Type,
    A,
} <: Unsupervised
    # Names of the categorical features to exclude from, or include in, encoding
    features::AS
    # Whether the features listed in `features` are excluded (`true`) or included
    ignore::Bool
    # Whether `OrderedFactor` features are encoded or ignored
    ordered_factor::Bool
    # Maps a raw column super type to the level that replaces `missing`
    label_for_missing::Dict{T, A}
end
14+
15+
# 2. Constructor
16+
# Keyword constructor: every hyper-parameter has a default, so `MissingnessEncoder()`
# builds an encoder with the default replacement labels.
function MissingnessEncoder(;
    features = Symbol[],
    ignore = true,
    ordered_factor = false,
    label_for_missing = Dict(AbstractString => "missing", Char => 'm'),
)
    encoder = MissingnessEncoder(features, ignore, ordered_factor, label_for_missing)
    return encoder
end
27+
28+
29+
# 4. Fitted parameters (for user access)
30+
# The fit result is returned unchanged under the field `new_cat_given_col_val`.
function MMI.fitted_params(::MissingnessEncoder, fitresult)
    return (new_cat_given_col_val = fitresult,)
end
33+
34+
# 5. Fit method
35+
# Fit the encoder on table `X` by delegating to `missingness_encoder_fit` with the
# transformer's hyper-parameters. Returns `(fitresult, cache, report)`; no cache is kept.
function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
    fit_output = missingness_encoder_fit(
        X,
        transformer.features;
        ignore = transformer.ignore,
        ordered_factor = transformer.ordered_factor,
        label_for_missing = transformer.label_for_missing,
    )

    # The report only has the list of encoded features
    report = (encoded_features = fit_output[:encoded_features],)
    return fit_output[:new_cat_given_col_val], nothing, report
end
49+
50+
51+
# 6. Transform method
52+
# Apply the fitted encoder to `Xnew`. `missingness_encoder_transform` expects a cache
# dictionary, so the fit result is wrapped under the key it looks up.
function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
    return missingness_encoder_transform(
        Xnew,
        Dict(:new_cat_given_col_val => fitresult),
    )
end
60+
61+
# 8. Extra metadata
62+
# Package-level metadata for the model
MMI.metadata_pkg(
    MissingnessEncoder;
    package_name = "MLJTransforms",
    package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
    package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
    is_pure_julia = true,
)

# Model-level metadata: tables in, tables out
MMI.metadata_model(
    MissingnessEncoder;
    input_scitype = Table,
    output_scitype = Table,
    load_path = "MLJTransforms.MissingnessEncoder",
)
76+
77+
78+
79+
"""
80+
$(MMI.doc_header(MissingnessEncoder))
81+
82+
`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing").
83+
By this, missingness will be treated as a new
84+
level by any subsequent model. This assumes that the categorical features have raw
85+
types that are in `Union{Char, AbstractString, Number}`.
86+
87+
# Training data
88+
89+
In MLJ (or MLJBase), bind an unsupervised `model` instance to data with
90+
91+
mach = machine(model, X)
92+
93+
Here:
94+
95+
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
96+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
97+
check scitypes.
98+
99+
Train the machine using `fit!(mach, rows=...)`.
100+
101+
# Hyper-parameters
102+
103+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
104+
- `ignore=true`: Whether to exclude or include the features given in `features`
105+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
106+
- `label_for_missing::Dict{<:Type, <:Any} = Dict(AbstractString => "missing", Char => 'm')`: A
107+
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value
108+
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
109+
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
110+
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
111+
112+
# Operations
113+
114+
- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
115+
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
116+
are always left unchanged.
117+
118+
# Fitted parameters
119+
120+
The fields of `fitted_params(mach)` are:
121+
122+
- `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
123+
124+
# Report
125+
126+
The fields of `report(mach)` are:
127+
128+
- `encoded_features`: The subset of the categorical features of X that were encoded
129+
130+
# Examples
131+
132+
```julia
133+
import StatsBase.proportionmap
134+
using MLJ
135+
136+
# Define a table with missing values
137+
Xm = (
138+
A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]),
139+
B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing],
140+
C= categorical([7, 5, missing, missing, 10, 0, missing]),
141+
D = [23, 23, 44, 66, 14, 23, 11],
142+
E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p'])
143+
)
144+
145+
encoder = MissingnessEncoder(ordered_factor = false)
146+
mach = fit!(machine(encoder, Xm))
147+
Xnew = transform(mach, Xm)
148+
149+
julia> Xnew
150+
(A = ["Ben", "John", "missing", "missing", "Mary", "John", "missing"],
151+
B = Union{Missing, Float64}[1.85, 1.67, missing, missing, 1.5, 1.67, missing],
152+
C = [7, 5, -1, -1, 10, 0, -1],
153+
D = [23, 23, 44, 66, 14, 23, 11],
154+
E = ['m', 'g', 'r', 'm', 'r', 'g', 'p'],)
155+
156+
```
157+
158+
See also
159+
[`CardinalityReducer`](@ref)
160+
"""
161+
MissingnessEncoder
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
include("errors.jl")
2+
3+
"""
4+
**Private method.**
5+
6+
Fit a transformer that maps any missing value into a new level (e.g., "Missing"). By this, missingness will be treated as a new
7+
level by any subsequent model. This assumes that the categorical features have raw
8+
types that are in `Union{Char, AbstractString, Number}`.
9+
10+
# Arguments
11+
12+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
13+
`Multiclass` or `OrderedFactor`
14+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
15+
- `ignore=true`: Whether to exclude or include the features given in `features`
16+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
17+
- `label_for_missing::Dict{<:Type, <:Any} = Dict(AbstractString => "missing", Char => 'm')`: A
18+
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and where each value
19+
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
20+
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
21+
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
22+
23+
# Returns (in a dict)
24+
25+
- `new_cat_given_col_val`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
26+
- `encoded_features`: The subset of the categorical features of X that were encoded
27+
"""
28+
function missingness_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    ignore::Bool = true,
    ordered_factor::Bool = false,
    label_for_missing::Dict{<:Type, <:Any} = Dict(
        AbstractString => "missing",
        Char => 'm',
    ),
)
    supportedtypes = Union{Char, AbstractString, Number}
    # Members of the supported union; computed once and shared by every call of
    # `feature_mapper` below.
    allowed_types = union_types(supportedtypes)

    # 1. Ensure label_for_missing keys are valid types. This check does not depend on
    # any column, so it is done once up front instead of once per encoded feature.
    for possible_col_type in keys(label_for_missing)
        if !(possible_col_type in allowed_types)
            throw(ArgumentError(VALID_TYPES_NEW_VAL_ME(possible_col_type)))
        end
    end

    # 2. Define feature mapper: for one column, build the dictionary that maps
    # `missing` to its replacement level.
    function feature_mapper(col, name)
        # Raw type of the column: first type parameter of its non-missing element type
        col_type = nonmissingtype(eltype(col)).parameters[1]
        feat_levels = levels(col; skipmissing = true)

        # Ensure column type is valid
        if !(col_type <: supportedtypes)
            throw(ArgumentError(UNSUPPORTED_COL_TYPE_ME(col_type)))
        end

        # Check no collision between values(label_for_missing) and feat_levels
        for value in values(label_for_missing)
            if !ismissing(value) && value in feat_levels
                throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
            end
        end

        # Get ancestor type of column: the first supported type that `col_type`
        # subtypes. Stays `nothing` when no single member matches.
        elgrandtype = nothing
        for allowed_type in allowed_types
            if col_type <: allowed_type
                elgrandtype = allowed_type
                break
            end
        end

        # Nonmissing levels remain as is, so only `missing` gets an entry
        new_cat_given_col_val = Dict{Missing, col_type}()

        # Missing levels are mapped
        if elgrandtype in keys(label_for_missing)
            new_cat_given_col_val[missing] = label_for_missing[elgrandtype]
        elseif elgrandtype == Number
            # No user-supplied label for numbers: use one below the smallest level
            new_cat_given_col_val[missing] = minimum(feat_levels) - 1
        else
            throw(ArgumentError(UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)))
        end

        return new_cat_given_col_val::Dict{Missing, col_type}
    end

    # 3. Pass it to generic_fit
    new_cat_given_col_val, encoded_features = generic_fit(
        X,
        features;
        ignore = ignore,
        ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )
    cache = Dict(
        :new_cat_given_col_val => new_cat_given_col_val,
        :encoded_features => encoded_features,
    )
    return cache
end
101+
102+
"""
103+
**Private method.**
104+
105+
Apply a fitted missingness encoder to a table given the output of `missingness_encoder_fit`
106+
107+
# Arguments
108+
109+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
110+
`Multiclass` or `OrderedFactor`
111+
- `cache`: The output of `missingness_encoder_fit`
112+
113+
# Returns
114+
115+
- `X_tr`: The table after the selected features are transformed by the missingness encoder
116+
"""
117+
function missingness_encoder_transform(X, cache::Dict)
    # Only the per-column replacement dictionaries are read from the cache
    return generic_transform(X, cache[:new_cat_given_col_val]; ignore_unknown = true)
end
121+

0 commit comments

Comments
 (0)