Skip to content

Commit d42a934

Browse files
authored
Merge pull request #7 from JuliaAI/missingness-encoder
🦙 Missingness encoder is here
2 parents 972da5c + eea0466 commit d42a934

File tree

9 files changed

+458
-6
lines changed

9 files changed

+458
-6
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2121

2222
[compat]
2323
CategoricalArrays = "0.10"
24-
MLJModelInterface = "1.10"
24+
MLJModelInterface = "1.11"
2525
ScientificTypes = "3.0"
2626
StatsBase = "0.34"
2727
TableOperations = "1.2"
@@ -30,9 +30,9 @@ julia = "1.6.7"
3030

3131
[extras]
3232
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
33+
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
3334
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
3435
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
35-
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
3636
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
3737
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
3838

src/MLJTransforms.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ export FrequencyEncoder
4343
include("transformers/cardinality_reducer/cardinality_reducer.jl")
4444
include("transformers/cardinality_reducer/interface_mlj.jl")
4545
export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
46+
export CardinalityReducer
47+
include("encoders/missingness_encoding/missingness_encoding.jl")
48+
include("encoders/missingness_encoding/interface_mlj.jl")
49+
export MissingnessEncoder
4650

4751
# Contrast encoder
4852
include("encoders/contrast_encoder/contrast_encoder.jl")
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Error-message builders for `MissingnessEncoder`. Each returns the message text;
# the fit routine wraps it in an `ArgumentError`.

# The raw element type of a column is not one of the supported types.
function UNSUPPORTED_COL_TYPE_ME(col_type)
    return "In MissingnessEncoder, elements have type $col_type. The supported types are `Char`, `AbstractString`, and `Number`"
end

# A key of `label_for_missing` is not one of the supported types.
function VALID_TYPES_NEW_VAL_ME(possible_col_type)
    return "In MissingnessEncoder, label_for_missing keys have type $possible_col_type. The supported types are `Char`, `AbstractString`, and `Number`"
end

# A replacement value in `label_for_missing` is already a level of a feature.
function COLLISION_NEW_VAL_ME(value)
    return "In MissingnessEncoder, label_for_missing specifies new feature name $value. However, this name already exists in one of the features. Please respecify label_for_missing."
end

# No replacement is available in `label_for_missing` for the column's raw type.
function UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)
    available_keys = keys(label_for_missing)
    return "In MissingnessEncoder, $col_type does not appear in label_for_missing which only has keys $available_keys"
end
9+
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
### MissingnessEncoder with MLJ Interface
2+
3+
# 1. Interface Struct
4+
# Hyper-parameter container for the missingness encoder.
# Type parameters: `AS` is the container of feature names, `T` the key type of
# `label_for_missing` (keys are types), and `A` the type of the replacement labels.
mutable struct MissingnessEncoder{AS <: AbstractVector{Symbol}, T <: Type, A} <: Unsupervised
    features::AS                   # feature names to exclude or include (see `ignore`)
    ignore::Bool                   # whether `features` are excluded (true) or included
    ordered_factor::Bool           # whether `OrderedFactor` features are encoded too
    label_for_missing::Dict{T, A}  # raw super type => replacement level for `missing`
end
14+
15+
# 2. Constructor
16+
# Keyword constructor: forwards the hyper-parameters, in field order, to the
# positional constructor of the struct.
function MissingnessEncoder(;
    features = Symbol[],
    ignore = true,
    ordered_factor = false,
    label_for_missing = Dict(AbstractString => "missing", Char => 'm'),
)
    encoder = MissingnessEncoder(features, ignore, ordered_factor, label_for_missing)
    return encoder
end
27+
28+
29+
# 4. Fitted parameters (for user access)
30+
# Expose the fit result to the user as a one-field named tuple.
function MMI.fitted_params(::MissingnessEncoder, fitresult)
    return (label_for_missing_given_feature = fitresult,)
end
33+
34+
# 5. Fit method
35+
# Fit the encoder by delegating to `missingness_encoder_fit` with the model's
# hyper-parameters. Returns `(fitresult, cache, report)`, where `fitresult` is the
# per-feature missing-label mapping, `cache` is `nothing`, and `report` only lists
# the encoded features.
function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
    fit_output = missingness_encoder_fit(
        X,
        transformer.features;
        ignore = transformer.ignore,
        ordered_factor = transformer.ordered_factor,
        label_for_missing = transformer.label_for_missing,
    )

    fitresult = fit_output[:label_for_missing_given_feature]
    report = (encoded_features = fit_output[:encoded_features],)
    return fitresult, nothing, report
end
49+
50+
51+
# 6. Transform method
52+
# Apply the fitted mapping to `Xnew`. The fit result is re-wrapped in the dictionary
# shape that `missingness_encoder_transform` reads its mapping from.
function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
    return missingness_encoder_transform(
        Xnew,
        Dict(:label_for_missing_given_feature => fitresult),
    )
end
60+
61+
# 8. Extra metadata
62+
# Package-level metadata for the model.
MMI.metadata_pkg(
    MissingnessEncoder;
    package_name = "MLJTransforms",
    package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
    package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
    is_pure_julia = true,
)

# Model-level metadata: tables in, tables out.
MMI.metadata_model(
    MissingnessEncoder;
    input_scitype = Table,
    output_scitype = Table,
    load_path = "MLJTransforms.MissingnessEncoder",
)
76+
77+
78+
79+
"""
80+
$(MMI.doc_header(MissingnessEncoder))
81+
82+
`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing").
83+
This way, missingness will be treated as a new
84+
level by any subsequent model. This assumes that the categorical features have raw
85+
types that are in `Char`, `AbstractString`, and `Number`.
86+
87+
# Training data
88+
89+
In MLJ (or MLJBase) bind an instance `model` of this unsupervised model to data with
90+
91+
mach = machine(model, X)
92+
93+
Here:
94+
95+
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
96+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
97+
check scitypes.
98+
99+
Train the machine using `fit!(mach, rows=...)`.
100+
101+
# Hyper-parameters
102+
103+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
104+
- `ignore=true`: Whether to exclude or include the features given in `features`
105+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
106+
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
107+
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
108+
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
109+
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
110+
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
111+
112+
# Operations
113+
114+
- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
115+
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
116+
are always left unchanged.
117+
118+
# Fitted parameters
119+
120+
The fields of `fitted_params(mach)` are:
121+
122+
- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
123+
124+
# Report
125+
126+
The fields of `report(mach)` are:
127+
128+
- `encoded_features`: The subset of the categorical features of X that were encoded
129+
130+
# Examples
131+
132+
```julia
133+
import StatsBase.proportionmap
134+
using MLJ
135+
136+
# Define a table with missing values
137+
Xm = (
138+
A = categorical(["Ben", "John", missing, missing, "Mary", "John", missing]),
139+
B = [1.85, 1.67, missing, missing, 1.5, 1.67, missing],
140+
C= categorical([7, 5, missing, missing, 10, 0, missing]),
141+
D = [23, 23, 44, 66, 14, 23, 11],
142+
E = categorical([missing, 'g', 'r', missing, 'r', 'g', 'p'])
143+
)
144+
145+
encoder = MissingnessEncoder()
146+
mach = fit!(machine(encoder, Xm))
147+
Xnew = transform(mach, Xm)
148+
149+
julia> Xnew
150+
(A = ["Ben", "John", "missing", "missing", "Mary", "John", "missing"],
151+
B = Union{Missing, Float64}[1.85, 1.67, missing, missing, 1.5, 1.67, missing],
152+
C = [7, 5, -1, -1, 10, 0, -1],
153+
D = [23, 23, 44, 66, 14, 23, 11],
154+
E = ['m', 'g', 'r', 'm', 'r', 'g', 'p'],)
155+
156+
```
157+
158+
See also
159+
[`CardinalityReducer`](@ref)
160+
"""
161+
MissingnessEncoder
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
include("errors.jl")
2+
3+
"""
4+
**Private method.**
5+
6+
Fit a transformer that maps any missing value into a new level (e.g., "Missing"). This way, missingness will be treated as a new
7+
level by any subsequent model. This assumes that the categorical features have raw
8+
types that are in `Char`, `AbstractString`, and `Number`.
9+
10+
# Arguments
11+
12+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
13+
`Multiclass` or `OrderedFactor`
14+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
15+
- `ignore=true`: Whether to exclude or include the features given in `features`
16+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
17+
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
18+
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
19+
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
20+
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
21+
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
22+
23+
# Returns (in a dict)
24+
25+
- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
26+
- `encoded_features`: The subset of the categorical features of X that were encoded
27+
"""
28+
function missingness_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    ignore::Bool = true,
    ordered_factor::Bool = false,
    label_for_missing::Dict{<:Type, <:Any} = Dict(
        AbstractString => "missing",
        Char => 'm',
    ),
)
    supportedtypes = Union{Char, AbstractString, Number}
    # Component types of `supportedtypes`; computed once instead of once per column.
    allowed_types = union_types(supportedtypes)

    # 1. Define feature mapper: for one column, build the dictionary that sends
    #    `missing` to its replacement level.
    function feature_mapper(col, name)
        # NOTE(review): reads the raw value type from the first type parameter of the
        # column's non-missing element type — confirm this holds for every
        # categorical value type that can reach this point.
        col_type = nonmissingtype(eltype(col)).parameters[1]
        feat_levels = levels(col; skipmissing = true)

        # Ensure column type is valid
        if !(col_type <: supportedtypes)
            throw(ArgumentError(UNSUPPORTED_COL_TYPE_ME(col_type)))
        end

        # Ensure label_for_missing keys are valid types
        for possible_col_type in keys(label_for_missing)
            if !(possible_col_type in allowed_types)
                throw(ArgumentError(VALID_TYPES_NEW_VAL_ME(possible_col_type)))
            end
        end

        # Check no collision between values(label_for_missing) and feat_levels
        for value in values(label_for_missing)
            if !ismissing(value) && value in feat_levels
                throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
            end
        end

        # Get ancestor type of column: the first supported type it subtypes
        elgrandtype = nothing
        for allowed_type in allowed_types
            if col_type <: allowed_type
                elgrandtype = allowed_type
                break
            end
        end

        # Only `missing` is mapped; nonmissing levels remain as is
        label_for_missing_given_feature = Dict{Missing, col_type}()

        if haskey(label_for_missing, elgrandtype)
            # User-specified (or default) label for this raw super type
            label_for_missing_given_feature[missing] = label_for_missing[elgrandtype]
        elseif elgrandtype === Number
            # Default for numbers: one below the smallest existing level.
            # NOTE(review): this may fail to convert to `col_type` when the minimum is
            # already the type's lowest value (e.g. unsigned `0` or `false`), and
            # `minimum` throws if the column has no non-missing level — confirm
            # whether such columns need handling.
            label_for_missing_given_feature[missing] = minimum(feat_levels) - 1
        else
            throw(ArgumentError(UNSPECIFIED_COL_TYPE_ME(col_type, label_for_missing)))
        end

        return label_for_missing_given_feature
    end

    # 2. Pass it to generic_fit
    label_for_missing_given_feature, encoded_features = generic_fit(
        X, features;
        ignore = ignore,
        ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )

    return Dict(
        :label_for_missing_given_feature => label_for_missing_given_feature,
        :encoded_features => encoded_features,
    )
end
101+
102+
"""
103+
**Private method.**
104+
105+
Apply a fitted missingness encoder to a table given the output of `missingness_encoder_fit`
106+
107+
# Arguments
108+
109+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
110+
`Multiclass` or `OrderedFactor`
111+
- `cache`: The output of `missingness_encoder_fit`
112+
113+
# Returns
114+
115+
- `X_tr`: The table after the selected features have been transformed by the missingness encoder
116+
"""
117+
function missingness_encoder_transform(X, cache::Dict)
    # Only the per-feature missing-label mapping is read from the cache; it is
    # handed to `generic_transform` with `ignore_unknown = true`.
    return generic_transform(
        X,
        cache[:label_for_missing_given_feature];
        ignore_unknown = true,
    )
end
121+

src/transformers/cardinality_reducer/cardinality_reducer.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ include("errors.jl")
77
Fit a transformer that maps any level of a categorical feature that occurs with
88
frequency < `min_frequency` into a new level (e.g., "Other"). This is useful when some categorical features have
99
high cardinality and many levels are infrequent. This assumes that the categorical features have raw
10-
types that are in `Union{Char, AbstractString, Number}`.
10+
types that are in `Char`, `AbstractString`, and `Number`.
1111
1212
# Arguments
1313
@@ -19,7 +19,7 @@ types that are in `Union{Char, AbstractString, Number}`.
1919
- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be
2020
an integer or a float which decides whether raw counts or normalized frequencies are used.
2121
- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A
22-
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies
22+
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies
2323
the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
2424
then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'`
2525
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.

src/transformers/cardinality_reducer/interface_mlj.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ Train the machine using `fit!(mach, rows=...)`.
112112
- `min_frequency::Real=3`: Any level of a categorical feature that occurs with frequency < `min_frequency` will be mapped to a new level. Could be
113113
an integer or a float which decides whether raw counts or normalized frequencies are used.
114114
- `label_for_infrequent::Dict{<:Type, <:Any}()= Dict( AbstractString => "Other", Char => 'O', )`: A
115-
dictionary where the possible values for keys are the types in `Union{Char, AbstractString, Number}` and each value signifies
115+
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and each value signifies
116116
the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
117117
then the new value is `"Other"` and if the raw type subtypes `Char` then the new value is `'O'`
118118
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.

0 commit comments

Comments
 (0)