Skip to content

Commit 6fa6eeb

Browse files
authored
Merge pull request #27 from JuliaAI/named-tuple-for-cache-better-doc-redundancy
✨ NamedTuple for cache and less documentation redundancy
2 parents 08d973f + 4744103 commit 6fa6eeb

22 files changed

+538
-362
lines changed

src/MLJTransforms.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ using OrderedCollections
1919
const MMI = MLJModelInterface
2020

2121
# Functions of generic use across transformers
22+
include("common_docs.jl")
2223
include("generic.jl")
2324
include("utils.jl")
2425

src/common_docs.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
const X_doc = """
2+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
3+
`Multiclass` or `OrderedFactor`
4+
"""
5+
const X_doc_mlj = """
6+
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
7+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
8+
check scitypes.
9+
"""
10+
const features_doc = """
11+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding,
12+
according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
13+
or a callable that returns true for features to be included/excluded
14+
"""
15+
const ignore_doc = """
16+
- `ignore=true`: Whether to exclude or include the features given in `features`
17+
"""
18+
const ordered_factor_doc = """
19+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
20+
"""
21+
const encoded_features_doc = """
22+
- `encoded_features`: The subset of the categorical features of `X` that were encoded
23+
"""
24+
const cache_doc = """
25+
- `cache`: The output of `contrast_encoder_fit`
26+
"""
27+

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
99
"""
1010
### 1. Dummy Coding
1111
function get_dummy_contrast(k)
12-
return Matrix(1.0I, k, k-1)
12+
return Matrix(1.0I, k, k - 1)
1313
end
1414

1515

1616
### 2. Sum Coding
1717
function get_sum_contrast(k)
18-
C = Matrix(1.0I, k, k-1)
18+
C = Matrix(1.0I, k, k - 1)
1919
C[end, :] .= -1.0
2020
return C
2121
end
@@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
2626
vec = ones(length) .* index / length
2727

2828
# [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
29-
vec[1:index] .= index/length - 1
29+
vec[1:index] .= index / length - 1
3030
return vec
3131
end
3232
function get_backward_diff_contrast(k)
@@ -61,21 +61,21 @@ Fit a contrast encoding scheme on given data in `X`.
6161
6262
# Arguments
6363
64-
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
65-
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
64+
$X_doc
65+
$features_doc
6666
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
67-
If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
68-
contrast encoding scheme for each feature
69-
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
70-
where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
71-
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
72-
- `ignore=true`: Whether to exclude or includes the features given in `features`
73-
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
67+
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
68+
contrast encoding scheme for each feature
69+
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
70+
where `colname` is the name of the feature levels and `k` is its length, and which returns contrast or
71+
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
72+
$ignore_doc
73+
$ordered_factor_doc
7474
75-
# Returns (in a dict)
75+
# Returns as a named-tuple
7676
7777
- `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
78-
- `encoded_features`: The subset of the categorical features of X that were encoded
78+
$encoded_features_doc
7979
"""
8080
function contrast_encoder_fit(
8181
X,
@@ -90,9 +90,10 @@ function contrast_encoder_fit(
9090
if mode isa Vector{Symbol}
9191
mode_is_vector = true
9292
ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
93-
length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
93+
length(features) == length(mode) ||
94+
throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
9495
end
95-
96+
9697
# buildmatrix should be specified if mode is :contrast or :hypothesis
9798
if mode in (:contrast, :hypothesis)
9899
buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
@@ -105,11 +106,13 @@ function contrast_encoder_fit(
105106
k = length(feat_levels)
106107
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
107108
if feat_mode == :contrast
108-
contrastmatrix = buildmatrix(name, k)
109-
size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
109+
contrastmatrix = buildmatrix(name, k)
110+
size(contrastmatrix) == (k, k - 1) ||
111+
throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
110112
elseif feat_mode == :hypothesis
111-
hypothesismatrix = buildmatrix(name, k)
112-
size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
113+
hypothesismatrix = buildmatrix(name, k)
114+
size(hypothesismatrix) == (k - 1, k) ||
115+
throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
113116
contrastmatrix = pinv(hypothesismatrix)
114117
elseif feat_mode == :dummy
115118
contrastmatrix = get_dummy_contrast(k)
@@ -125,7 +128,9 @@ function contrast_encoder_fit(
125128
throw(ArgumentError("Mode $feat_mode is not supported."))
126129
end
127130

128-
vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
131+
vector_given_value_given_feature = OrderedDict(
132+
level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
133+
)
129134
return vector_given_value_given_feature
130135
end
131136

@@ -134,10 +139,9 @@ function contrast_encoder_fit(
134139
X, features; ignore = ignore, ordered_factor = ordered_factor,
135140
feature_mapper = feature_mapper,
136141
)
137-
138-
cache = Dict(
139-
:vector_given_value_given_feature => vector_given_value_given_feature,
140-
:encoded_features => encoded_features,
142+
cache = (
143+
vector_given_value_given_feature = vector_given_value_given_feature,
144+
encoded_features = encoded_features,
141145
)
142146

143147
return cache
@@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
157161
158162
- `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
159163
"""
160-
function contrast_encoder_transform(X, cache::Dict)
161-
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
162-
return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
163-
end
164+
function contrast_encoder_transform(X, cache::NamedTuple)
165+
vector_given_value_given_feature = cache.vector_given_value_given_feature
166+
return generic_transform(
167+
X,
168+
vector_given_value_given_feature,
169+
single_feat = false;
170+
use_levelnames = true,
171+
)
172+
end

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,18 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
3636
buildmatrix = transformer.buildmatrix,
3737
ordered_factor = transformer.ordered_factor,
3838
)
39-
fitresult = generic_cache[:vector_given_value_given_feature]
39+
fitresult = generic_cache.vector_given_value_given_feature
4040

41-
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
41+
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
4242
cache = nothing
4343
return fitresult, cache, report
4444
end;
4545

4646

4747
# 6. Transform method
4848
function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
49-
generic_cache = Dict(
50-
:vector_given_value_given_feature =>
51-
fitresult,
49+
generic_cache = (
50+
vector_given_value_given_feature = fitresult,
5251
)
5352
Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
5453
return Xnew_transf
@@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
8786
8887
Here:
8988
90-
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
91-
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
92-
check scitypes.
89+
$X_doc_mlj
9390
9491
Train the machine using `fit!(mach, rows=...)`.
9592
9693
# Hyper-parameters
9794
98-
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
95+
$features_doc
9996
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
10097
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
10198
contrast encoding scheme for each feature
10299
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
103100
where `colname` is the name of the feature levels and `k` is its length, and which returns contrast or
104101
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
105-
- `ignore=true`: Whether to exclude or includes the features given in `features`
106-
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
102+
$ignore_doc
103+
$ordered_factor_doc
107104
108105
# Operations
109106
@@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are:
121118
122119
The fields of `report(mach)` are:
123120
124-
- `encoded_features`: The subset of the categorical features of X that were encoded
121+
$encoded_features_doc
125122
126123
# Examples
127124

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,16 @@ categorical features with their (normalized or raw) frequencies of occurrence in
77
88
# Arguments
99
10-
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
11-
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
12-
- `ignore=true`: Whether to exclude or includes the features given in `features`
13-
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
10+
$X_doc
11+
$features_doc
12+
$ignore_doc
13+
$ordered_factor_doc
1414
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
1515
16-
# Returns (in a dict)
16+
# Returns as a named-tuple
1717
1818
- `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
19-
- `encoded_features`: The subset of the categorical features of X that were encoded
19+
$encoded_features_doc
2020
"""
2121
function frequency_encoder_fit(
2222
X,
@@ -39,11 +39,11 @@ function frequency_encoder_fit(
3939
# 2. Pass it to generic_fit
4040
statistic_given_feat_val, encoded_features = generic_fit(
4141
X, features; ignore = ignore, ordered_factor = ordered_factor,
42-
feature_mapper = feature_mapper,
43-
)
44-
cache = Dict(
45-
:statistic_given_feat_val => statistic_given_feat_val,
46-
:encoded_features => encoded_features,
42+
feature_mapper = feature_mapper)
43+
44+
cache = (
45+
statistic_given_feat_val = statistic_given_feat_val,
46+
encoded_features = encoded_features,
4747
)
4848
return cache
4949
end
@@ -62,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali
6262
6363
- `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
6464
"""
65-
function frequency_encoder_transform(X, cache::Dict)
66-
statistic_given_feat_val = cache[:statistic_given_feat_val]
65+
function frequency_encoder_transform(X, cache::NamedTuple)
66+
statistic_given_feat_val = cache.statistic_given_feat_val
6767
return generic_transform(X, statistic_given_feat_val)
6868
end

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,18 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
3636
normalize = transformer.normalize,
3737
output_type = transformer.output_type,
3838
)
39-
fitresult = generic_cache[:statistic_given_feat_val]
39+
fitresult = generic_cache.statistic_given_feat_val
4040

41-
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
41+
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
4242
cache = nothing
4343
return fitresult, cache, report
4444
end;
4545

4646

4747
# 6. Transform method
4848
function MMI.transform(transformer::FrequencyEncoder, fitresult, Xnew)
49-
generic_cache = Dict(
50-
:statistic_given_feat_val =>
51-
fitresult,
49+
generic_cache = (
50+
statistic_given_feat_val = fitresult,
5251
)
5352
Xnew_transf = frequency_encoder_transform(Xnew, generic_cache)
5453
return Xnew_transf
@@ -87,18 +86,16 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
8786
8887
Here:
8988
90-
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
91-
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
92-
check scitypes.
89+
$X_doc_mlj
9390
9491
Train the machine using `fit!(mach, rows=...)`.
9592
9693
# Hyper-parameters
9794
98-
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
99-
- `ignore=true`: Whether to exclude or include the features given in `features`
100-
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
101-
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
95+
$features_doc
96+
$ignore_doc
97+
$ordered_factor_doc
98+
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
10299
- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
103100
104101
# Operations
@@ -117,7 +114,7 @@ The fields of `fitted_params(mach)` are:
117114
118115
The fields of `report(mach)` are:
119116
120-
- `encoded_features`: The subset of the categorical features of X that were encoded
117+
$encoded_features_doc
121118
122119
# Examples
123120

src/encoders/missingness_encoding/interface_mlj.jl

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,19 +39,18 @@ function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
3939
ordered_factor = transformer.ordered_factor,
4040
label_for_missing = transformer.label_for_missing,
4141
)
42-
fitresult = generic_cache[:label_for_missing_given_feature]
42+
fitresult = generic_cache.label_for_missing_given_feature
4343

44-
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
44+
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
4545
cache = nothing
4646
return fitresult, cache, report
4747
end;
4848

4949

5050
# 6. Transform method
5151
function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
52-
generic_cache = Dict(
53-
:label_for_missing_given_feature =>
54-
fitresult,
52+
generic_cache = (
53+
label_for_missing_given_feature = fitresult,
5554
)
5655
Xnew_transf = missingness_encoder_transform(Xnew, generic_cache)
5756
return Xnew_transf
@@ -91,17 +90,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
9190
9291
Here:
9392
94-
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
95-
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
96-
check scitypes.
93+
$X_doc_mlj
9794
9895
Train the machine using `fit!(mach, rows=...)`.
9996
10097
# Hyper-parameters
10198
102-
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
103-
- `ignore=true`: Whether to exclude or includes the features given in `features`
104-
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
99+
$features_doc
100+
$ignore_doc
101+
$ordered_factor_doc
105102
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
106103
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
107104
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
@@ -124,7 +121,7 @@ The fields of `fitted_params(mach)` are:
124121
125122
The fields of `report(mach)` are:
126123
127-
- `encoded_features`: The subset of the categorical features of X that were encoded
124+
$encoded_features_doc
128125
129126
# Examples
130127

0 commit comments

Comments
 (0)