
Commit 4744103

✨ Reduce docstring redundancy
1 parent 5358dce commit 4744103


15 files changed: 131 additions & 113 deletions


src/MLJTransforms.jl

Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@ using OrderedCollections
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
+include("common_docs.jl")
 include("generic.jl")
 include("utils.jl")

src/common_docs.jl

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+const X_doc = """
+- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
+  `Multiclass` or `OrderedFactor`
+"""
+const X_doc_mlj = """
+- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
+  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
+  check scitypes.
+"""
+const features_doc = """
+- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
+  according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
+  or a callable that returns true for features to be included/excluded
+"""
+const ignore_doc = """
+- ignore=true: Whether to exclude or include the features given in `features`
+"""
+const ordered_factor_doc = """
+- ordered_factor=false: Whether to encode `OrderedFactor` or ignore them
+"""
+const encoded_features_doc = """
+- encoded_features: The subset of the categorical features of `X` that were encoded
+"""
+const cache_doc = """
+- `cache`: The output of `contrast_encoder_fit`
+"""
+
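For context, here is a minimal, hypothetical sketch (not part of this commit; `my_encoder_fit` is an invented name) of the pattern the new file enables: the shared fragments are spliced into each encoder's docstring with ordinary `$` string interpolation, exactly as the hunks below do.

    const X_doc = """
    - `X`: A table whose categorical features have scitype `Multiclass` or `OrderedFactor`
    """

    # The shared fragment is interpolated into the docstring attached to the function.
    """
        my_encoder_fit(X; ignore = true)

    Toy encoder used only to illustrate docstring interpolation.

    # Arguments
    $X_doc
    """
    function my_encoder_fit(X; ignore = true)
        # encoder-specific fitting logic would live here
    end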

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 33 additions & 23 deletions

@@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
 """
 ### 1. Dummy Coding
 function get_dummy_contrast(k)
-    return Matrix(1.0I, k, k-1)
+    return Matrix(1.0I, k, k - 1)
 end
 
 
 ### 2. Sum Coding
 function get_sum_contrast(k)
-    C = Matrix(1.0I, k, k-1)
+    C = Matrix(1.0I, k, k - 1)
     C[end, :] .= -1.0
     return C
 end
@@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
     vec = ones(length) .* index / length
 
     # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
-    vec[1:index] .= index/length - 1
+    vec[1:index] .= index / length - 1
     return vec
 end
 function get_backward_diff_contrast(k)
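For readers unfamiliar with these coding schemes, the following self-contained sketch reproduces the two helpers above and shows the matrices they return for k = 3 (row i is the contrast vector assigned to level i):

    using LinearAlgebra   # provides the uniform scaling object `I`

    # Dummy coding: k levels map to k - 1 columns; the last level gets the zero vector.
    get_dummy_contrast(k) = Matrix(1.0I, k, k - 1)

    # Sum coding: identical, except the last level gets a row of -1s.
    function get_sum_contrast(k)
        C = Matrix(1.0I, k, k - 1)
        C[end, :] .= -1.0
        return C
    end

    get_dummy_contrast(3)   # [1.0 0.0; 0.0 1.0; 0.0 0.0]
    get_sum_contrast(3)     # [1.0 0.0; 0.0 1.0; -1.0 -1.0]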
@@ -61,21 +61,21 @@ Fit a contrast encoing scheme on given data in `X`.
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$X_doc
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
-  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
-  contrast encoding scheme for each feature
-- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
-  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
-  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
+  contrast encoding scheme for each feature
+- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
+  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
+  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
+$ignore_doc
+$ordered_factor_doc
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function contrast_encoder_fit(
     X,
@@ -90,9 +90,10 @@ function contrast_encoder_fit(
     if mode isa Vector{Symbol}
         mode_is_vector = true
         ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
-        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+        length(features) == length(mode) ||
+            throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
     end
-
+
     # buildmatrix should be specified if mode is :contrast or :hypothesis
     if mode in (:contrast, :hypothesis)
         buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
@@ -105,11 +106,13 @@
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast
-            contrastmatrix = buildmatrix(name, k)
-            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+            contrastmatrix = buildmatrix(name, k)
+            size(contrastmatrix) == (k, k - 1) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
         elseif feat_mode == :hypothesis
-            hypothesismatrix = buildmatrix(name, k)
-            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            hypothesismatrix = buildmatrix(name, k)
+            size(hypothesismatrix) == (k - 1, k) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
             contrastmatrix = pinv(hypothesismatrix)
         elseif feat_mode == :dummy
             contrastmatrix = get_dummy_contrast(k)
@@ -125,7 +128,9 @@
             throw(ArgumentError("Mode $feat_mode is not supported."))
         end
 
-        vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+        vector_given_value_given_feature = OrderedDict(
+            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
+        )
         return vector_given_value_given_feature
     end
 
@@ -158,5 +163,10 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
 """
 function contrast_encoder_transform(X, cache::NamedTuple)
     vector_given_value_given_feature = cache.vector_given_value_given_feature
-    return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
-end
+    return generic_transform(
+        X,
+        vector_given_value_given_feature,
+        single_feat = false;
+        use_levelnames = true,
+    )
+end

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -86,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
   If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
   contrast encoding scheme for each feature
 - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
   where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
   hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$ignore_doc
+$ordered_factor_doc
 
 # Operations
 
@@ -120,7 +118,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples
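Since the MLJ interface files in this commit all share the same docstring skeleton, here is one hedged usage sketch of the workflow those docstrings describe. The constructor name `ContrastEncoder` and the example data are assumptions for illustration (the constructor is not visible in these hunks); only the machine/`fit!`/`transform`/`report` pattern is taken from the docstring itself.

    using MLJ, MLJTransforms
    using DataFrames, CategoricalArrays

    X = DataFrame(grade = categorical(["A", "B", "C", "A"]), n = [1.0, 2.0, 3.0, 4.0])

    model = ContrastEncoder(mode = :dummy)   # assumed constructor name; `mode` is a documented hyper-parameter
    mach = machine(model, X)                 # bind the unsupervised model to the data
    fit!(mach)                               # train
    Xnew = transform(mach, X)                # `grade` is replaced by k - 1 contrast columns
    report(mach).encoded_features            # the encoded feature subset reported above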

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 10 additions & 10 deletions

@@ -7,16 +7,16 @@ categorical features with their (normalized or raw) frequencies of occurrence in
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function frequency_encoder_fit(
     X,
@@ -39,11 +39,11 @@ function frequency_encoder_fit(
     # 2. Pass it to generic_fit
     statistic_given_feat_val, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,)
-
+        feature_mapper = feature_mapper)
+
     cache = (
-        statistic_given_feat_val = statistic_given_feat_val,
-        encoded_features = encoded_features,
+        statistic_given_feat_val = statistic_given_feat_val,
+        encoded_features = encoded_features,
     )
     return cache
 end
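For intuition, a small stand-alone sketch of the statistic this encoder fits; it mirrors the `normalize` option described above and is not the package's actual `feature_mapper`:

    using StatsBase: countmap

    col = ["a", "a", "b", "a", "c"]
    counts = countmap(col)                                          # Dict("a" => 3, "b" => 1, "c" => 1)
    raw        = Dict(level => Float32(cnt)               for (level, cnt) in counts)
    normalized = Dict(level => Float32(cnt / length(col)) for (level, cnt) in counts)
    # raw:        "a" => 3.0f0, "b" => 1.0f0, "c" => 1.0f0
    # normalized: "a" => 0.6f0, "b" => 0.2f0, "c" => 0.2f0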

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 6 additions & 8 deletions

@@ -86,18 +86,16 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or include the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+$features_doc
+$ignore_doc
+$ordered_factor_doc
+- ` normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 - `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
 
 # Operations
@@ -116,7 +114,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/missingness_encoding/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -90,17 +90,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
   dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
   signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
@@ -123,7 +121,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 6 additions & 7 deletions

@@ -9,21 +9,20 @@ types that are in `Char`, `AbstractString`, and `Number`.
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
-  `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
   dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
   signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
   then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
   and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function missingness_encoder_fit(
     X,
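To make the default `label_for_missing` behaviour concrete, a tiny illustrative sketch (not the package's code): a string-valued column has its `missing` entries replaced by the label registered for `AbstractString`:

    label_for_missing = Dict(AbstractString => "missing", Char => 'm')

    col = ["red", missing, "blue"]                    # raw element type subtypes AbstractString
    replacement = label_for_missing[AbstractString]   # "missing"
    encoded = map(x -> ismissing(x) ? replacement : x, col)
    # ["red", "missing", "blue"]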

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -82,17 +82,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
 
 # Operations
@@ -111,7 +109,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 8 additions & 9 deletions

@@ -5,14 +5,13 @@
 Fit an encoder to encode the levels of categorical variables in a given table as integers (ordered arbitrarily).
 
 # Arguments
-
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `dtype`: The numerical concrete type of the encoded features. Default is `Float32`.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
 - `encoded_features`: The subset of the categorical features of X that were encoded
@@ -37,10 +36,10 @@ function ordinal_encoder_fit(
     # 2. Pass it to generic_fit
     index_given_feat_level, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,)
+        feature_mapper = feature_mapper)
     cache = (
-        index_given_feat_level = index_given_feat_level,
-        encoded_features = encoded_features,
+        index_given_feat_level = index_given_feat_level,
+        encoded_features = encoded_features,
     )
     return cache
 end
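For intuition, a stand-alone sketch of the mapping this encoder fits; the level order is arbitrary, as the docstring notes, and the real implementation goes through `generic_fit` and honours `dtype`:

    col = ["low", "high", "medium", "low"]
    lvls = sort(unique(col))                              # one arbitrary but reproducible ordering
    index_given_level = Dict(l => Float32(i) for (i, l) in enumerate(lvls))
    encoded = [index_given_level[x] for x in col]         # Float32[2.0, 1.0, 3.0, 2.0]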

0 commit comments
