Skip to content

Commit e0478e4

Browse files
authored
Merge pull request #54 from JuliaAI/dev
For a 0.1.2 release
2 parents 8ed209c + 2a7a255 commit e0478e4

20 files changed

+258
-106
lines changed

.github/workflows/CI.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
arch:
3131
- x64
3232
steps:
33-
- uses: actions/checkout@v4
33+
- uses: actions/checkout@v5
3434
- uses: julia-actions/setup-julia@v2
3535
with:
3636
version: ${{ matrix.version }}

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJTransforms"
22
uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
33
authors = ["Essam <[email protected]> and contributors"]
4-
version = "0.1.1"
4+
version = "0.1.2"
55

66
[deps]
77
BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"

src/MLJTransforms.jl

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ using Tables
55
# https://github.com/JuliaAI/MLJBase.jl/issues/1002
66
import ScientificTypes: elscitype, schema, coerce, ScientificTimeType
77
using MLJModelInterface # exports `scitype`, which will call `ScientificTypes.scitype`,
8-
# once MLJBase is loaded (but this is not a dependency!)
8+
# once MLJBase is loaded (but this is not a dependency!)
99
using CategoricalArrays
1010
using TableOperations
1111
using StatsBase
@@ -29,27 +29,27 @@ include("utils.jl")
2929
include("encoders/target_encoding/errors.jl")
3030
include("encoders/target_encoding/target_encoding.jl")
3131
include("encoders/target_encoding/interface_mlj.jl")
32-
export TargetEncoder
32+
export TargetEncoder
3333

3434
# Ordinal encoding
3535
include("encoders/ordinal_encoding/ordinal_encoding.jl")
3636
include("encoders/ordinal_encoding/interface_mlj.jl")
37-
export OrdinalEncoder
37+
export OrdinalEncoder
3838

3939
# Frequency encoding
4040
include("encoders/frequency_encoding/frequency_encoding.jl")
4141
include("encoders/frequency_encoding/interface_mlj.jl")
4242
export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
43-
export FrequencyEncoder
43+
export FrequencyEncoder
4444

4545
# Cardinality reduction
4646
include("transformers/cardinality_reducer/cardinality_reducer.jl")
4747
include("transformers/cardinality_reducer/interface_mlj.jl")
4848
export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
49-
export CardinalityReducer
49+
export CardinalityReducer
5050
include("encoders/missingness_encoding/missingness_encoding.jl")
5151
include("encoders/missingness_encoding/interface_mlj.jl")
52-
export MissingnessEncoder
52+
export MissingnessEncoder
5353

5454
# Contrast encoder
5555
include("encoders/contrast_encoder/contrast_encoder.jl")
@@ -65,7 +65,6 @@ include("transformers/other_transformers/one_hot_encoder.jl")
6565
include("transformers/other_transformers/standardizer.jl")
6666
include("transformers/other_transformers/univariate_boxcox_transformer.jl")
6767
include("transformers/other_transformers/univariate_discretizer.jl")
68-
include("transformers/other_transformers/metadata_shared.jl")
6968

7069
export UnivariateDiscretizer,
7170
UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,

src/common_docs.jl

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
const X_doc = """
2-
- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
3-
`Multiclass` or `OrderedFactor`
2+
- X: A table where the elements of the categorical features have
3+
[scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or
4+
`OrderedFactor`
45
"""
56
const X_doc_mlj = """
67
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
7-
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
8-
check scitypes.
8+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
9+
check scitypes.
910
"""
1011
const features_doc = """
11-
- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
12-
according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
13-
or a callable that returns true for features to be included/excluded
12+
- features=[]: A list of names of categorical features given as symbols to exclude or
13+
include from encoding, according to the value of `ignore`, or a single symbol (which is
14+
treated as a vector with one symbol), or a callable that returns true for features to be
15+
included/excluded.
1416
"""
1517
const ignore_doc = """
1618
- ignore=true: Whether to exclude or include the features given in `features`
@@ -24,4 +26,3 @@ const encoded_features_doc = """
2426
const cache_doc = """
2527
- `cache`: The output of `contrast_encoder_fit`
2628
"""
27-

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ MMI.metadata_model(
7373
"""
7474
$(MMI.doc_header(ContrastEncoder))
7575
76-
`ContrastEncoder` implements the following contrast encoding methods for
77-
categorical features: dummy, sum, backward/forward difference, and Helmert coding.
78-
More generally, users can specify a custom contrast or hypothesis matrix, and each feature
79-
can be encoded using a different method.
76+
`ContrastEncoder` implements the following contrast encoding methods for categorical
77+
features: dummy, sum, backward/forward difference, and Helmert coding. More generally,
78+
users can specify a custom contrast or hypothesis matrix, and each feature can be encoded
79+
using a different method.
8080
8181
# Training data
8282
@@ -93,26 +93,36 @@ Train the machine using `fit!(mach, rows=...)`.
9393
# Hyper-parameters
9494
9595
$features_doc
96-
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
97-
If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
98-
contrast encoding scheme for each feature
99-
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
100-
where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
101-
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
96+
97+
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`,
98+
`:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. If `ignore=false`
99+
(features to be encoded are listed explicitly in `features`), then this can be a vector
100+
of the same length as `features` to specify a different contrast encoding scheme for
101+
each feature
102+
103+
- `buildmatrix=nothing`: A function or other callable with signature
104+
`buildmatrix(colname,k)`, where `colname` is the name of the feature levels and `k` is
105+
it's length, and which returns contrast or hypothesis matrix with row/column ordering
106+
consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or
107+
`:hypothesis`.
108+
102109
$ignore_doc
110+
103111
$ordered_factor_doc
104112
105113
# Operations
106114
107-
- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and
108-
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
109-
are always left unchanged.
115+
- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or
116+
`OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
117+
table. Features that are neither `Multiclass` nor `OrderedFactor` are always left
118+
unchanged.
110119
111120
# Fitted parameters
112121
113122
The fields of `fitted_params(mach)` are:
114123
115-
- `vector_given_value_given_feature`: A dictionary that maps each level for each column in a subset of the categorical features of X into its frequency.
124+
- `vector_given_value_given_feature`: A dictionary that maps each level for each column in
125+
a subset of the categorical features of X into its frequency.
116126
117127
# Report
118128
@@ -138,7 +148,7 @@ schema(X)
138148
139149
encoder = ContrastEncoder(
140150
features = [:name, :favnum],
141-
ignore = false,
151+
ignore = false,
142152
mode = [:dummy, :helmert],
143153
)
144154
mach = fit!(machine(encoder, X))
@@ -157,4 +167,4 @@ julia > Xnew
157167
See also
158168
[`OneHotEncoder`](@ref)
159169
"""
160-
ContrastEncoder
170+
ContrastEncoder

src/encoders/missingness_encoding/interface_mlj.jl

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,10 @@ MMI.metadata_model(
7777
"""
7878
$(MMI.doc_header(MissingnessEncoder))
7979
80-
`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing").
81-
By this, missingness will be treated as a new
82-
level by any subsequent model. This assumes that the categorical features have raw
83-
types that are in `Char`, `AbstractString`, and `Number`.
80+
`MissingnessEncoder` maps any missing level of a categorical feature into a new level
81+
(e.g., "Missing"). By this, missingness will be treated as a new level by any subsequent
82+
model. This assumes that the categorical features have raw types that are in `Char`,
83+
`AbstractString`, and `Number`.
8484
8585
# Training data
8686
@@ -97,25 +97,32 @@ Train the machine using `fit!(mach, rows=...)`.
9797
# Hyper-parameters
9898
9999
$features_doc
100+
100101
$ignore_doc
102+
101103
$ordered_factor_doc
102-
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
103-
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
104-
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
105-
then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
106-
and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
104+
105+
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char =>
106+
'm', )`: A dictionary where the possible values for keys are the types in `Char`,
107+
`AbstractString`, and `Number` and where each value signifies the new level to map into
108+
given a column raw super type. By default, if the raw type of the column subtypes
109+
`AbstractString` then missing values will be replaced with `"missing"` and if the raw
110+
type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number`
111+
then the new value is the lowest value in the column - 1.
107112
108113
# Operations
109114
110-
- `transform(mach, Xnew)`: Apply cardinality reduction to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
111-
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
112-
are always left unchanged.
115+
- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or
116+
`OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
117+
table. Features that are neither `Multiclass` nor `OrderedFactor` are always left
118+
unchanged.
113119
114120
# Fitted parameters
115121
116122
The fields of `fitted_params(mach)` are:
117123
118-
- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
124+
- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing`
125+
into some value according to `label_for_missing`
119126
120127
# Report
121128
@@ -154,4 +161,4 @@ julia> Xnew
154161
See also
155162
[`CardinalityReducer`](@ref)
156163
"""
157-
MissingnessEncoder
164+
MissingnessEncoder

src/encoders/target_encoding/interface_mlj.jl

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ struct TargetEncoderResult{
4949
} <: MMI.MLJType
5050
# target statistic for each level of each categorical feature
5151
y_stat_given_feat_level::Dict{A, A}
52-
task::S # "Regression", "Classification"
52+
task::S # "Regression", "Classification"
5353
num_classes::I # num_classes in case of classification
5454
y_classes::A # y_classes in case of classification
5555

@@ -120,7 +120,7 @@ MMI.target_in_fit(::Type{<:TargetEncoder}) = true
120120
"""
121121
$(MMI.doc_header(TargetEncoder))
122122
123-
`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables
123+
`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables
124124
into continuous ones using statistics from the target variable.
125125
126126
# Training data
@@ -133,34 +133,42 @@ Here:
133133
134134
$X_doc_mlj
135135
136-
- `y` is the target, which can be any `AbstractVector` whose element
137-
scitype is `Continuous` or `Count` for regression problems and
138-
`Multiclass` or `OrderedFactor` for classification problems; check the scitype with `schema(y)`
136+
- `y` is the target, which can be any `AbstractVector` whose element scitype is
137+
`Continuous` or `Count` for regression problems and `Multiclass` or `OrderedFactor` for
138+
classification problems; check the scitype with `schema(y)`
139139
140140
Train the machine using `fit!(mach, rows=...)`.
141141
142142
# Hyper-parameters
143143
144144
$features_doc
145+
145146
$ignore_doc
147+
146148
$ordered_factor_doc
147-
- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as described in [1]
148-
- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto` then m will be computed using
149-
empirical Bayes estimation as described in [1]
149+
150+
- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as
151+
described in [1]
152+
153+
- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto`
154+
then m will be computed using empirical Bayes estimation as described in [1]
150155
151156
# Operations
152157
153-
- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and
154-
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
155-
are always left unchanged.
158+
- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or
159+
`OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
160+
table. Features that are neither `Multiclass` nor `OrderedFactor` are always left
161+
unchanged.
156162
157163
# Fitted parameters
158164
159165
The fields of `fitted_params(mach)` are:
160166
161167
- `task`: Whether the task is `Classification` or `Regression`
162-
- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode each categorical feature. It maps each
163-
level in each categorical feature to a statistic computed over the target.
168+
169+
- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode
170+
each categorical feature. It maps each level in each categorical feature to a statistic
171+
computed over the target.
164172
165173
# Report
166174
@@ -174,13 +182,13 @@ $encoded_features_doc
174182
using MLJ
175183
176184
# Define categorical features
177-
A = ["g", "b", "g", "r", "r",]
185+
A = ["g", "b", "g", "r", "r",]
178186
B = [1.0, 2.0, 3.0, 4.0, 5.0,]
179-
C = ["f", "f", "f", "m", "f",]
187+
C = ["f", "f", "f", "m", "f",]
180188
D = [true, false, true, false, true,]
181189
E = [1, 2, 3, 4, 5,]
182190
183-
# Define the target variable
191+
# Define the target variable
184192
y = ["c1", "c2", "c3", "c1", "c2",]
185193
186194
# Combine into a named tuple
@@ -219,11 +227,11 @@ julia > schema(Xnew)
219227
```
220228
221229
# Reference
222-
[1] Micci-Barreca, Daniele.
223-
“A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems”
230+
[1] Micci-Barreca, Daniele.
231+
“A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems”
224232
SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32.
225233
226234
See also
227235
[`OneHotEncoder`](@ref)
228236
"""
229-
TargetEncoder
237+
TargetEncoder

src/generic.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ function generic_fit(X,
5959
# 4. Use feature mapper to compute the mapping of each level in each column
6060
encoded_features = Symbol[]# to store column that were actually encoded
6161
for feat_name in feat_names
62-
feat_col = Tables.getcolumn(X, feat_name)
62+
feat_col = MMI.selectcols(X, feat_name)
6363
feat_type = elscitype(feat_col)
6464
feat_has_allowed_type =
6565
feat_type <: Union{Missing, Multiclass} ||
@@ -149,7 +149,7 @@ function generic_transform(
149149
new_feat_names = Symbol[]
150150
new_cols = []
151151
for feat_name in feat_names
152-
col = Tables.getcolumn(X, feat_name)
152+
col = MMI.selectcols(X, feat_name)
153153
# Create the transformation function for each column
154154
if feat_name in keys(mapping_per_feat_level)
155155
if !ignore_unknown

0 commit comments

Comments
 (0)