Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fffb34c
✨ Fix the union types
EssamWisam Feb 3, 2025
14a5671
Merge pull request #23 from JuliaAI/fix-union-types
EssamWisam Feb 7, 2025
7b577d7
✨ Fix frequency encoder output types
EssamWisam May 12, 2025
f4ae7bd
✨ Fix target encoder output types
EssamWisam May 12, 2025
e9a0c44
✨ Fix frequency encoder output types
EssamWisam May 12, 2025
6c4589b
✨ Fix cardinality reducer output types
EssamWisam May 12, 2025
5808f8e
✨ Fix missingness encoder output types
EssamWisam May 12, 2025
bab7664
✨ Add output type test for contrast encoder
EssamWisam May 12, 2025
a815b76
✨ Support for using level names instead of indices for new columns in…
EssamWisam May 13, 2025
dd22fe5
✨ Class names to be used as level names for target encoding
EssamWisam May 13, 2025
2f7bebb
✨ Contrast encoding should use level names as it generates columns
EssamWisam May 13, 2025
16758b4
✨ Add support of output type to frequency encoder
EssamWisam May 14, 2025
83f48bf
Update src/generic.jl
EssamWisam May 15, 2025
19c06ed
Update src/generic.jl
EssamWisam May 15, 2025
5a4c5d6
Update src/generic.jl
EssamWisam May 15, 2025
a8b3296
Update src/generic.jl
EssamWisam May 15, 2025
d9d28ed
Merge pull request #24 from JuliaAI/fix-encoder-output-types
EssamWisam May 15, 2025
c401f3b
Merge branch 'main' into use-level-names-for-categories
EssamWisam May 15, 2025
98893ea
Merge pull request #25 from JuliaAI/use-level-names-for-categories
EssamWisam May 16, 2025
7f12234
✨ Add callable features and better error testing
EssamWisam May 16, 2025
0075af7
Update src/generic.jl
EssamWisam May 18, 2025
5272758
Update src/generic.jl
EssamWisam May 18, 2025
c784e50
✨ Attempt to fix broken test
EssamWisam May 25, 2025
194c53e
👨‍🔧 Fix types for features
EssamWisam May 25, 2025
5e0af90
✨ Better callable features logic
EssamWisam May 25, 2025
d0c67ac
Merge branch 'main' into add-callable-and-better-errors
EssamWisam May 27, 2025
52ba39d
✨ Add support for single vector
EssamWisam May 27, 2025
08d973f
Merge pull request #26 from JuliaAI/add-callable-and-better-errors
EssamWisam May 27, 2025
5358dce
✅ Cache is now a named tuple across all methods
EssamWisam May 27, 2025
4744103
✨ Better docstring redundance
EssamWisam May 27, 2025
6fa6eeb
Merge pull request #27 from JuliaAI/named-tuple-for-cache-better-doc-…
ablaom May 31, 2025
2b85d13
add missing compats; dump support for julia < 1.10
ablaom Jun 18, 2025
850181e
rm duplicate compat; add LinearAlgebra compat
ablaom Jun 18, 2025
dfeefe1
Merge pull request #28 from JuliaAI/compat-cleanup
ablaom Jun 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ jobs:
matrix:
version:
- '1.10'
- '1.6'
- 'nightly'
- '1'
os:
- ubuntu-latest
arch:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ meh/*.ipynb
.DS_Store
/*.jl
scratchpad/
examples/test.jl
12 changes: 10 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MLJTransforms"
uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
authors = ["Essam <[email protected]> and contributors"]
version = "1.0.0-DEV"
version = "0.1.0"

[deps]
BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
Expand All @@ -20,13 +20,21 @@ TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
BitBasis = "0.9"
CategoricalArrays = "0.10"
MLJModelInterface = "1.11"
Combinatorics = "1"
Dates = "1"
Distributions = "0.25"
LinearAlgebra = "1"
OrderedCollections = "1"
Parameters = "0.12"
ScientificTypes = "3.0"
Statistics = "1"
StatsBase = "0.34"
TableOperations = "1.2"
Tables = "1.11"
julia = "1.6.7"
julia = "1.10"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Expand Down
3 changes: 2 additions & 1 deletion src/MLJTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using MLJModelInterface
using TableOperations
using StatsBase
using LinearAlgebra

using OrderedCollections: OrderedDict
# Other transformers
using Combinatorics
import Distributions
Expand All @@ -19,6 +19,7 @@ using OrderedCollections
const MMI = MLJModelInterface

# Functions of generic use across transformers
include("common_docs.jl")
include("generic.jl")
include("utils.jl")

Expand Down
27 changes: 27 additions & 0 deletions src/common_docs.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Shared docstring fragments.
#
# Each constant below is a Markdown bullet that is spliced into encoder docstrings via
# string interpolation (e.g. `$X_doc`, `$features_doc`), so that an argument is described
# in exactly one place. Argument names are wrapped in backticks so the fragments render
# the same way as the hand-written bullets they sit next to (e.g. `mode`, `buildmatrix`).

# Input table, as worded for the plain `*_fit` functions.
const X_doc = """
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
"""

# Input table, as worded for the MLJ model docstrings.
const X_doc_mlj = """
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
"""

# Feature selection: a vector of symbols, a single symbol, or a callable predicate.
const features_doc = """
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding,
according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
or a callable that returns true for features to be included/excluded
"""

# Whether `features` is read as an exclusion list or an inclusion list.
const ignore_doc = """
- `ignore=true`: Whether to exclude or include the features given in `features`
"""

const ordered_factor_doc = """
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
"""

# Entry describing the `encoded_features` field returned by fitting.
const encoded_features_doc = """
- `encoded_features`: The subset of the categorical features of `X` that were encoded
"""

# NOTE(review): this fragment names `contrast_encoder_fit` explicitly; confirm it is only
# interpolated into the contrast encoder's docstrings before reusing it elsewhere.
const cache_doc = """
- `cache`: The output of `contrast_encoder_fit`
"""

69 changes: 39 additions & 30 deletions src/encoders/contrast_encoder/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
"""
### 1. Dummy Coding
function get_dummy_contrast(k)
return Matrix(1.0I, k, k-1)
return Matrix(1.0I, k, k - 1)
end


### 2. Sum Coding
function get_sum_contrast(k)
C = Matrix(1.0I, k, k-1)
C = Matrix(1.0I, k, k - 1)
C[end, :] .= -1.0
return C
end
Expand All @@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
vec = ones(length) .* index / length

# [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
vec[1:index] .= index/length - 1
vec[1:index] .= index / length - 1
return vec
end
function get_backward_diff_contrast(k)
Expand Down Expand Up @@ -61,25 +61,25 @@ Fit a contrast encoding scheme on given data in `X`.

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
$X_doc
$features_doc
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
- `ignore=true`: Whether to exclude or includes the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
$ignore_doc
$ordered_factor_doc

# Returns (in a dict)
# Returns as a named-tuple

- `vector_given_value_given_feature`: Maps each level for each column in the selected categorical features to a vector
- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc
"""
function contrast_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
buildmatrix = nothing,
ignore::Bool = true,
Expand All @@ -90,9 +90,10 @@ function contrast_encoder_fit(
if mode isa Vector{Symbol}
mode_is_vector = true
ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
length(features) == length(mode) ||
throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
end

# buildmatrix should be specified if mode is :contrast or :hypothesis
if mode in (:contrast, :hypothesis)
buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
Expand All @@ -105,11 +106,13 @@ function contrast_encoder_fit(
k = length(feat_levels)
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
if feat_mode == :contrast
contrastmatrix = buildmatrix(name, k)
size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
contrastmatrix = buildmatrix(name, k)
size(contrastmatrix) == (k, k - 1) ||
throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
elseif feat_mode == :hypothesis
hypothesismatrix = buildmatrix(name, k)
size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
hypothesismatrix = buildmatrix(name, k)
size(hypothesismatrix) == (k - 1, k) ||
throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
contrastmatrix = pinv(hypothesismatrix)
elseif feat_mode == :dummy
contrastmatrix = get_dummy_contrast(k)
Expand All @@ -125,7 +128,9 @@ function contrast_encoder_fit(
throw(ArgumentError("Mode $feat_mode is not supported."))
end

vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
vector_given_value_given_feature = OrderedDict(
level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
)
return vector_given_value_given_feature
end

Expand All @@ -134,10 +139,9 @@ function contrast_encoder_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)

cache = Dict(
:vector_given_value_given_feature => vector_given_value_given_feature,
:encoded_features => encoded_features,
cache = (
vector_given_value_given_feature = vector_given_value_given_feature,
encoded_features = encoded_features,
)

return cache
Expand All @@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia

- `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
"""
function contrast_encoder_transform(X, cache::Dict)
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
end
function contrast_encoder_transform(X, cache::NamedTuple)
vector_given_value_given_feature = cache.vector_given_value_given_feature
return generic_transform(
X,
vector_given_value_given_feature,
single_feat = false;
use_levelnames = true,
)
end
39 changes: 18 additions & 21 deletions src/encoders/contrast_encoder/interface_mlj.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
### ContrastEncoding with MLJ Interface

# 1. Interface Struct
mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised
features::A1
ignore::Bool
mode::Union{Symbol, AS}
buildmatrix::Any
mode:: ASS
buildmatrix::A2
ordered_factor::Bool
end;

Expand Down Expand Up @@ -36,19 +36,18 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
buildmatrix = transformer.buildmatrix,
ordered_factor = transformer.ordered_factor,
)
fitresult = generic_cache[:vector_given_value_given_feature]
fitresult = generic_cache.vector_given_value_given_feature

report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
cache = nothing
return fitresult, cache, report
end;


# 6. Transform method
function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
generic_cache = Dict(
:vector_given_value_given_feature =>
fitresult,
generic_cache = (
vector_given_value_given_feature = fitresult,
)
Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
return Xnew_transf
Expand Down Expand Up @@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with

Here:

- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
$X_doc_mlj

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
$features_doc
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
- `ignore=true`: Whether to exclude or includes the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
$ignore_doc
$ordered_factor_doc

# Operations

Expand All @@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are:

The fields of `report(mach)` are:

- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc

# Examples

Expand All @@ -148,12 +145,12 @@ mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)

julia> Xnew
(name_1 = [1.0, 0.0, 0.0, 0.0],
name_2 = [0.0, 1.0, 0.0, 1.0],
(name_John = [1.0, 0.0, 0.0, 0.0],
name_Mary = [0.0, 1.0, 0.0, 1.0],
height = [1.85, 1.67, 1.5, 1.67],
favnum_1 = [0.0, 1.0, 0.0, -1.0],
favnum_2 = [2.0, -1.0, 0.0, -1.0],
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
favnum_5 = [0.0, 1.0, 0.0, -1.0],
favnum_7 = [2.0, -1.0, 0.0, -1.0],
favnum_10 = [-1.0, -1.0, 3.0, -1.0],
age = [23, 23, 14, 23],)
```

Expand Down
34 changes: 19 additions & 15 deletions src/encoders/frequency_encoding/frequency_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,43 @@ categorical features with their (normalized or raw) frequencies of occurrence in

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or includes the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
$X_doc
$features_doc
$ignore_doc
$ordered_factor_doc
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.

# Returns (in a dict)
# Returns as a named-tuple

- `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc
"""
function frequency_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
normalize::Bool = false,
output_type::Type = Float32,
)
# 1. Define feature mapper
function feature_mapper(col, name)
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
feat_levels = levels(col)
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
level => frequency_map[level] for level in feat_levels
)
return statistic_given_feat_val
end

# 2. Pass it to generic_fit
statistic_given_feat_val, encoded_features = generic_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)
cache = Dict(
:statistic_given_feat_val => statistic_given_feat_val,
:encoded_features => encoded_features,
feature_mapper = feature_mapper)

cache = (
statistic_given_feat_val = statistic_given_feat_val,
encoded_features = encoded_features,
)
return cache
end
Expand All @@ -58,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali

- `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
"""
function frequency_encoder_transform(X, cache::Dict)
statistic_given_feat_val = cache[:statistic_given_feat_val]
function frequency_encoder_transform(X, cache::NamedTuple)
statistic_given_feat_val = cache.statistic_given_feat_val
return generic_transform(X, statistic_given_feat_val)
end
Loading
Loading