1 change: 0 additions & 1 deletion .github/workflows/CI.yml
@@ -24,7 +24,6 @@ jobs:
matrix:
version:
- '1.10'
- '1.6'
- 'nightly'
os:
- ubuntu-latest
2 changes: 1 addition & 1 deletion Project.toml
@@ -26,7 +26,7 @@ ScientificTypes = "3.0"
StatsBase = "0.34"
TableOperations = "1.2"
Tables = "1.11"
julia = "1.6.7"
julia = "1.10"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
2 changes: 1 addition & 1 deletion src/encoders/contrast_encoder/contrast_encoder.jl
@@ -79,7 +79,7 @@ Fit a contrast encoding scheme on given data in `X`.
"""
function contrast_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
buildmatrix = nothing,
ignore::Bool = true,
8 changes: 4 additions & 4 deletions src/encoders/contrast_encoder/interface_mlj.jl
@@ -1,11 +1,11 @@
### ContrastEncoding with MLJ Interface

# 1. Interface Struct
mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised
features::A1
ignore::Bool
mode::Union{Symbol, AS}
buildmatrix::Any
mode:: ASS
buildmatrix::A2
ordered_factor::Bool
end;

2 changes: 1 addition & 1 deletion src/encoders/frequency_encoding/frequency_encoding.jl
@@ -20,7 +20,7 @@ categorical features with their (normalized or raw) frequencies of occurrence in
"""
function frequency_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
normalize::Bool = false,
4 changes: 2 additions & 2 deletions src/encoders/frequency_encoding/interface_mlj.jl
@@ -1,8 +1,8 @@
### FrequencyEncoding with MLJ Interface

# 1. Interface Struct
mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
mutable struct FrequencyEncoder{A <: Any} <: Unsupervised
features::A
ignore::Bool
ordered_factor::Bool
normalize::Bool
3 changes: 1 addition & 2 deletions src/encoders/missingness_encoding/interface_mlj.jl
@@ -2,11 +2,10 @@

# 1. Interface Struct
mutable struct MissingnessEncoder{
AS <: AbstractVector{Symbol},
T <: Type,
A <: Any,
} <: Unsupervised
features::AS
features::A
ignore::Bool
ordered_factor::Bool
label_for_missing::Dict{T, A}
2 changes: 1 addition & 1 deletion src/encoders/missingness_encoding/missingness_encoding.jl
@@ -27,7 +27,7 @@ types that are in `Char`, `AbstractString`, and `Number`.
"""
function missingness_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
label_for_missing::Dict{<:Type, <:Any} = Dict(
4 changes: 2 additions & 2 deletions src/encoders/ordinal_encoding/interface_mlj.jl
@@ -1,8 +1,8 @@
### OrdinalEncoding with MLJ Interface

# 1. Interface Struct
mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
mutable struct OrdinalEncoder{A <: Any} <: Unsupervised
features::A
ignore::Bool
ordered_factor::Bool
output_type::Type
2 changes: 1 addition & 1 deletion src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -18,7 +18,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
"""
function ordinal_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
output_type::Type = Float32,
6 changes: 3 additions & 3 deletions src/encoders/target_encoding/interface_mlj.jl
@@ -1,9 +1,9 @@
### TargetEncoding with MLJ Interface

# 1. Interface Struct
mutable struct TargetEncoder{R1 <: Real, R2 <: Real, AS <: AbstractVector{Symbol}} <:
mutable struct TargetEncoder{R1 <: Real, R2 <: Real, A <: Any} <:
Unsupervised
features::AS
features::A
ignore::Bool
ordered_factor::Bool
lambda::R1
@@ -45,7 +45,7 @@ end
struct TargetEncoderResult{
I <: Integer,
S <: AbstractString,
A <: Any # Useless but likely can't do much better
A <: Any, # Useless but likely can't do much better
} <: MMI.MLJType
# target statistic for each level of each categorical feature
y_stat_given_feat_level::Dict{A, A}
2 changes: 1 addition & 1 deletion src/encoders/target_encoding/target_encoding.jl
@@ -132,7 +132,7 @@ Fit a target encoder on table X with target y by computing the necessary statist
function target_encoder_fit(
X,
y::AbstractVector,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
lambda::Real = 1.0,
35 changes: 26 additions & 9 deletions src/generic.jl
@@ -13,21 +13,23 @@ logic?

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or includes the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `feature_mapper`: Defined above.
- features=[]: A list of names of categorical features, given as symbols, to exclude or include from encoding
according to the value of `ignore`; or a single symbol (treated as a one-element vector);
or a callable that returns true for the features to be included/excluded
- ignore=true: Whether to exclude or include the features given in features
- ordered_factor=false: Whether to encode OrderedFactor or ignore them
- feature_mapper: Defined above.

# Returns

- `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of
- mapping_per_feat_level: Maps each level for each feature in a subset of the categorical features of
X into a scalar or a vector.
- `encoded_features`: The subset of the categorical features of X that were encoded
- encoded_features: The subset of the categorical features of X that were encoded
"""
function generic_fit(X,
features::AbstractVector{Symbol} = Symbol[],
features = Symbol[],
args...;
ignore::Bool = true,
ordered_factor::Bool = false,
@@ -38,7 +40,22 @@ function generic_fit(X,
feat_names = Tables.schema(X).names

#2. Modify column_names based on features
feat_names = (ignore) ? setdiff(feat_names, features) : intersect(feat_names, features)
if features isa Symbol
features = [features]
end

if features isa AbstractVector{Symbol}
# Original behavior for vector of symbols
feat_names =
(ignore) ? setdiff(feat_names, features) : intersect(feat_names, features)
else
# If features is a callable, apply it to each feature name
if ignore
feat_names = filter(name -> !features(name), feat_names)
else
feat_names = filter(features, feat_names)
end
end

Member:

In Julia it is unfortunately difficult to recognise callability of an object (at least the last time I researched this). So, reverse your logic here: if the feature names are a vector of symbols then do X; otherwise do Y.

Member:

Here's an example of a callable that is not a function:

struct Foo end
(::Foo)(x) = 2x

f = Foo()
f(4) # 8

f isa Function # false

Collaborator (Author):

Done in 5e0af90

# 3. Define mapping per column per level dictionary
mapping_per_feat_level = Dict()
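
For illustration, here is a minimal sketch of how the more flexible `features` argument introduced in `generic_fit` above could be used through one of the encoder fit functions. The table, the column names, and the use of `frequency_encoder_fit` with these particular arguments are assumptions made for this sketch only, not part of the diff:

using CategoricalArrays
using MLJTransforms: frequency_encoder_fit

# Hypothetical table with two categorical columns and one continuous column
X = (
    color  = categorical(["red", "green", "red", "blue"]),
    size   = categorical(["S", "M", "M", "L"]),
    height = [1.2, 3.4, 2.2, 5.1],
)

# Vector of symbols (original behaviour): with ignore = true, :color is excluded from encoding
cache_vec = frequency_encoder_fit(X, [:color]; ignore = true)

# Single symbol: treated as a one-element vector, so this is equivalent to the call above
cache_sym = frequency_encoder_fit(X, :color; ignore = true)

# Callable: with ignore = false, only the features for which the predicate returns true are encoded
cache_fun = frequency_encoder_fit(X, name -> name == :color; ignore = false)
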
src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -32,7 +32,7 @@ types that are in `Char`, `AbstractString`, and `Number`.
"""
function cardinality_reducer_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
min_frequency::Real = 3,
3 changes: 1 addition & 2 deletions src/transformers/cardinality_reducer/interface_mlj.jl
@@ -2,12 +2,11 @@

# 1. Interface Struct
mutable struct CardinalityReducer{
AS <: AbstractVector{Symbol},
R <: Real,
T <: Type,
A <: Any,
} <: Unsupervised
features::AS
features::A
ignore::Bool
ordered_factor::Bool
min_frequency::R
25 changes: 19 additions & 6 deletions test/encoders/contrast_encoder.jl
@@ -9,7 +9,6 @@ age = [23, 23, 14, 23])


@testset "Contrast Encoder Error Handling" begin

# Example definitions to allow the test to run
function dummy_buildmatrix(colname, k)
# Simple dummy function to generate a matrix of correct size
@@ -23,21 +22,35 @@
)

# Test IGNORE_MUST_FALSE_VEC_MODE error
@test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true)
@test_throws MLJTransforms.IGNORE_MUST_FALSE_VEC_MODE begin
contrast_encoder_fit(data, [:A], mode=[:contrast], ignore=true)
end

# Test LENGTH_MISMATCH_VEC_MODE error
@test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false)
@test_throws MLJTransforms.LENGTH_MISMATCH_VEC_MODE(2, 1) begin
contrast_encoder_fit(data, [:A], mode=[:contrast, :dummy], buildmatrix=dummy_buildmatrix, ignore=false)
end

# Test BUILDFUNC_MUST_BE_SPECIFIED error
@test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false)
@test_throws MLJTransforms.BUILDFUNC_MUST_BE_SPECIFIED begin
contrast_encoder_fit(data, [:A], mode=:contrast, ignore=false)
end

# Test MATRIX_SIZE_ERROR
wrong_buildmatrix = (levels, k) -> randn(k, k) # Incorrect dimensions
@test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false)
k = 3 # Number of levels in data[:A]
wrong_size = (k, k)
@test_throws MLJTransforms.MATRIX_SIZE_ERROR(k, wrong_size, :A) begin
contrast_encoder_fit(data, [:A], mode=:contrast, buildmatrix=wrong_buildmatrix, ignore=false)
end

# Test MATRIX_SIZE_ERROR_HYP
wrong_buildmatrix_hyp = (levels, k) -> randn(k, k+1) # Incorrect dimensions for hypothesis matrix
@test_throws ArgumentError contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false)
wrong_size_hyp = (k, k+1)
@test_throws MLJTransforms.MATRIX_SIZE_ERROR_HYP(k, wrong_size_hyp, :A) begin
contrast_encoder_fit(data, [:A], mode=:hypothesis, buildmatrix=wrong_buildmatrix_hyp, ignore=false)
end

end

@testset "Dummy Coding Tests" begin
13 changes: 9 additions & 6 deletions test/encoders/missingness_encoding.jl
@@ -1,21 +1,26 @@
using MLJTransforms: missingness_encoder_fit, missingness_encoder_transform

@testset "Throws errors when needed" begin
@test_throws ArgumentError begin
@testset "Missingness Encoder Error Handling" begin
# Test COLLISION_NEW_VAL_ME error - when label_for_missing value already exists in levels
@test_throws MLJTransforms.COLLISION_NEW_VAL_ME("missing") begin
X = generate_X_with_missingness(;john_name="missing")
cache = missingness_encoder_fit(
X;
label_for_missing = Dict(AbstractString => "missing", Char => 'm'),
)
end
@test_throws ArgumentError begin

# Test VALID_TYPES_NEW_VAL_ME error - when label_for_missing key is not a supported type
@test_throws MLJTransforms.VALID_TYPES_NEW_VAL_ME(Bool) begin
X = generate_X_with_missingness()
cache = missingness_encoder_fit(
X;
label_for_missing = Dict(AbstractString => "Other", Bool => 'X'),
)
end
@test_throws ArgumentError begin

# Test UNSPECIFIED_COL_TYPE_ME error - when column type isn't in label_for_missing
@test_throws MLJTransforms.UNSPECIFIED_COL_TYPE_ME(Char, Dict(AbstractString => "X")) begin
X = generate_X_with_missingness()
cache = missingness_encoder_fit(
X;
14 changes: 8 additions & 6 deletions test/encoders/target_encoding.jl
@@ -332,13 +332,15 @@ end
@test fitresult.task == generic_cache[:task]

# Test invalid `m`
@test_throws ArgumentError begin
t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = -5)
invalid_m = -5
@test_throws MLJTransforms.NON_NEGATIVE_m(invalid_m) begin
t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 0.5, m = invalid_m)
end

# Test invalid `lambda`
@test_throws ArgumentError begin
t = TargetEncoder(ignore = true, ordered_factor = false, lambda = 1.1, m = 1)

# Test invalid `lambda` (value > 1)
invalid_lambda = 1.1
@test_throws MLJTransforms.INVALID_lambda(invalid_lambda) begin
t = TargetEncoder(ignore = true, ordered_factor = false, lambda = invalid_lambda, m = 1)
end

# Test report
39 changes: 38 additions & 1 deletion test/generic.jl
@@ -62,7 +62,7 @@ end
# Dummy encoder that maps each level to its hash (some arbitrary function)
function dummy_encoder_fit(
X,
features::AbstractVector{Symbol} = Symbol[];
features = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
)
Expand All @@ -81,6 +81,7 @@ function dummy_encoder_fit(
)
cache = Dict(
:hash_given_feat_val => hash_given_feat_val,
:encoded => encoded_features,
)
return cache
end
@@ -161,4 +162,40 @@ end
F = [enc(:F, X[:F][i]) for i in 1:10]
)
@test X_tr == target
end

@testset "Callable feature functionality tests" begin
X = dataset_forms[1]
feat_names = Tables.schema(X).names

# Define a predicate: include only columns with name in uppercase list [:A, :C, :E]
predicate = name -> name in [:A, :C, :E]

# Test 1: ignore=true should exclude predicate columns
cache1 = dummy_encoder_fit(X, predicate; ignore=true, ordered_factor=false)
@test !(:A in cache1[:encoded]) && !(:C in cache1[:encoded]) && !(:E in cache1[:encoded])

# Test 2: ignore=false should include only predicate columns (:E is an OrderedFactor, so it is skipped while ordered_factor=false)
cache2 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=false)
@test Set(cache2[:encoded]) == Set([:A, :C])

# Test 3: predicate with ordered_factor=true picks up ordered factors (e.g., :E)
cache3 = dummy_encoder_fit(X, predicate; ignore=false, ordered_factor=true)
@test Set(cache3[:encoded]) == Set([:A, :C, :E])
end

@testset "Single Symbol and list of one symbol equivalence" begin
X = dataset_forms[1]
feat_names = Tables.schema(X).names

# Test 1: Single Symbol
single_symbol = :A
cache1 = dummy_encoder_fit(X, single_symbol; ignore=true, ordered_factor=false)
@test !(:A in cache1[:encoded])
# Test 2: List of one symbol
single_symbol_list = [:A]
cache2 = dummy_encoder_fit(X, single_symbol_list; ignore=true, ordered_factor=false)
@test !(:A in cache2[:encoded])
# Test 3: Both should yield the same result
@test cache1[:encoded] == cache2[:encoded]
end