Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MLJTransforms"
uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
authors = ["Essam <[email protected]> and contributors"]
version = "0.1.0"
version = "0.1.1"

[deps]
BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
Expand All @@ -14,6 +14,7 @@ MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
Expand All @@ -22,14 +23,15 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
[compat]
BitBasis = "0.9"
CategoricalArrays = "0.10"
MLJModelInterface = "1.11"
Combinatorics = "1"
Dates = "1"
Distributions = "0.25"
LinearAlgebra = "1"
MLJModelInterface = "1.11"
OrderedCollections = "1"
Parameters = "0.12"
ScientificTypes = "3.0"
ScientificTypes = "3.1.0"
ScientificTypesBase = "3.0.0"
Statistics = "1"
StatsBase = "0.34"
TableOperations = "1.2"
Expand All @@ -38,11 +40,11 @@ julia = "1.10"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]
10 changes: 6 additions & 4 deletions src/MLJTransforms.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
module MLJTransforms
using Tables
using ScientificTypes
using ScientificTypes: scitype
# Note: the `scitype` exported by MLJModelInterface clashes with the `scitype` in
# ScientificTypes, hence the explicit `using ScientificTypes: scitype` above. See also
# https://github.com/JuliaAI/MLJBase.jl/issues/1002
import ScientificTypes: elscitype, schema, coerce, ScientificTimeType
using MLJModelInterface # exports `scitype`, which will call `ScientificTypes.scitype`,
# once MLJBase is loaded (but this is not a dependency!)
using CategoricalArrays
using MLJModelInterface
using TableOperations
using StatsBase
using LinearAlgebra
Expand All @@ -15,7 +18,6 @@ using Parameters
using Dates
using OrderedCollections


const MMI = MLJModelInterface

# Functions of generic use across transformers
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/other_transformers/continuous_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ end
metadata_model(ContinuousEncoder,
input_scitype = Table,
output_scitype = Table(Continuous),
load_path = "MLJModels.ContinuousEncoder")
load_path = "MLJTransforms.ContinuousEncoder")

"""
$(MLJModelInterface.doc_header(ContinuousEncoder))
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/other_transformers/fill_imputer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,12 @@ metadata_model(UnivariateFillImputer,
AbstractVector{<:Count},
AbstractVector{<:Finite}},
human_name = "single variable fill imputer",
load_path = "MLJModels.UnivariateFillImputer")
load_path = "MLJTransforms.UnivariateFillImputer")

metadata_model(FillImputer,
input_scitype = Table,
output_scitype = Table,
load_path = "MLJModels.FillImputer")
load_path = "MLJTransforms.FillImputer")

"""
$(MLJModelInterface.doc_header(UnivariateFillImputer))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ metadata_model(InteractionTransformer,
input_scitype = Tuple{Table},
output_scitype = Table,
human_name = "interaction transformer",
load_path = "MLJModels.InteractionTransformer")
load_path = "MLJTransforms.InteractionTransformer")

"""
$(MLJModelInterface.doc_header(InteractionTransformer))
Expand Down
6 changes: 3 additions & 3 deletions src/transformers/other_transformers/metadata_shared.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ metadata_pkg.(
UnivariateTimeTypeToContinuous,
InteractionTransformer
),
package_name = "MLJModels",
package_uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7",
package_url = "https://github.com/JuliaAI/MLJModels.jl",
package_name = "MLJTransforms",
package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
is_pure_julia = true,
package_license = "MIT")
2 changes: 1 addition & 1 deletion src/transformers/other_transformers/one_hot_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ metadata_model(OneHotEncoder,
input_scitype = Table,
output_scitype = Table,
human_name = "one-hot encoder",
load_path = "MLJModels.OneHotEncoder")
load_path = "MLJTransforms.OneHotEncoder")

"""
$(MLJModelInterface.doc_header(OneHotEncoder))
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/other_transformers/standardizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,12 @@ metadata_model(UnivariateStandardizer,
input_scitype = AbstractVector{<:Infinite},
output_scitype = AbstractVector{Continuous},
human_name = "single variable discretizer",
load_path = "MLJModels.UnivariateStandardizer")
load_path = "MLJTransforms.UnivariateStandardizer")

metadata_model(Standardizer,
input_scitype = Union{Table, AbstractVector{<:Continuous}},
output_scitype = Union{Table, AbstractVector{<:Continuous}},
load_path = "MLJModels.Standardizer")
load_path = "MLJTransforms.Standardizer")

"""
$(MLJModelInterface.doc_header(Standardizer))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ metadata_model(UnivariateBoxCoxTransformer,
input_scitype = AbstractVector{Continuous},
output_scitype = AbstractVector{Continuous},
human_name = "single variable Box-Cox transformer",
load_path = "MLJModels.UnivariateBoxCoxTransformer")
load_path = "MLJTransforms.UnivariateBoxCoxTransformer")

"""
$(MLJModelInterface.doc_header(UnivariateBoxCoxTransformer))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ metadata_model(UnivariateDiscretizer,
input_scitype = AbstractVector{<:Continuous},
output_scitype = AbstractVector{<:OrderedFactor},
human_name = "single variable discretizer",
load_path = "MLJModels.UnivariateDiscretizer")
load_path = "MLJTransforms.UnivariateDiscretizer")


"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ metadata_model(UnivariateTimeTypeToContinuous,
output_scitype = AbstractVector{Continuous},
human_name ="single variable transformer that creates "*
"continuous representations of temporally typed data",
load_path = "MLJModels.UnivariateTimeTypeToContinuous")
load_path = "MLJTransforms.UnivariateTimeTypeToContinuous")

"""
$(MLJModelInterface.doc_header(UnivariateTimeTypeToContinuous))
Expand Down
54 changes: 27 additions & 27 deletions test/encoders/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -195,16 +195,16 @@ end

df = DataFrame(X)

mf = ModelFrame(
@formula(age ~ (name + height + favnum)),
mf = StatsModels.ModelFrame(
StatsModels.@formula(age ~ (name + height + favnum)),
df,
contrasts = Dict(
:name => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 3)),
:favnum => StatsModels.ContrastsCoding(buildrandomcontrast(nothing, 4)),
),
)

X_tr_sm = ModelMatrix(mf).m[:, 2:end]
X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]

@test X_tr_mlj == X_tr_sm
end
Expand All @@ -221,24 +221,24 @@ end
X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
df = DataFrame(X)

mf = ModelFrame(
@formula(age ~ (name + height + favnum)),
mf = StatsModels.ModelFrame(
StatsModels.@formula(age ~ (name + height + favnum)),
df,
contrasts = Dict(
:name => HypothesisCoding(
:name => StatsModels.HypothesisCoding(
buildrandomhypothesis(nothing, 3);
levels = levels(X.name),
labels = [],
),
:favnum => HypothesisCoding(
:favnum => StatsModels.HypothesisCoding(
buildrandomhypothesis(nothing, 4);
levels = levels(X.favnum),
labels = [],
),
),
)

X_tr_sm = ModelMatrix(mf).m[:, 2:end]
X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]

@test X_tr_mlj == X_tr_sm
end
Expand All @@ -257,11 +257,11 @@ end
for ind in 1:6
stats_models(k, ind) = [
StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
DummyCoding(; base = (k == 3) ? "Mary" : 10),
EffectsCoding(; base = (k == 3) ? "Mary" : 10),
SeqDiffCoding(),
HelmertCoding(),
HypothesisCoding(
StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10),
StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10),
StatsModels.SeqDiffCoding(),
StatsModels.HelmertCoding(),
StatsModels.HypothesisCoding(
buildrandomhypothesis(nothing, k);
levels = (k == 3) ? levels(X.name) : levels(X.favnum),
labels = [],
Expand All @@ -277,8 +277,8 @@ end

df = DataFrame(X)

mf = ModelFrame(
@formula(age ~ (name + height + favnum)),
mf = StatsModels.ModelFrame(
StatsModels.@formula(age ~ (name + height + favnum)),
df,
contrasts = Dict(
:name => stats_models(3, ind),
Expand All @@ -287,7 +287,7 @@ end
)

X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
X_tr_sm = ModelMatrix(mf).m[:, 2:end]
X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]
@test X_tr_mlj ≈ X_tr_sm
end
end
Expand All @@ -298,11 +298,11 @@ end
for ind2 in 2:5
stats_models(k, ind) = [
StatsModels.ContrastsCoding(buildrandomcontrast(nothing, k)),
DummyCoding(; base = (k == 3) ? "Mary" : 10),
EffectsCoding(; base = (k == 3) ? "Mary" : 10),
SeqDiffCoding(),
HelmertCoding(),
HypothesisCoding(
StatsModels.DummyCoding(; base = (k == 3) ? "Mary" : 10),
StatsModels.EffectsCoding(; base = (k == 3) ? "Mary" : 10),
StatsModels.SeqDiffCoding(),
StatsModels.HelmertCoding(),
StatsModels.HypothesisCoding(
buildrandomhypothesis(nothing, k);
levels = (k == 3) ? levels(X.name) : levels(X.favnum),
labels = [],
Expand Down Expand Up @@ -331,8 +331,8 @@ end

df = DataFrame(X)

mf = ModelFrame(
@formula(age ~ (name + height + favnum)),
mf = StatsModels.ModelFrame(
StatsModels.@formula(age ~ (name + height + favnum)),
df,
contrasts = Dict(
:name => stats_models(3, ind1),
Expand All @@ -341,7 +341,7 @@ end
)

X_tr_mlj = Tables.matrix(X_tr)[:, 1:end-1]
X_tr_sm = ModelMatrix(mf).m[:, 2:end]
X_tr_sm = StatsModels.ModelMatrix(mf).m[:, 2:end]

@test X_tr_mlj ≈ X_tr_sm
end
Expand All @@ -358,7 +358,7 @@ end
encoder = ContrastEncoder(ignore = true, ordered_factor = false)
mach = machine(encoder, X)
fit!(mach)
Xnew_transf = MMI.transform(mach, X)
Xnew_transf = MLJBase.transform(mach, X)

# same output
@test X_transf == Xnew_transf
Expand Down Expand Up @@ -392,7 +392,7 @@ end
buildmatrix = matrix_func[i],
)
mach = fit!(machine(encoder, X))
Xnew = MMI.transform(mach, X)
Xnew = MLJBase.transform(mach, X)

# Test Consistency with Types
scs = schema(Xnew).scitypes
Expand All @@ -406,4 +406,4 @@ end
@test last_type <: Integer && isconcretetype(last_type)
@test last_sctype <: Count
end
end
end
6 changes: 3 additions & 3 deletions test/encoders/frequency_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform

X = dataset_forms[1]
normalize = [false, true]
A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
for norm in normalize
result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
enc =
Expand Down Expand Up @@ -72,7 +72,7 @@ end
encoder = FrequencyEncoder(ignore = true, ordered_factor = false)
mach = machine(encoder, X)
fit!(mach)
Xnew_transf = MMI.transform(mach, X)
Xnew_transf = MLJBase.transform(mach, X)

# same output
@test X_transf == Xnew_transf
Expand Down Expand Up @@ -111,7 +111,7 @@ end

encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
mach = fit!(machine(encoder, X))
Xnew = MMI.transform(mach, X)
Xnew = MLJBase.transform(mach, X)


scs = schema(Xnew).scitypes
Expand Down
4 changes: 2 additions & 2 deletions test/encoders/missingness_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ end
encoder = MissingnessEncoder(ignore = true, ordered_factor = false)
mach = machine(encoder, X)
fit!(mach)
Xnew_transf = MMI.transform(mach, X)
Xnew_transf = MLJBase.transform(mach, X)

# same output
@test isequal(X_transf, Xnew_transf)
Expand All @@ -197,7 +197,7 @@ end

encoder = MissingnessEncoder()
mach = fit!(machine(encoder, Xm))
Xnew = MMI.transform(mach, Xm)
Xnew = MLJBase.transform(mach, Xm)

schema(Xm)
schema(Xnew)
Expand Down
8 changes: 4 additions & 4 deletions test/encoders/ordinal_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ push!(
@test ordinal_encoder_fit(dataset_forms[1]) == ordinal_encoder_fit(dataset_forms[2])
X = dataset_forms[1]
result = ordinal_encoder_fit(X)[:index_given_feat_level]
A_col, C_col, D_col, F_col = MMI.selectcols(X, [1, 3, 4, 6])
A_col, C_col, D_col, F_col = selectcols(X, [1, 3, 4, 6])
true_output = Dict{Symbol, Dict{Any, AbstractFloat}}(
:F => Dict(
"m" => findfirst(==("m"), levels(F_col)),
Expand Down Expand Up @@ -70,7 +70,7 @@ end
encoder = OrdinalEncoder(ignore = true, ordered_factor = false)
mach = machine(encoder, X)
fit!(mach)
Xnew_transf = MMI.transform(mach, X)
Xnew_transf = MLJBase.transform(mach, X)

# same output
@test X_transf == Xnew_transf
Expand Down Expand Up @@ -108,7 +108,7 @@ end

encoder = OrdinalEncoder(ordered_factor = false)
mach = fit!(machine(encoder, X))
Xnew = MMI.transform(mach, X)
Xnew = MLJBase.transform(mach, X)

scs = schema(Xnew).scitypes
ts = schema(Xnew).types
Expand All @@ -123,7 +123,7 @@ end
## Int32 case
encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32)
mach = fit!(machine(encoder, X))
Xnew = MMI.transform(mach, X)
Xnew = MLJBase.transform(mach, X)
scs = schema(Xnew).scitypes
ts = schema(Xnew).types
# Check scitypes for previously categorical features
Expand Down
Loading