Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d2588d5
Add SemiSuperStream generator to simulate a semi-supervised setting
MinhHuong Mar 15, 2019
0349262
Change the GUI to add SSL. New SSL generator added. Base SSL learner …
MinhHuong Mar 19, 2019
e9749cd
Fix a little GUI bug
MinhHuong Mar 19, 2019
91f8c4d
First version: add SSL version of Clustream and ClustreamKernel; add …
MinhHuong Apr 1, 2019
c0e4a12
Save current progress: basic SSL classifier
MinhHuong Apr 4, 2019
9d6668b
Implement masked data and adapt the evaluator + ARFF parser
MinhHuong Apr 4, 2019
08256ae
Add more measures
MinhHuong Apr 5, 2019
76ce0dc
Save before applying pseudo-label approach
MinhHuong Apr 10, 2019
7e999ca
Pseudo-label approach implemented
MinhHuong Apr 16, 2019
535ef2b
WIP: K-nearest clusters
MinhHuong Apr 20, 2019
8a430b8
Adding label count to the highest cluster class
MinhHuong Apr 23, 2019
1638c3d
Adapt the label count to the cluster(er)s
MinhHuong Apr 23, 2019
1ca3e13
Adapt the label count to the clusters and clusterers; Add util method…
MinhHuong Apr 24, 2019
14a2c36
Adapt DenStream to use with SSL
MinhHuong May 6, 2019
14e628c
Adapt D-Stream to SSL
MinhHuong May 9, 2019
b1ebdcd
Add LabelFeature in with weighting scheme
MinhHuong May 13, 2019
33db27f
Down-weight instances with pseudo-labels
MinhHuong May 14, 2019
40c0709
Save current progress
MinhHuong May 20, 2019
09d034d
Finish one-hot encoding implementation
MinhHuong May 27, 2019
51a9a4d
Implement self-training classifier, minmax scaling filter, standardiz…
MinhHuong Jun 6, 2019
3f7031c
Implement Incremental version of self-training
MinhHuong Jul 1, 2019
ff7f8cf
Save current progress before tweaking cluster and label
MinhHuong Jul 24, 2019
80b04f7
Implement Sublearner for Cluster-and-label
MinhHuong Jul 26, 2019
7962923
Prepare for the last version
MinhHuong Nov 9, 2019
bd05eb3
WIP: Cleaning code
MinhHuong Nov 9, 2019
2d55741
Cleaned up the code
MinhHuong Nov 16, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dumpClustering.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Nr;Event;
64 changes: 64 additions & 0 deletions expe/cl-categorical.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
########################################################
#                                                      #
# CLUSTER-AND-LABEL WITH DIFFERENT CATEGORICAL METRICS #
#                                                      #
########################################################
#
# Runs the MOA task EvaluateInterleavedTestThenTrainSemi once for every
# (dataset, unlabeled ratio, ClustreamSSL "-a" method) combination and dumps
# each run to its own CSV file under $rootfolder/experiments/$folder.
#
# Environment:
#   MOA_EXPE_ROOT  optional; root folder that contains data/, experiments/
#                  and work/. Defaults to the path used originally.

rootfolder=${MOA_EXPE_ROOT:-/mnt/d/Study/M2_DK/Internship}
moa_classes="$rootfolder/work/moa/moa/target/classes"
folder="cl-categorical"
m=100
h=1000
# NOTE(review): evaluator is defined but never passed to the task built in
# main() (the task string has no "-e" option) -- confirm this is intended.
evaluator="BasicClassificationPerformanceEvaluator -o -p -r -f"
learner_sp="trees.HoeffdingTree"
global_learner="trees.HoeffdingTree"
local_learner="bayes.NaiveBayes"

# Names used in the result file name, indexed by the value given to the
# ClustreamSSL "-a" option (0..5).
method_names=(nothing euclidean of lin goodall3 iof)

#######################################
# Converts an unlabeled ratio such as 0.9 or 0.91 into the number used in
# the data file names (90, 91).
# Arguments: $1 - ratio written as "0.<digits>"
# Outputs:   the fractional digits; a single digit is multiplied by 10
#######################################
ratio_to_percent() {
  local frac=${1#*.}
  if (( ${#frac} == 1 )); then
    printf '%s\n' "$(( frac * 10 ))"
  else
    printf '%s\n' "$frac"
  fi
}

#######################################
# Runs every experiment of this script.
# Globals: reads the configuration variables defined above
# Returns: 1 if the MOA classes or the result folder are unavailable,
#          otherwise the status of the last java invocation
#######################################
main() {
  local d ratio percent method method_name
  local data stream clusterer learner_ssl result task

  # Fail once, up front, instead of once per experiment.
  if [[ ! -d "$moa_classes" ]]; then
    printf 'error: MOA classes directory not found: %s\n' "$moa_classes" >&2
    return 1
  fi
  mkdir -p -- "$rootfolder/experiments/$folder" || return 1

  for d in randomTree elecNormNew airlines; do
  # for d in agrawal; do
    for ratio in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99; do
      # compute the percentage of unlabeled data
      percent=$(ratio_to_percent "$ratio")

      # define necessary variables
      data="$rootfolder/data/semi-masked/$d-semi-$percent.arff"
      stream="ArffFileStream -f $data"

      # define the task
      for method in "${!method_names[@]}"; do
        method_name=${method_names[method]}

        echo
        echo "==============================================================================================="
        echo "CLUSTER-N-LABEL & MULTIVIEW & CATEGORICAL METRICS: dataset = $d, r = $percent, method = $method"
        echo "==============================================================================================="
        echo

        clusterer="semisupervised.ClustreamSSL -h $h -l 0.0 -m $m -k 5 -a $method"
        learner_ssl="semisupervised.ClusterAndLabelSubLearnerClassifier -c ($clusterer) -p -l ($local_learner) -g ($global_learner) -t 1"
        result="$rootfolder/experiments/$folder/$d-cl-$percent-$method_name.csv"
        task="EvaluateInterleavedTestThenTrainSemi -b ($learner_sp) -l ($learner_ssl) -s ($stream) -d ($result) -i -1 -f 1000 -q 1000"
        # The task is passed as one quoted argument so the shell never
        # glob-expands or re-splits it.
        java -classpath "$moa_classes" moa.DoTask "$task"
      done
    done
  done
}

main "$@"
45 changes: 45 additions & 0 deletions expe/cl-ensemble.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
###################################
#                                 #
# CLUSTER-AND-LABEL WITH ENSEMBLE #
#                                 #
###################################
#
# Runs the MOA task EvaluateInterleavedTestThenTrain on a LeveragingBag
# ensemble of cluster-and-label learners, once per (dataset, unlabeled
# ratio), and dumps each run to a CSV file under
# $rootfolder/experiments/$folder.
#
# Environment:
#   MOA_EXPE_ROOT  optional; root folder that contains data/, experiments/
#                  and work/. Defaults to the path used originally.

rootfolder=${MOA_EXPE_ROOT:-/mnt/d/Study/M2_DK/Internship}
moa_classes="$rootfolder/work/moa/moa/target/classes"
folder="cl-ensemble"
m=100
h=1000
k=1
evaluator="BasicClassificationPerformanceEvaluator -o -p -r -f"
learner_sp="trees.HoeffdingTree"
clusterer="semisupervised.ClustreamSSL -h $h -l 0.0 -m $m -k 5 -a 1"
# Uses $k rather than a literal so the learner and the result file name
# (which also embeds $k) cannot drift apart.
learner_ssl="semisupervised.ClusterAndLabelClassifier -c ($clusterer) -p -k $k"
# Does not depend on the dataset or the ratio, so it is built only once.
ensemble="meta.LeveragingBag -l ($learner_ssl)"

#######################################
# Converts an unlabeled ratio such as 0.9 or 0.91 into the number used in
# the data file names (90, 91).
# Arguments: $1 - ratio written as "0.<digits>"
# Outputs:   the fractional digits; a single digit is multiplied by 10
#######################################
ratio_to_percent() {
  local frac=${1#*.}
  if (( ${#frac} == 1 )); then
    printf '%s\n' "$(( frac * 10 ))"
  else
    printf '%s\n' "$frac"
  fi
}

#######################################
# Runs every experiment of this script.
# Globals: reads the configuration variables defined above
# Returns: 1 if the MOA classes or the result folder are unavailable,
#          otherwise the status of the last java invocation
#######################################
main() {
  local d ratio percent data stream result task

  # Fail once, up front, instead of once per experiment.
  if [[ ! -d "$moa_classes" ]]; then
    printf 'error: MOA classes directory not found: %s\n' "$moa_classes" >&2
    return 1
  fi
  mkdir -p -- "$rootfolder/experiments/$folder" || return 1

  # for d in led randomRBF randomTree elecNormNew covtypeNorm airlines; do
  for d in agrawal hyperplane; do
    for ratio in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99; do
      # compute the percentage of unlabeled data
      percent=$(ratio_to_percent "$ratio")

      # define necessary variables
      data="$rootfolder/data/semi-masked/$d-semi-$percent.arff"
      stream="ArffFileStream -f $data"

      echo
      echo "=================================================================================="
      echo "CLUSTER-N-LABEL, ENSEMBLE WITH LEVERAGE BAGGING: dataset = $d, r = $percent"
      echo "=================================================================================="
      echo
      result="$rootfolder/experiments/$folder/$d-cl-$percent-$m-$h-$k-leveragebag.csv"
      task="EvaluateInterleavedTestThenTrain -l ($ensemble) -s ($stream) -d ($result) -e ($evaluator) -i -1 -f 1000 -q 1000"
      # The task is passed as one quoted argument so the shell never
      # glob-expands or re-splits it.
      java -classpath "$moa_classes" moa.DoTask "$task"
    done
  done
}

main "$@"
45 changes: 45 additions & 0 deletions expe/cl-sublearner-forcepred.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
###################################################################
#                                                                 #
# CLUSTER-AND-LABEL WITH SUBLEARNER AND FORCED CORRECT PREDICTION #
#                                                                 #
###################################################################
#
# Runs the MOA task EvaluateInterleavedTestThenTrainSemi with the
# ClusterAndLabelSubLearnerClassifier ("-t 2") once per (dataset, unlabeled
# ratio) and dumps each run to a CSV file under
# $rootfolder/experiments/$folder.
#
# Environment:
#   MOA_EXPE_ROOT  optional; root folder that contains data/, experiments/
#                  and work/. Defaults to the path used originally.

rootfolder=${MOA_EXPE_ROOT:-/mnt/d/Study/M2_DK/Internship}
moa_classes="$rootfolder/work/moa/moa/target/classes"
folder="cl-sublearner"
m=100
h=1000

# None of the settings below depend on the dataset or the ratio, so they
# are built once here rather than on every loop iteration.
evaluator="BasicClassificationPerformanceEvaluator -o -p -r -f"
learner_sp="trees.HoeffdingTree"
clusterer="semisupervised.ClustreamSSL -h $h -l 0.0 -m $m -k 5 -a 1" # still use Euclidean distance
global_learner="trees.HoeffdingTree"
local_learner="bayes.NaiveBayes"
learner_ssl="semisupervised.ClusterAndLabelSubLearnerClassifier -c ($clusterer) -p -l ($local_learner) -g ($global_learner) -t 2"

#######################################
# Converts an unlabeled ratio such as 0.9 or 0.91 into the number used in
# the data file names (90, 91).
# Arguments: $1 - ratio written as "0.<digits>"
# Outputs:   the fractional digits; a single digit is multiplied by 10
#######################################
ratio_to_percent() {
  local frac=${1#*.}
  if (( ${#frac} == 1 )); then
    printf '%s\n' "$(( frac * 10 ))"
  else
    printf '%s\n' "$frac"
  fi
}

#######################################
# Runs every experiment of this script.
# Globals: reads the configuration variables defined above
# Returns: 1 if the MOA classes or the result folder are unavailable,
#          otherwise the status of the last java invocation
#######################################
main() {
  local d ratio percent data stream result task

  # Fail once, up front, instead of once per experiment.
  if [[ ! -d "$moa_classes" ]]; then
    printf 'error: MOA classes directory not found: %s\n' "$moa_classes" >&2
    return 1
  fi
  mkdir -p -- "$rootfolder/experiments/$folder" || return 1

  for d in agrawal led randomTree randomRBF hyperplane elecNormNew covtypeNorm airlines; do
    for ratio in 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99; do
      # compute the percentage of unlabeled data
      percent=$(ratio_to_percent "$ratio")

      # define necessary variables
      data="$rootfolder/data/semi-masked/$d-semi-$percent.arff"
      stream="ArffFileStream -f $data"

      # define the task
      echo
      echo "=================================================================================="
      echo "CLUSTER-N-LABEL & HETERO & FORCE PREDICTION: dataset = $d, r = $percent"
      echo "=================================================================================="
      echo
      result="$rootfolder/experiments/$folder/$d-cl-$percent-hetero-forced-prediction.csv"
      task="EvaluateInterleavedTestThenTrainSemi -b ($learner_sp) -l ($learner_ssl) -s ($stream) -d ($result) -e ($evaluator) -i -1 -f 1000 -q 1000"
      # The task is passed as one quoted argument so the shell never
      # glob-expands or re-splits it.
      java -classpath "$moa_classes" moa.DoTask "$task"
    done
  done
}

main "$@"
47 changes: 47 additions & 0 deletions expe/cl-sublearner-measures.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
################################################################
#                                                              #
# CLUSTER-AND-LABEL + HETEROGENEOUS LEARNERS + MORE STATISTICS #
#                                                              #
################################################################
#
# Runs the MOA task EvaluateInterleavedTestThenTrainSemi with the
# ClusterAndLabelSubLearnerClassifier ("-t 1") and an explicit evaluator,
# once per (dataset, unlabeled ratio), and dumps each run to a CSV file
# under $rootfolder/experiments/$folder.
#
# Environment:
#   MOA_EXPE_ROOT  optional; root folder that contains data/, experiments/
#                  and work/. Defaults to the path used originally.

rootfolder=${MOA_EXPE_ROOT:-/mnt/d/Study/M2_DK/Internship}
moa_classes="$rootfolder/work/moa/moa/target/classes"
folder="cl-sublearner"
m=100
h=1000
evaluator="BasicClassificationPerformanceEvaluator -o -p -r -f"
learner_sp="trees.HoeffdingTree"
clusterer="semisupervised.ClustreamSSL -h $h -l 0.0 -m $m -k 5 -a 1"
global_learner="trees.HoeffdingTree"
local_learner="bayes.NaiveBayes"
# Does not depend on the dataset or the ratio, so it is built only once.
learner_ssl="semisupervised.ClusterAndLabelSubLearnerClassifier -c ($clusterer) -p -l ($local_learner) -g ($global_learner) -t 1"

#######################################
# Converts an unlabeled ratio such as 0.9 or 0.95 into the number used in
# the data file names (90, 95).
# Arguments: $1 - ratio written as "0.<digits>"
# Outputs:   the fractional digits; a single digit is multiplied by 10
#######################################
ratio_to_percent() {
  local frac=${1#*.}
  if (( ${#frac} == 1 )); then
    printf '%s\n' "$(( frac * 10 ))"
  else
    printf '%s\n' "$frac"
  fi
}

#######################################
# Runs every experiment of this script.
# Globals: reads the configuration variables defined above
# Returns: 1 if the MOA classes or the result folder are unavailable,
#          otherwise the status of the last java invocation
#######################################
main() {
  local d ratio percent data stream result task

  # Fail once, up front, instead of once per experiment.
  if [[ ! -d "$moa_classes" ]]; then
    printf 'error: MOA classes directory not found: %s\n' "$moa_classes" >&2
    return 1
  fi
  mkdir -p -- "$rootfolder/experiments/$folder" || return 1

  for d in agrawal led randomTree randomRBF hyperplane elecNormNew covtypeNorm airlines; do
  # for d in elecNormNew; do
    #for ratio in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99; do
    for ratio in 0.90 0.95 0.99; do
      # compute the percentage of unlabeled data
      percent=$(ratio_to_percent "$ratio")

      # define necessary variables
      data="$rootfolder/data/semi-masked/$d-semi-$percent.arff"
      stream="ArffFileStream -f $data"

      # define the task
      echo
      echo "=================================================================================="
      echo "CLUSTER-N-LABEL & HETEROGENEOUS LEARNERS: dataset = $d, r = $percent"
      echo "=================================================================================="
      echo
      result="$rootfolder/experiments/$folder/$d-cl-$percent-hetero-measures-euclidean.csv"
      task="EvaluateInterleavedTestThenTrainSemi -b ($learner_sp) -l ($learner_ssl) -s ($stream) -d ($result) -e ($evaluator) -i -1 -f 1000 -q 1000"
      # The task is passed as one quoted argument so the shell never
      # glob-expands or re-splits it.
      java -classpath "$moa_classes" moa.DoTask "$task"
    done
  done
}

main "$@"
65 changes: 65 additions & 0 deletions expe/cl-sublearner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env bash
#####################################
#                                   #
# CLUSTER-AND-LABEL WITH SUBLEARNER #
#                                   #
#####################################
#
# For every (dataset, unlabeled ratio) runs three MOA
# EvaluateInterleavedTestThenTrainSemi tasks -- "multiview", "standard" and
# "supervised" (the last one adds the task's "-a" flag) -- and dumps each
# run to its own CSV file under $rootfolder/experiments/$folder.
#
# Environment:
#   MOA_EXPE_ROOT  optional; root folder that contains data/, experiments/
#                  and work/. Defaults to the path used originally.

rootfolder=${MOA_EXPE_ROOT:-/mnt/d/Study/M2_DK/Internship}
moa_classes="$rootfolder/work/moa/moa/target/classes"
folder="cl-sublearner"
m=100
h=1000

# None of the settings below depend on the dataset or the ratio, so they
# are built once here rather than on every loop iteration.
# NOTE(review): evaluator is defined but never passed to any task built in
# main() (no task string has an "-e" option) -- confirm this is intended.
evaluator="BasicClassificationPerformanceEvaluator -o -p -r -f"
learner_sp="trees.HoeffdingTree"
clusterer="semisupervised.ClustreamSSL -h $h -l 0.0 -m $m -k 5 -a 1"
global_learner="trees.HoeffdingTree"
local_learner="bayes.NaiveBayes"
sublearner_ssl="semisupervised.ClusterAndLabelSubLearnerClassifier -c ($clusterer) -p -l ($local_learner) -g ($global_learner) -t 1"
standard_ssl="semisupervised.ClusterAndLabelClassifier -c ($clusterer) -p -k 1"

#######################################
# Converts an unlabeled ratio such as 0.9 or 0.91 into the number used in
# the data file names (90, 91).
# Arguments: $1 - ratio written as "0.<digits>"
# Outputs:   the fractional digits; a single digit is multiplied by 10
#######################################
ratio_to_percent() {
  local frac=${1#*.}
  if (( ${#frac} == 1 )); then
    printf '%s\n' "$(( frac * 10 ))"
  else
    printf '%s\n' "$frac"
  fi
}

#######################################
# Prints a banner and runs one MOA task.
# Globals:   moa_classes (read)
# Arguments: $1 - banner title
#            $2 - complete MOA task command line
# Returns:   the exit status of the java invocation
#######################################
run_task() {
  echo
  echo "=================================================================================="
  echo "$1"
  echo "=================================================================================="
  echo
  # The task is passed as one quoted argument so the shell never
  # glob-expands or re-splits it.
  java -classpath "$moa_classes" moa.DoTask "$2"
}

#######################################
# Runs every experiment of this script.
# Globals: reads the configuration variables defined above
# Returns: 1 if the MOA classes or the result folder are unavailable,
#          otherwise the status of the last java invocation
#######################################
main() {
  local d ratio percent data stream result_prefix common_opts

  # Fail once, up front, instead of once per experiment.
  if [[ ! -d "$moa_classes" ]]; then
    printf 'error: MOA classes directory not found: %s\n' "$moa_classes" >&2
    return 1
  fi
  mkdir -p -- "$rootfolder/experiments/$folder" || return 1

  for d in agrawal led randomTree randomRBF hyperplane elecNormNew covtypeNorm airlines; do
    for ratio in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99; do
      # compute the percentage of unlabeled data
      percent=$(ratio_to_percent "$ratio")

      # define necessary variables
      data="$rootfolder/data/semi-masked/$d-semi-$percent.arff"
      stream="ArffFileStream -f $data"
      result_prefix="$rootfolder/experiments/$folder/$d-cl-$percent"
      common_opts="-i -1 -f 1000 -q 1000"

      # define the tasks
      run_task "CLUSTER-N-LABEL & MULTIVIEW: dataset = $d, r = $percent" \
        "EvaluateInterleavedTestThenTrainSemi -b ($learner_sp) -l ($sublearner_ssl) -s ($stream) -d ($result_prefix-multiview.csv) $common_opts"

      run_task "CLUSTER-N-LABEL STANDARD: dataset = $d, r = $percent" \
        "EvaluateInterleavedTestThenTrainSemi -b ($learner_sp) -l ($standard_ssl) -s ($stream) -d ($result_prefix-standard.csv) $common_opts"

      run_task "FULLY SUPERVISED: dataset = $d, r = $percent" \
        "EvaluateInterleavedTestThenTrainSemi -a -b ($learner_sp) -l ($sublearner_ssl) -s ($stream) -d ($result_prefix-supervised.csv) $common_opts"
    done
  done
}

main "$@"
1 change: 1 addition & 0 deletions expe/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Launches moa.DoTask from the locally compiled MOA classes without any
# task arguments.
# NOTE(review): presumably a quick check that the JVM can load the compiled
# classes -- confirm.
moa_classpath=/mnt/d/Study/M2_DK/Internship/work/moa/moa/target/classes
java -classpath "$moa_classpath" moa.DoTask
Empty file added java
Empty file.
3 changes: 1 addition & 2 deletions moa/.classpath
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-10">
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="module" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
Expand Down
6 changes: 3 additions & 3 deletions moa/.settings/org.eclipse.jdt.core.prefs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=10
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=10
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.release=enabled
org.eclipse.jdt.core.compiler.source=10
org.eclipse.jdt.core.compiler.source=1.8
Loading