
Commit 33276cc

Written IS-SSL test docs #161
1 parent 892fef1 commit 33276cc

File tree

9 files changed: +187 additions, −0 deletions

docs/anexos.pdf

260 KB — binary file not shown.

Three image files (185 KB, 192 KB, 79.5 KB) — binary files not shown.

docs/tex/D_Manual_programador.tex

Lines changed: 44 additions & 0 deletions
@@ -481,3 +481,47 @@ \subsubsection{Sonar Cloud}
 \imagenRuta{../img/anexos/manual-programador/SonarCloud-IS-SSL}{SonarCloud.}{SonarCloud-IS-SSL}
 
 \subsection{System tests}
+
+This section describes the tests that ship with the libraries.
+
+The tests are deliberately simple, since the algorithms themselves contain few branches. The test suite therefore consists of unit tests for every implemented algorithm, checking both that the inputs are valid and that the outputs are correct. For the instance-selection algorithms, the tests assert that the retained instances belong to the input dataset and that no more instances come back than were supplied in the first place, so that filtering keeps working if the algorithms are later modified. The base test can be seen in Figure~\ref{fig:base-test-is}.
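A minimal sketch of such a check, assuming the model.filter(x, y) API that the test files in this commit exercise (the helper name is illustrative):

import pandas as pd
from sklearn.datasets import load_iris

def check_filter_output(model):
    # Sketch of the base IS test: filtering must return instances that
    # belong to the input dataset, and never more than were supplied.
    x, y = load_iris(return_X_y=True, as_frame=True)
    x_filtered, y_filtered = model.filter(x, pd.DataFrame(y))

    assert len(x_filtered) <= len(x)

    original_rows = {tuple(row) for row in x.to_numpy()}
    assert all(tuple(row) in original_rows
               for row in pd.DataFrame(x_filtered).to_numpy())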
488+
489+
Para la biblioteca de algoritmos de aprendizaje semi-supervisado se poseen también pruebas unitarias para todos y cada uno de ellos, la prueba base se puede visualizar en~\ref{fig:base-test-ssl}. En ellas se comprueban:
490+
\begin{itemize}
491+
\tightlist
492+
\item Tipo de objeto de entrada.
493+
\item Instanciación con mediante diferentes parámetros, tanto por defecto como mediante uso de diccionarios.
494+
\item Entrenamiento del propio algoritmo (\texttt{fit}).
495+
\item Predicción usando el modelo, \texttt{predict}.
496+
\item Las etiquetas devueltas como resultado de la predicción existen en el conjunto de entrenamiento, no se ha inventado ninguna.
497+
\end{itemize}
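A minimal sketch of such a base test, assuming the fit/predict contract described above (the concrete test code appears further down in this commit):

import numpy as np
import pandas as pd

def check_ssl_algorithm(algorithm, x_train, y_train, x_test, params=None):
    # Input type: the tests expect pandas DataFrames.
    assert isinstance(x_train, pd.DataFrame)
    assert isinstance(y_train, pd.DataFrame)

    # Instantiation: defaults or a dictionary of parameters.
    model = algorithm(**params) if params is not None else algorithm()

    model.fit(x_train, y_train)      # training (fit)
    y_pred = model.predict(x_test)   # prediction (predict)

    # Every returned label must already exist among the training labels.
    known_labels = set(np.unique(y_train.to_numpy()))
    assert set(np.unique(y_pred)) <= known_labels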
+
+
+\imagenFlotante{../img/anexos/manual-programador/base-tests-is}{Base test for instance-selection algorithms.}{base-test-is}
+\imagenFlotante{../img/anexos/manual-programador/base-tests-ssl}{Base test for semi-supervised learning algorithms.}{base-test-ssl}
+\begin{figure}
+\centering
+\includegraphics[width=0.5\textwidth]{../img/anexos/manual-programador/tests-superados-is-ssl}
+\caption{Tests passed for both libraries.}\label{fig:tests-superados-is-ssl}
+
+\end{figure}
+
+In addition, a test has been written to ensure that \texttt{ARFF} files are still read correctly.
+
+The result of the test runs can be seen in Figure~\ref{fig:tests-superados-is-ssl}.

semisupervised/utils/_split.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,8 @@ def split(samples, y):
     y = y.to_numpy()
 
     labeled_indexes = y != (-1 or np.NaN or None)
+    labeled_indexes = np.ravel(labeled_indexes)
 
     L = samples.iloc[labeled_indexes].to_numpy()
     U = samples.iloc[~labeled_indexes].to_numpy()
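The added np.ravel matters because DataFrame.to_numpy() returns a 2-D array of shape (n, 1), so the comparison yields a 2-D boolean mask that iloc cannot index with. Note also that the pre-existing expression (-1 or np.NaN or None) short-circuits to -1 in Python, so only -1 is actually treated as the unlabeled marker. A minimal sketch reproducing the behaviour (variable names are illustrative):

import numpy as np
import pandas as pd

samples = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
y = pd.DataFrame([0, -1, 1])        # -1 marks an unlabeled instance

y_arr = y.to_numpy()                # shape (3, 1), not (3,)
# `(-1 or np.NaN or None)` evaluates to -1, so this only tests y != -1
mask = y_arr != -1                  # 2-D boolean mask of shape (3, 1)
mask = np.ravel(mask)               # flatten to (3,) so iloc accepts it

L = samples.iloc[mask].to_numpy()   # labeled rows -> shape (2, 2)
U = samples.iloc[~mask].to_numpy()  # unlabeled rows -> shape (1, 2)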
Lines changed: 79 additions & 0 deletions
@@ -15,19 +15,36 @@
 
 
 def to_dataframe(y):
+    """
+    If the input is not a dataframe, convert it to a dataframe.
+
+    :param y: The target variable
+    :return: A dataframe
+    """
     if not isinstance(y, pd.DataFrame):
         return pd.DataFrame(y)
 
 
 @pytest.fixture
 def iris_dataset():
+    """
+    It loads the iris dataset, converts the target variable to a dataframe,
+    and returns the data and target.
+
+    :return: The dataframe of the features and the dataframe of the labels
+    """
     x, y = load_iris(return_X_y=True, as_frame=True)
     y = to_dataframe(y)
     return x, y
 
 
 @pytest.fixture
 def iris_dataset_ss():
+    """
+    It loads the iris dataset, randomly selects 30% of the data points to be
+    unlabeled, and returns the labeled and unlabeled data.
+
+    :return: The original dataframe, the original labels, the complete
+        dataframe, and the complete labels.
+    """
     x, y = load_iris(return_X_y=True, as_frame=True)
     y = to_dataframe(y)
     li = list(set(range(x.shape[0])))
@@ -45,6 +62,16 @@ def iris_dataset_ss():
 
 
 def base(x, y, algorithm, params=None):
+    """
+    It takes in a dataframe of features, a dataframe of labels, an algorithm,
+    and a dictionary of parameters. It then creates a model using the
+    algorithm and parameters, and filters the data using the model.
+
+    :param x: The input dataframe
+    :param y: The target variable
+    :param algorithm: The algorithm to use
+    :param params: A dictionary of parameters to pass to the algorithm
+    """
     assert isinstance(x, pd.DataFrame) and isinstance(y, pd.DataFrame)
     model = algorithm(**params) if params is not None else algorithm()
     x_filtered, y_filtered = model.filter(x, y)
@@ -57,46 +84,91 @@
 
 
 def test_enn_original(iris_dataset):
+    """
+    It tests the ENN algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, ENN, {'nearest_neighbors': 3, 'power_parameter': 2})
 
 
 def test_cnn(iris_dataset):
+    """
+    It tests the CNN algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, CNN)
 
 
 def test_rnn(iris_dataset):
+    """
+    It tests the RNN algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, RNN)
 
 
 def test_icf(iris_dataset):
+    """
+    It tests the ICF algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, ICF, {'nearest_neighbors': 3, 'power_parameter': 2})
 
 
 def test_mss(iris_dataset):
+    """
+    It tests the MSS algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, MSS)
 
 
 def test_drop3(iris_dataset):
+    """
+    It tests the DROP3 algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, DROP3, {'nearest_neighbors': 3, 'power_parameter': 2})
 
 
 def test_local_sets_lssm(iris_dataset):
+    """
+    It tests the LSSm algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, LSSm)
 
 
 def test_local_sets_lsbo(iris_dataset):
+    """
+    It tests the LSBo algorithm on the iris dataset.
+
+    :param iris_dataset: This is the dataset to use
+    """
     x, y = iris_dataset
     base(x, y, LSBo)
 
 
 def test_enn_ss(iris_dataset_ss):
+    """
+    It tests the ENN algorithm on the semi-supervised iris dataset.
+
+    :param iris_dataset_ss: This is the dataset to use
+    """
     original, original_labels, complete, complete_labels = iris_dataset_ss
 
     model = ENN()
@@ -118,6 +190,13 @@ def test_enn_ss(iris_dataset_ss):
 
 
 def test_different_len(iris_dataset):
+    """
+    It tests that the filter method raises a ValueError if the lengths of
+    the feature and label dataframes are different.
+
+    :param iris_dataset: a fixture that returns a tuple of two pandas
+        DataFrames, one for the features and one for the target
+    """
     x, y = iris_dataset
     y = y.loc[:-1]
     model1 = LSSm()
Lines changed: 52 additions & 0 deletions
@@ -20,6 +20,11 @@
 
 @pytest.fixture
 def digits_dataset_ss():
+    """
+    It loads the digits dataset, splits it into train and test sets, and then
+    randomly assigns 55% of the training set as unlabeled.
+
+    :return: x_train, x_test, y_train, y_test, opt_labels
+    """
     x, y = load_digits(return_X_y=True, as_frame=True)
     x = x.to_numpy()
     y = y.to_numpy()
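A hedged reconstruction of what the fixture's docstring describes (the train/test ratio and random seed are assumptions, as the hunk cuts off before them): hold out a test set, then mark 55% of the training labels as unlabeled with -1, the marker _split.py expects.

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

x, y = load_digits(return_X_y=True)
# assumption: an 80/20 train/test split; the fixture's actual ratio is not shown
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

rng = np.random.default_rng(0)
unlabeled = rng.choice(len(y_train), size=int(0.55 * len(y_train)),
                       replace=False)
y_train = y_train.copy()
y_train[unlabeled] = -1   # -1 is the unlabeled marker used by _split.py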
@@ -39,6 +44,18 @@ def digits_dataset_ss():
 
 
 def base(x_train, x_test, y_train, y_test, opt_labels, algorithm, params=None):
+    """
+    It takes in a training and testing set, a list of possible labels, and a
+    model, and checks the predictions of the model on the testing set.
+
+    :param x_train: The training data
+    :param x_test: The test data
+    :param y_train: The training labels
+    :param y_test: The actual labels of the test set
+    :param opt_labels: The set of labels that the model can predict
+    :param algorithm: The algorithm to use
+    :param params: A dictionary of parameters to pass to the algorithm
+    """
     assert isinstance(x_train, pd.DataFrame) and isinstance(y_train,
                                                             pd.DataFrame)
     assert isinstance(x_test, pd.DataFrame) and isinstance(y_test,
@@ -52,6 +69,11 @@ def base(x_train, x_test, y_train, y_test, opt_labels, algorithm, params=None):
 
 
 def test_co_training(digits_dataset_ss):
+    """
+    It tests the Co-Training algorithm on the digits dataset.
+
+    :param digits_dataset_ss: The dataset we're using
+    """
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, CoTraining,
          {'p': 1, 'n': 3, 'k': 1, 'u': 7})
@@ -73,13 +95,23 @@ def test_co_training(digits_dataset_ss):
 
 
 def test_tri_training(digits_dataset_ss):
+    """
+    It tests the Tri-Training algorithm on the digits dataset.
+
+    :param digits_dataset_ss: The dataset we're using
+    """
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, TriTraining,
          {'c1': KNeighborsClassifier, 'c1_params': {'n_neighbors': 3},
          'c2': KNeighborsClassifier})
 
 
 def test_demo_co_learning(digits_dataset_ss):
+    """
+    It tests the Democratic Co-Learning algorithm on the digits dataset.
+
+    :param digits_dataset_ss: The dataset we're using
+    """
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning)
     base(x_train, x_test, y_train, y_test, opt_labels, DemocraticCoLearning,
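The Tri-Training test shows the convention for injecting base classifiers: the algorithm receives classifier classes (c1, c2) plus optional per-classifier parameter dictionaries (c1_params). A sketch of that instantiation pattern in isolation (the helper name is illustrative, not the repo's code):

from sklearn.neighbors import KNeighborsClassifier

def build(classifier, params=None):
    # Instantiate a classifier class with an optional parameter dictionary,
    # mirroring how the test passes c1/c1_params and c2 to TriTraining.
    return classifier(**params) if params is not None else classifier()

c1 = build(KNeighborsClassifier, {'n_neighbors': 3})
c2 = build(KNeighborsClassifier)   # falls back to the defaults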
@@ -88,11 +120,25 @@ def test_demo_co_learning(digits_dataset_ss):
 
 
 def test_density_peaks(digits_dataset_ss):
+    """
+    It takes the training and testing data, and the optimal labels, and runs
+    the STDPNF algorithm on it.
+
+    :param digits_dataset_ss: a tuple of (x_train, x_test, y_train, y_test,
+        opt_labels)
+    """
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     base(x_train, x_test, y_train, y_test, opt_labels, STDPNF)
 
 
 def test_density_peaks_filtering(digits_dataset_ss):
+    """
+    It tests that the `filtering` option works as expected in the STDPNF
+    algorithm.
+
+    :param digits_dataset_ss: a fixture that returns a tuple of (x_train,
+        x_test, y_train, y_test, opt_labels)
+    """
     x_train, x_test, y_train, y_test, opt_labels = digits_dataset_ss
     with pytest.raises(AttributeError):
         base(x_train, x_test, y_train, y_test, opt_labels, STDPNF,
@@ -106,6 +152,12 @@ def test_density_peaks_filtering(digits_dataset_ss):
 
 
 def test_different_len(digits_dataset_ss):
+    """
+    It tests that if the lengths of the input and output are different, then
+    the model will raise a ValueError.
+
+    :param digits_dataset_ss: a tuple of (x_train, x_test, y_train, y_test,
+        opt_labels)
+    """
     x, _, y, _, _ = digits_dataset_ss
     co = CoTraining()
     tri = TriTraining()

tests/utils.py renamed to tests/test_utils.py

Lines changed: 10 additions & 0 deletions
@@ -14,10 +14,20 @@
 
 
 @pytest.fixture
 def arff_path_file():
+    """
+    It returns the path to the iris dataset in the datasets folder.
+
+    :return: The path to the iris.arff file
+    """
     return join('datasets', 'iris.arff')
 
 
 def test_arff_data(arff_path_file):
+    """
+    `arff_data` loads an arff file into a `Bunch` object, which is a
+    dictionary-like object.
+
+    :param arff_path_file: The path to the arff file
+    """
     dataset = arff_data(arff_path_file)
     assert isinstance(dataset, Bunch)
     dataset1 = arff_data(arff_path_file, ['a', 'b', 'c', 'd'])
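arff_data is the project's own helper; the test only relies on it returning a scikit-learn Bunch and accepting an optional list of attribute names. A hypothetical reconstruction with scipy.io.arff, not the repo's implementation:

import pandas as pd
from scipy.io import arff
from sklearn.utils import Bunch

def arff_data_sketch(path, attribute_names=None):
    # Load the ARFF file into a structured array, then into a DataFrame.
    data, meta = arff.loadarff(path)
    frame = pd.DataFrame(data)
    if attribute_names is not None:
        # assumption: the optional argument renames the feature columns
        frame.columns = (list(attribute_names)
                         + list(frame.columns[len(attribute_names):]))
    # In iris.arff the last column is the class attribute.
    return Bunch(data=frame.iloc[:, :-1], target=frame.iloc[:, -1])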
