Skip to content

Commit 22a7310

Browse files
committed
Added unit tests: modified the existing data tests to follow the pytest schema. Previously they were a series of if/else checks that would not have failed the build even when a check failed. Also moved them under a new folder, tests/unit, as that is the more commonly recommended layout.
1 parent dacef07 commit 22a7310

File tree

2 files changed

+54
-54
lines changed

2 files changed

+54
-54
lines changed

azure-pipelines.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ steps:
2828
2929
displayName: 'replace subscription value'
3030

31-
- script: 'python code/testing/data_test.py data/diabetes.csv && python code/testing/data_test.py data/diabetes_bad_dist.csv && python code/testing/data_test.py data/diabetes_bad_schema.csv && python code/testing/data_test.py data/diabetes_missing_values.csv'
31+
- script: 'pytest tests/unit/data_test.py'
3232
displayName: 'Data Quality Check'
3333

3434
- script: 'python aml_service/00-WorkSpace.py'
Lines changed: 53 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,32 @@
1616
THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
1717
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1818
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19-
MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19+
MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT,
20+
21+
INDIRECT, INCIDENTAL,
2022
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
2123
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
2224
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
2325
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
2426
ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
2527
POSSIBILITY OF SUCH DAMAGE.
2628
"""
27-
28-
import sys
2929
import os
3030
import numpy as np
3131
import pandas as pd
3232

33+
34+
# get absolute path of csv files from data folder
35+
def get_absPath(filename):
    """Return the absolute path of ``filename`` inside the ``data`` folder.

    The ``data`` folder is resolved two directory levels above the directory
    that contains this module.

    Args:
        filename: Name of a file in the data folder, e.g. ``"diabetes.csv"``.

    Returns:
        The normalized absolute path as a string. The path is only built,
        not checked for existence.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.abspath(
        os.path.join(module_dir, os.path.pardir, os.path.pardir,
                     "data", filename))
41+
42+
3343
# number of features
34-
n_columns = 10
44+
expected_columns = 10
3545

3646
# distribution of features in the training set
3747
historical_mean = np.array(
@@ -65,60 +75,50 @@
6575
]
6676
)
6777

68-
# maximal relative change in feature mean or standrd deviation that we can tolerate
78+
# maximal relative change in feature mean or standard deviation
79+
# that we can tolerate
6980
shift_tolerance = 3
7081

7182

72-
def check_schema(X):
73-
n_actual_columns = X.shape[1]
74-
if n_actual_columns != n_columns:
75-
print(
76-
"Error: found {} feature columns. The data should have {} feature columns.".format(
77-
n_actual_columns, n_columns
78-
)
79-
)
80-
return False
81-
82-
return True
83-
84-
85-
def check_missing_values(dataset):
83+
def test_check_schema():
    """The well-formed dataset must have the expected number of features."""
    csv_path = get_absPath("diabetes.csv")
    # the data file has to be present before it can be inspected
    assert os.path.exists(csv_path)
    frame = pd.read_csv(csv_path)
    # every column except the last one is counted as a feature column
    feature_names = frame.columns[:-1]
    n_features = frame[feature_names].shape[1]
    assert n_features == expected_columns
92+
93+
94+
def test_check_bad_schema():
    """The bad-schema dataset must NOT have the expected feature count."""
    csv_path = get_absPath("diabetes_bad_schema.csv")
    # the data file has to be present before it can be inspected
    assert os.path.exists(csv_path)
    frame = pd.read_csv(csv_path)
    # every column except the last one is counted as a feature column
    feature_names = frame.columns[:-1]
    n_features = frame[feature_names].shape[1]
    # this fixture is deliberately malformed, so a mismatch is the pass case
    assert n_features != expected_columns
103+
104+
105+
def test_check_missing_values():
    """The missing-values dataset must contain at least one NaN entry."""
    csv_path = get_absPath("diabetes_missing_values.csv")
    # the data file has to be present before it can be inspected
    assert os.path.exists(csv_path)
    frame = pd.read_csv(csv_path)
    # count NaN cells over the whole table
    nan_mask = np.isnan(frame.values)
    missing_total = np.sum(nan_mask)
    assert missing_total > 0
91112

92113

93-
def check_distribution(dataset):
114+
def test_check_distribution():
    """Check that ``diabetes_bad_dist.csv`` is detected as shifted data.

    The test passes when at least one column's mean or standard deviation
    differs from its historical value by more than ``shift_tolerance`` times
    the magnitude of that historical value.
    """
    datafile = get_absPath("diabetes_bad_dist.csv")
    # check that file exists
    assert os.path.exists(datafile)
    dataset = pd.read_csv(datafile)
    values = dataset.values

    mean = np.mean(values, axis=0)
    # Fix: this used to be np.mean, so the "std" check silently compared
    # column means against historical_std.
    # NOTE(review): np.std defaults to the population std (ddof=0); confirm
    # this matches how historical_std was computed.
    std = np.std(values, axis=0)

    mean_shifted = (np.abs(mean - historical_mean) >
                    shift_tolerance * np.abs(historical_mean))
    std_shifted = (np.abs(std - historical_std) >
                   shift_tolerance * np.abs(historical_std))

    # Explicit any() on both masks; the earlier `np.sum(a) or np.sum(b) > 0`
    # mixed truthiness with a comparison because of operator precedence.
    assert mean_shifted.any() or std_shifted.any()

0 commit comments

Comments
 (0)