|
16 | 16 | THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS |
17 | 17 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | | -MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 19 | +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, |
| 20 | +
|
| 21 | +INDIRECT, INCIDENTAL, |
20 | 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
21 | 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
22 | 24 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
23 | 25 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
24 | 26 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE |
25 | 27 | POSSIBILITY OF SUCH DAMAGE. |
26 | 28 | """ |
27 | | - |
28 | | -import sys |
29 | 29 | import os |
30 | 30 | import numpy as np |
31 | 31 | import pandas as pd |
32 | 32 |
|
| 33 | + |
| 34 | +# get absolute path of csv files from data folder |
| 35 | +def get_absPath(filename): |
| 36 | + """Returns the absolute path of the given file in the data folder""" |
| 37 | + path = os.path.abspath( |
| 38 | + os.path.join(os.path.dirname(__file__), os.path.pardir, |
| 39 | + os.path.pardir, "data", filename)) |
| 40 | + return path |
| 41 | + |
| 42 | + |
33 | 43 | # number of features |
34 | | -n_columns = 10 |
| 44 | +expected_columns = 10 |
35 | 45 |
|
36 | 46 | # distribution of features in the training set |
37 | 47 | historical_mean = np.array( |
|
65 | 75 | ] |
66 | 76 | ) |
67 | 77 |
|
68 | | -# maximal relative change in feature mean or standard deviation that we can tolerate |
| 78 | +# maximal relative change in feature mean or standard deviation |
| 79 | +# that we can tolerate |
69 | 80 | shift_tolerance = 3 |
70 | 81 |
|
71 | 82 |
|
72 | | -def check_schema(X): |
73 | | - n_actual_columns = X.shape[1] |
74 | | - if n_actual_columns != n_columns: |
75 | | - print( |
76 | | - "Error: found {} feature columns. The data should have {} feature columns.".format( |
77 | | - n_actual_columns, n_columns |
78 | | - ) |
79 | | - ) |
80 | | - return False |
81 | | - |
82 | | - return True |
83 | | - |
84 | | - |
85 | | -def check_missing_values(dataset): |
| 83 | +def test_check_schema(): |
| 84 | + datafile = get_absPath("diabetes.csv") |
| 85 | + # check that file exists |
| 86 | + assert(os.path.exists(datafile)) |
| 87 | + dataset = pd.read_csv(datafile) |
| 88 | + header = dataset[dataset.columns[:-1]] |
| 89 | + actual_columns = header.shape[1] |
| 90 | + # check header has expected number of columns |
| 91 | + assert(actual_columns == expected_columns) |
| 92 | + |
| 93 | + |
| 94 | +def test_check_bad_schema(): |
| 95 | + datafile = get_absPath("diabetes_bad_schema.csv") |
| 96 | + # check that file exists |
| 97 | + assert(os.path.exists(datafile)) |
| 98 | + dataset = pd.read_csv(datafile) |
| 99 | + header = dataset[dataset.columns[:-1]] |
| 100 | + actual_columns = header.shape[1] |
| 101 | + # check header does NOT have the expected number of columns |
| 102 | + assert(actual_columns != expected_columns) |
| 103 | + |
| 104 | + |
| 105 | +def test_check_missing_values(): |
| 106 | + datafile = get_absPath("diabetes_missing_values.csv") |
| 107 | + # check that file exists |
| 108 | + assert(os.path.exists(datafile)) |
| 109 | + dataset = pd.read_csv(datafile) |
86 | 110 | n_nan = np.sum(np.isnan(dataset.values)) |
87 | | - if n_nan > 0: |
88 | | - print("Warning: the data has {} missing values".format(n_nan)) |
89 | | - return False |
90 | | - return True |
| 111 | + assert(n_nan > 0) |
91 | 112 |
|
92 | 113 |
|
93 | | -def check_distribution(dataset): |
| 114 | +def test_check_distribution(): |
| 115 | + datafile = get_absPath("diabetes_bad_dist.csv") |
| 116 | + # check that file exists |
| 117 | + assert(os.path.exists(datafile)) |
| 118 | + dataset = pd.read_csv(datafile) |
94 | 119 | mean = np.mean(dataset.values, axis=0) |
95 | 120 | std = np.mean(dataset.values, axis=0) |
96 | | - if ( |
97 | | - np.sum(abs(mean - historical_mean) > shift_tolerance * abs(historical_mean)) > 0 |
98 | | - or np.sum(abs(std - historical_std) > shift_tolerance * abs(historical_std)) > 0 |
99 | | - ): |
100 | | - print("Warning: new data has different distribution than the training data") |
101 | | - return False |
102 | | - return True |
103 | | - |
104 | | - |
105 | | -def main(): |
106 | | - filename = sys.argv[1] |
107 | | - if not os.path.exists(filename): |
108 | | - print("Error: The file {} does not exist".format(filename)) |
109 | | - return |
110 | | - |
111 | | - dataset = pd.read_csv(filename) |
112 | | - if check_schema(dataset[dataset.columns[:-1]]): |
113 | | - print("Data schema test succeeded") |
114 | | - if check_missing_values(dataset) and check_distribution(dataset): |
115 | | - print("Missing values test passed") |
116 | | - print("Data distribution test passed") |
117 | | - else: |
118 | | - print( |
119 | | - "There might be some issues with the data. Please check warning messages." |
120 | | - ) |
121 | | - |
122 | | - |
123 | | -if __name__ == "__main__": |
124 | | - main() |
| 121 | + assert(np.sum(abs(mean - historical_mean) > shift_tolerance * |
| 122 | + abs(historical_mean)) or |
| 123 | + np.sum(abs(std - historical_std) > shift_tolerance * |
| 124 | + abs(historical_std)) > 0) |
0 commit comments