Skip to content

Commit 56fcc00

Browse files
Neeratyoymfeurer
authored andcommitted
Delete files uploaded to test server during unit testing (#735)
* Collecting and cleaning unit test dump * Adding session level fixture with yield to delay deletion of files * Adding PEP8 ignore F401 * Changelog update + pytest argument fix * Messy first draft of possible designs * Leaner implementation without additional imports * Reordering flows to delete subflows later * Updating with design changes for tracking files for deletion * Handling edge cases * Fixing unit test git status * Fixing PEP8 issues * FIxing type annotation * Logging and leaner flow * Fixing PEP8 and unit test errors * Fixing test cases; Renaming function * Fixing clustering task unit test * Updating docs for unit test deletion
1 parent 88b87ad commit 56fcc00

File tree

16 files changed

+293
-48
lines changed

16 files changed

+293
-48
lines changed

CONTRIBUTING.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ following rules before you submit a pull request:
8181
Drafts often benefit from the inclusion of a
8282
[task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments)
8383
in the PR description.
84+
85+
- Add [unit tests](https://github.com/openml/openml-python/tree/develop/tests) and [examples](https://github.com/openml/openml-python/tree/develop/examples) for any new functionality being introduced.
86+
- If an unit test contains an upload to the test server, please ensure that it is followed by a file collection for deletion, to prevent the test server from bulking up. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`.
87+
- Please ensure that the example is run on the test server by beginning with the call to `openml.config.start_using_configuration_for_example()`.
8488

8589
- All tests pass when running `pytest`. On
8690
Unix-like systems, check with (from the toplevel source folder):

PULL_REQUEST_TEMPLATE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ Please make sure that:
99
* for any new function or class added, please add it to doc/api.rst
1010
* the list of classes and functions should be alphabetical
1111
* for any new functionality, consider adding a relevant example
12+
* add unit tests for new functionalities
13+
* collect files uploaded to test server using _mark_entity_for_removal()
1214
-->
1315

1416
#### Reference Issue

doc/progress.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,17 @@ Changelog
88

99
0.10.0
1010
~~~~~~
11-
* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
11+
* FIX #261: Test server is cleared of all files uploaded during unit testing.
12+
* FIX #447: All files created by unit tests no longer persist in local.
1213
* FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
13-
* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
14+
* FIX #447: All files created by unit tests are deleted after the completion of all unit tests.
1415
* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset.
16+
* FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
1517
* DOC #639: More descriptive documention for function to convert array format.
1618
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
1719
* ADD #695: A function to retrieve all the data quality measures available.
18-
* FIX #447: All files created by unit tests are deleted after the completion of all unit tests.
20+
* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
21+
* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
1922
* MAINT #726: Update examples to remove deprecation warnings from scikit-learn
2023

2124
0.9.0

openml/testing.py

Lines changed: 87 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from openml.tasks import TaskTypeEnum
1919

2020
import pytest
21+
import logging
2122

2223

2324
class TestBase(unittest.TestCase):
@@ -28,6 +29,18 @@ class TestBase(unittest.TestCase):
2829
Currently hard-codes a read-write key.
2930
Hopefully soon allows using a test server, not the production server.
3031
"""
32+
publish_tracker = {'run': [], 'data': [], 'flow': [], 'task': [],
33+
'study': [], 'user': []} # type: dict
34+
test_server = "https://test.openml.org/api/v1/xml"
35+
# amueller's read/write key that he will throw away later
36+
apikey = "610344db6388d9ba34f6db45a3cf71de"
37+
38+
# creating logger for unit test file deletion status
39+
logger = logging.getLogger("unit_tests")
40+
logger.setLevel(logging.INFO)
41+
fh = logging.FileHandler('TestBase.log')
42+
fh.setLevel(logging.INFO)
43+
logger.addHandler(fh)
3144

3245
def setUp(self, n_levels: int = 1):
3346
"""Setup variables and temporary directories.
@@ -46,6 +59,7 @@ def setUp(self, n_levels: int = 1):
4659
Number of nested directories the test is in. Necessary to resolve the path to the
4760
``files`` directory, which is located directly under the ``tests`` directory.
4861
"""
62+
4963
# This cache directory is checked in to git to simulate a populated
5064
# cache
5165
self.maxDiff = None
@@ -71,12 +85,9 @@ def setUp(self, n_levels: int = 1):
7185
os.chdir(self.workdir)
7286

7387
self.cached = True
74-
# amueller's read/write key that he will throw away later
75-
openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
88+
openml.config.apikey = TestBase.apikey
7689
self.production_server = "https://openml.org/api/v1/xml"
77-
self.test_server = "https://test.openml.org/api/v1/xml"
78-
79-
openml.config.server = self.test_server
90+
openml.config.server = TestBase.test_server
8091
openml.config.avoid_duplicate_runs = False
8192
openml.config.cache_directory = self.workdir
8293

@@ -87,7 +98,7 @@ def setUp(self, n_levels: int = 1):
8798
with open(openml.config.config_file, 'w') as fh:
8899
fh.write('apikey = %s' % openml.config.apikey)
89100

90-
# Increase the number of retries to avoid spurios server failures
101+
# Increase the number of retries to avoid spurious server failures
91102
self.connection_n_retries = openml.config.connection_n_retries
92103
openml.config.connection_n_retries = 10
93104

@@ -104,9 +115,43 @@ def tearDown(self):
104115
openml.config.server = self.production_server
105116
openml.config.connection_n_retries = self.connection_n_retries
106117

118+
@classmethod
119+
def _mark_entity_for_removal(self, entity_type, entity_id):
120+
""" Static record of entities uploaded to test server
121+
122+
Dictionary of lists where the keys are 'entity_type'.
123+
Each such dictionary is a list of integer IDs.
124+
For entity_type='flow', each list element is a tuple
125+
of the form (Flow ID, Flow Name).
126+
"""
127+
if entity_type not in TestBase.publish_tracker:
128+
TestBase.publish_tracker[entity_type] = [entity_id]
129+
else:
130+
TestBase.publish_tracker[entity_type].append(entity_id)
131+
132+
@classmethod
133+
def _delete_entity_from_tracker(self, entity_type, entity):
134+
""" Deletes entity records from the static file_tracker
135+
136+
Given an entity type and corresponding ID, deletes all entries, including
137+
duplicate entries of the ID for the entity type.
138+
"""
139+
if entity_type in TestBase.publish_tracker:
140+
# removes duplicate entries
141+
TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type]))
142+
if entity_type == 'flow':
143+
delete_index = [i for i, (id_, _) in
144+
enumerate(TestBase.publish_tracker[entity_type])
145+
if id_ == entity][0]
146+
else:
147+
delete_index = [i for i, id_ in
148+
enumerate(TestBase.publish_tracker[entity_type])
149+
if id_ == entity][0]
150+
TestBase.publish_tracker[entity_type].pop(delete_index)
151+
107152
@pytest.fixture(scope="session", autouse=True)
108153
def _cleanup_fixture(self):
109-
"""Cleans up files generated by Unit tests
154+
"""Cleans up files generated by unit tests
110155
111156
This function is called at the beginning of the invocation of
112157
TestBase (defined below), by each of class that inherits TestBase.
@@ -125,7 +170,6 @@ def _cleanup_fixture(self):
125170
else:
126171
static_cache_dir = os.path.join(static_cache_dir, '../')
127172
directory = os.path.join(static_cache_dir, 'tests/files/')
128-
# directory = "{}/tests/files/".format(static_cache_dir)
129173
files = os.walk(directory)
130174
old_file_list = []
131175
for root, _, filenames in files:
@@ -135,17 +179,51 @@ def _cleanup_fixture(self):
135179
# pauses the code execution here till all tests in the 'session' is over
136180
yield
137181
# resumes from here after all collected tests are completed
182+
183+
#
184+
# Local file deletion
185+
#
138186
files = os.walk(directory)
139187
new_file_list = []
140188
for root, _, filenames in files:
141189
for filename in filenames:
142190
new_file_list.append(os.path.join(root, filename))
143191
# filtering the files generated during this run
144192
new_file_list = list(set(new_file_list) - set(old_file_list))
145-
print("Files to delete in local: {}".format(new_file_list))
146193
for file in new_file_list:
147194
os.remove(file)
148195

196+
#
197+
# Test server deletion
198+
#
199+
openml.config.server = TestBase.test_server
200+
openml.config.apikey = TestBase.apikey
201+
202+
# legal_entities defined in openml.utils._delete_entity - {'user'}
203+
entity_types = {'run', 'data', 'flow', 'task', 'study'}
204+
# 'run' needs to be first entity to allow other dependent entities to be deleted
205+
# cloning file tracker to allow deletion of entries of deleted files
206+
tracker = TestBase.publish_tracker.copy()
207+
208+
# reordering to delete sub flows at the end of flows
209+
# sub-flows have shorter names, hence, sorting by descending order of flow name length
210+
if 'flow' in entity_types:
211+
flow_deletion_order = [entity_id for entity_id, _ in
212+
sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)]
213+
tracker['flow'] = flow_deletion_order
214+
215+
# deleting all collected entities published to test server
216+
for entity_type in entity_types:
217+
for i, entity in enumerate(tracker[entity_type]):
218+
try:
219+
openml.utils._delete_entity(entity_type, entity)
220+
TestBase.logger.info("Deleted ({}, {})".format(entity_type, entity))
221+
# deleting actual entry from tracker
222+
TestBase._delete_entity_from_tracker(entity_type, entity)
223+
except Exception as e:
224+
TestBase.logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e))
225+
TestBase.logger.info("End of cleanup_fixture from {}".format(self.__class__))
226+
149227
def _get_sentinel(self, sentinel=None):
150228
if sentinel is None:
151229
# Create a unique prefix for the flow. Necessary because the flow

tests/test_datasets/test_dataset_functions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,9 @@ def test_publish_dataset(self):
478478
data_file=file_path,
479479
)
480480
dataset.publish()
481+
TestBase._mark_entity_for_removal('data', dataset.dataset_id)
482+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
483+
dataset.dataset_id))
481484
self.assertIsInstance(dataset.dataset_id, int)
482485

483486
def test__retrieve_class_labels(self):
@@ -498,6 +501,9 @@ def test_upload_dataset_with_url(self):
498501
url="https://www.openml.org/data/download/61/dataset_61_iris.arff",
499502
)
500503
dataset.publish()
504+
TestBase._mark_entity_for_removal('data', dataset.dataset_id)
505+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
506+
dataset.dataset_id))
501507
self.assertIsInstance(dataset.dataset_id, int)
502508

503509
def test_data_status(self):
@@ -507,6 +513,9 @@ def test_data_status(self):
507513
version=1,
508514
url="https://www.openml.org/data/download/61/dataset_61_iris.arff")
509515
dataset.publish()
516+
TestBase._mark_entity_for_removal('data', dataset.dataset_id)
517+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
518+
dataset.dataset_id))
510519
did = dataset.dataset_id
511520

512521
# admin key for test server (only adminds can activate datasets.
@@ -620,6 +629,9 @@ def test_create_dataset_numpy(self):
620629
)
621630

622631
upload_did = dataset.publish()
632+
TestBase._mark_entity_for_removal('data', upload_did)
633+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
634+
upload_did))
623635

624636
self.assertEqual(
625637
_get_online_dataset_arff(upload_did),
@@ -682,6 +694,9 @@ def test_create_dataset_list(self):
682694
)
683695

684696
upload_did = dataset.publish()
697+
TestBase._mark_entity_for_removal('data', upload_did)
698+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
699+
upload_did))
685700
self.assertEqual(
686701
_get_online_dataset_arff(upload_did),
687702
dataset._dataset,
@@ -725,6 +740,9 @@ def test_create_dataset_sparse(self):
725740
)
726741

727742
upload_did = xor_dataset.publish()
743+
TestBase._mark_entity_for_removal('data', upload_did)
744+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
745+
upload_did))
728746
self.assertEqual(
729747
_get_online_dataset_arff(upload_did),
730748
xor_dataset._dataset,
@@ -762,6 +780,9 @@ def test_create_dataset_sparse(self):
762780
)
763781

764782
upload_did = xor_dataset.publish()
783+
TestBase._mark_entity_for_removal('data', upload_did)
784+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
785+
upload_did))
765786
self.assertEqual(
766787
_get_online_dataset_arff(upload_did),
767788
xor_dataset._dataset,
@@ -885,6 +906,9 @@ def test_create_dataset_pandas(self):
885906
paper_url=paper_url
886907
)
887908
upload_did = dataset.publish()
909+
TestBase._mark_entity_for_removal('data', upload_did)
910+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
911+
upload_did))
888912
self.assertEqual(
889913
_get_online_dataset_arff(upload_did),
890914
dataset._dataset,
@@ -919,6 +943,9 @@ def test_create_dataset_pandas(self):
919943
paper_url=paper_url
920944
)
921945
upload_did = dataset.publish()
946+
TestBase._mark_entity_for_removal('data', upload_did)
947+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
948+
upload_did))
922949
self.assertEqual(
923950
_get_online_dataset_arff(upload_did),
924951
dataset._dataset,
@@ -955,6 +982,9 @@ def test_create_dataset_pandas(self):
955982
paper_url=paper_url
956983
)
957984
upload_did = dataset.publish()
985+
TestBase._mark_entity_for_removal('data', upload_did)
986+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
987+
upload_did))
958988
downloaded_data = _get_online_dataset_arff(upload_did)
959989
self.assertEqual(
960990
downloaded_data,
@@ -1123,6 +1153,9 @@ def test___publish_fetch_ignore_attribute(self):
11231153

11241154
# publish dataset
11251155
upload_did = dataset.publish()
1156+
TestBase._mark_entity_for_removal('data', upload_did)
1157+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
1158+
upload_did))
11261159
# test if publish was successful
11271160
self.assertIsInstance(upload_did, int)
11281161
# variables to carry forward for test_publish_fetch_ignore_attribute()
@@ -1253,6 +1286,9 @@ def test_create_dataset_row_id_attribute_inference(self):
12531286
)
12541287
self.assertEqual(dataset.row_id_attribute, output_row_id)
12551288
upload_did = dataset.publish()
1289+
TestBase._mark_entity_for_removal('data', upload_did)
1290+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
1291+
upload_did))
12561292
arff_dataset = arff.loads(_get_online_dataset_arff(upload_did))
12571293
arff_data = np.array(arff_dataset['data'], dtype=object)
12581294
# if we set the name of the index then the index will be added to

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,8 @@ def test_openml_param_name_to_sklearn(self):
11261126
task = openml.tasks.get_task(115)
11271127
run = openml.runs.run_flow_on_task(flow, task)
11281128
run = run.publish()
1129+
TestBase._mark_entity_for_removal('run', run.run_id)
1130+
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
11291131
run = openml.runs.get_run(run.run_id)
11301132
setup = openml.setups.get_setup(run.setup_id)
11311133

0 commit comments

Comments
 (0)