Skip to content

Commit 5f97020

Browse files
Make combine_corpus only look at folders with a corpus description (#290)
Currently, the combine-corpus tooling iterates over every file and folder in a directory and assumes that each one is a folder containing a corpus_description.json. We don't actually throw an error when that assumption fails (we only log one), but it makes sense to be more robust and avoid logging an error when we iterate over something that is not a corpus folder, such as a log file.
1 parent c7bf806 commit 5f97020

File tree

2 files changed

+35
-9
lines changed

2 files changed

+35
-9
lines changed

compiler_opt/tools/combine_training_corpus_lib.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,13 @@ def combine_corpus(root_dir: str) -> None:
2828
module_names = []
2929
output_corpus_description = {}
3030

31-
for sub_dir in tf.io.gfile.listdir(root_dir):
32-
path = os.path.join(root_dir, sub_dir, _FILE_NAME)
31+
corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
32+
for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
33+
logging.info('processing %s', corpus_description_path)
3334

34-
logging.info('processing %s', path)
35-
36-
if not tf.io.gfile.exists(path):
37-
logging.error('%s does not exist.', path)
38-
continue
39-
40-
with tf.io.gfile.GFile(path, 'r') as f:
35+
with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
4136
corpus_description = json.load(f)
37+
sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
4238
module_names.extend([
4339
os.path.join(sub_dir, name) for name in corpus_description['modules']
4440
])

compiler_opt/tools/combine_training_corpus_test.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,36 @@ def test_combine_corpus(self):
5454
self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
5555
self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
5656

57+
def test_empty_folder(self):
58+
corpus_dir = self.create_tempdir()
59+
subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
60+
_ = corpus_dir.mkdir(dir_path='empty_dir')
61+
subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
62+
subcorpus1_description_file = subcorpus1_dir.create_file(
63+
file_path='corpus_description.json')
64+
subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
65+
combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
66+
with open(
67+
os.path.join(corpus_dir, 'corpus_description.json'),
68+
encoding='utf-8') as combined_corpus_description_file:
69+
combined_corpus_description = json.load(combined_corpus_description_file)
70+
self.assertLen(combined_corpus_description['modules'], 2)
71+
72+
def test_ignore_extra_file(self):
73+
corpus_dir = self.create_tempdir()
74+
subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
75+
_ = corpus_dir.create_file(file_path='empty.log')
76+
subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
77+
subcorpus1_description_file = subcorpus1_dir.create_file(
78+
file_path='corpus_description.json')
79+
subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
80+
combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
81+
with open(
82+
os.path.join(corpus_dir, 'corpus_description.json'),
83+
encoding='utf-8') as combined_corpus_description_file:
84+
combined_corpus_description = json.load(combined_corpus_description_file)
85+
self.assertLen(combined_corpus_description['modules'], 2)
86+
5787
def test_different_corpora(self):
5888
corpus_dir = self.create_tempdir()
5989
subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')

0 commit comments

Comments
 (0)