Skip to content

Commit 7175852

Browse files
authored
fix combine_training_corpus.py to use corpus_description.json (#97)
* have combine_training_corpus.py use corpus_description * keep empty line. * yapf fix * raise error if input corpora differ more than modules. * fix pylint * fix
1 parent 40dddb1 commit 7175852

File tree

1 file changed

+20
-7
lines changed

1 file changed

+20
-7
lines changed

compiler_opt/tools/combine_training_corpus.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
# limitations under the License.
1515
r"""Combine multiple training corpora into a single training corpus.
1616
17+
Currently only supports the case where all input corpora share the same
18+
configurables except the "modules" field.
19+
1720
Usage: we'd like to combine training corpora corpus1 and corpus2 into
1821
combinedcorpus; we first structure the files as follows:
1922
@@ -27,10 +30,11 @@
2730
compiler_opt/tools/combine_training_corpus.py \
2831
--root_dir=$PATH_TO_combinedcorpus
2932
30-
generates combinedcorpus/module_path file. In this way corpus1 and
31-
corpus2 are combined into combinedcorpus.
33+
generates the combinedcorpus/corpus_description.json file. In this way corpus1
34+
and corpus2 are combined into combinedcorpus.
3235
"""
3336

37+
import json
3438
import os
3539

3640
from absl import app
@@ -43,14 +47,15 @@
4347

4448
FLAGS = flags.FLAGS
4549

46-
_FILE_NAME = 'module_paths'
50+
_FILE_NAME = 'corpus_description.json'
4751

4852

4953
def main(argv):
5054
if len(argv) > 1:
5155
raise app.UsageError('Too many command-line arguments.')
5256

5357
module_names = []
58+
output_corpus_description = {}
5459

5560
for sub_dir in tf.io.gfile.listdir(FLAGS.root_dir):
5661
path = os.path.join(FLAGS.root_dir, sub_dir, _FILE_NAME)
@@ -62,12 +67,20 @@ def main(argv):
6267
continue
6368

6469
with tf.io.gfile.GFile(path, 'r') as f:
65-
module_names.extend(
66-
[os.path.join(sub_dir, name.rstrip('\n')) for name in f])
70+
corpus_description = json.load(f)
71+
module_names.extend([
72+
os.path.join(sub_dir, name) for name in corpus_description['modules']
73+
])
74+
del corpus_description['modules']
75+
if len(output_corpus_description) == 0:
76+
output_corpus_description = corpus_description
77+
elif corpus_description != output_corpus_description:
78+
raise ValueError('Input corpora differ more than modules.')
79+
80+
output_corpus_description['modules'] = module_names
6781

6882
with tf.io.gfile.GFile(os.path.join(FLAGS.root_dir, _FILE_NAME), 'w') as f:
69-
for module in module_names:
70-
f.write(module + '\n')
83+
json.dump(output_corpus_description, f, indent=2)
7184

7285

7386
if __name__ == '__main__':

0 commit comments

Comments
 (0)