14
14
# limitations under the License.
15
15
r"""Combine multiple training corpora into a single training corpus.
16
16
17
+ Currently this only supports the case where all corpora share the same
18
+ configurables except the "modules" field.
19
+
17
20
Usage: we'd like to combine training corpora corpus1 and corpus2 into
18
21
combinedcorpus; we first structure the files as follows:
19
22
27
30
compiler_opt/tools/combine_training_corpus.py \
28
31
--root_dir=$PATH_TO_combinedcorpus
29
32
30
- generates combinedcorpus/module_path file. In this way corpus1 and
31
- corpus2 are combined into combinedcorpus.
33
+ generates combinedcorpus/corpus_description.json file. In this way corpus1
34
+ and corpus2 are combined into combinedcorpus.
32
35
"""
33
36
37
+ import json
34
38
import os
35
39
36
40
from absl import app
43
47
44
48
FLAGS = flags .FLAGS
45
49
46
- _FILE_NAME = 'module_paths '
50
+ _FILE_NAME = 'corpus_description.json '
47
51
48
52
49
53
def main (argv ):
50
54
if len (argv ) > 1 :
51
55
raise app .UsageError ('Too many command-line arguments.' )
52
56
53
57
module_names = []
58
+ output_corpus_description = {}
54
59
55
60
for sub_dir in tf .io .gfile .listdir (FLAGS .root_dir ):
56
61
path = os .path .join (FLAGS .root_dir , sub_dir , _FILE_NAME )
@@ -62,12 +67,20 @@ def main(argv):
62
67
continue
63
68
64
69
with tf .io .gfile .GFile (path , 'r' ) as f :
65
- module_names .extend (
66
- [os .path .join (sub_dir , name .rstrip ('\n ' )) for name in f ])
70
+ corpus_description = json .load (f )
71
+ module_names .extend ([
72
+ os .path .join (sub_dir , name ) for name in corpus_description ['modules' ]
73
+ ])
74
+ del corpus_description ['modules' ]
75
+ if len (output_corpus_description ) == 0 :
76
+ output_corpus_description = corpus_description
77
+ elif corpus_description != output_corpus_description :
78
+ raise ValueError ('Input corpora differ more than modules.' )
79
+
80
+ output_corpus_description ['modules' ] = module_names
67
81
68
82
with tf .io .gfile .GFile (os .path .join (FLAGS .root_dir , _FILE_NAME ), 'w' ) as f :
69
- for module in module_names :
70
- f .write (module + '\n ' )
83
+ json .dump (output_corpus_description , f , indent = 2 )
71
84
72
85
73
86
if __name__ == '__main__' :
0 commit comments