1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15-
16- from sdp .processors .base_processor import BaseProcessor , BaseParallelProcessor , DataEntry
1715import json
18- import ndjson
16+ from sdp .processors .base_processor import BaseProcessor , BaseParallelProcessor , DataEntry
17+ from sdp .utils .common import load_manifest , save_manifest
1918from nemo_text_processing .inverse_text_normalization .inverse_normalize import InverseNormalizer
2019from nemo .collections .nlp .models import PunctuationCapitalizationModel
2120
@@ -47,11 +46,10 @@ def __init__(self,
4746 self .normalizer = InverseNormalizer (lang = language )
4847
4948 def read_manifest (self ):
50- ''' Reads metadata from NDJSON file in the input manifest
49+ ''' Reads metadata from JSONL file in the input manifest
5150 and converts it to data entries '''
5251
53- with open (self .input_manifest_file , "r" , encoding = "utf8" ) as fin :
54- dataset_entries = ndjson .load (fin )
52+ dataset_entries = load_manifest (self .input_manifest_file , encoding = "utf8" )
5553
5654 return dataset_entries
5755
@@ -102,8 +100,7 @@ def __init__(self,
102100 self .pnc_model .cuda ()
103101
104102 def process (self ):
105- with open (self .input_manifest_file ) as f :
106- manifest = ndjson .load (f )
103+ manifest = load_manifest (self .input_manifest_file )
107104
108105 results = []
109106 all_text = []
@@ -123,8 +120,7 @@ def process(self):
123120 i += 1
124121 results .append (metadata )
125122
126- with open (self .output_manifest_file , 'w' ) as f :
127- ndjson .dump (results , f )
123+ save_manifest (results , self .output_manifest_file )
128124
129125class PunctuationAndCapitalizationProcessor (BaseProcessor ):
130126 """This processor performs punctuation and capitalization on text data.
@@ -163,8 +159,7 @@ def __init__(self,
163159 self .pnc_model .cuda ()
164160
165161 def process (self ):
166- with open (self .input_manifest_file ) as f :
167- manifest = ndjson .load (f )
162+ manifest = load_manifest (self .input_manifest_file )
168163
169164 all_text = []
170165
0 commit comments