@@ -23,20 +23,28 @@ def convert_json_to_brat(jsonfile, outdir):
2323
2424 # Iterate over documents
2525 for d in docs :
26- if 'ner' not in d ['metadata' ]:
27- print 'No named entities found for' , d ['file' ]
28- continue
29-
30- # Output relevant annotations into a brat .ann file
31- ners = d ['metadata' ]['ner' ]
3226 res_name = d ['metadata' ]['resourceName' ]
3327 if type (res_name ) == list :
3428 # Sometimes Tika returns this as something like
3529 # "resourceName": ["2005_1725.pdf", "High Quality.joboptions"]
3630 res_name = res_name [0 ]
31+
32+ # Output text into a .txt file
33+ text = d ['content_ann_s' ]
34+ outfn = os .path .join (outdir , res_name [:- 4 ] + '.txt' )
35+ with io .open (outfn , 'w' , encoding = 'utf8' ) as outf :
36+ print ('Writing text to %s' % outfn )
37+ outf .write (text + '\n ' )
38+
39+ if 'ner' not in d ['metadata' ]:
40+ print ('No named entities found for %s' % d ['file' ])
41+ continue
42+
43+ # Output relevant annotations into a brat .ann file
44+ ners = d ['metadata' ]['ner' ]
3745 outfn = os .path .join (outdir , res_name [:- 4 ] + '.ann' )
3846 outf = io .open (outfn , 'w' , encoding = 'utf8' )
39- print 'Writing to' , outfn
47+ print ( 'Writing annotations to %s' % outfn )
4048 for (t , n ) in enumerate (ners ):
4149 outf .write ('T%d\t %s %s %s\t %s\n ' % \
4250 (t + 1 , n ['label' ], n ['begin' ], n ['end' ], n ['text' ]))
0 commit comments