Skip to content

Commit 5b256a6

Browse files
committed
Output .txt (content_ann_s) along with .ann. (#16)
1 parent b69e1c0 commit 5b256a6

File tree

1 file changed

+15
-7
lines changed

1 file changed

+15
-7
lines changed

src/parserindexer/json2brat.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -23,20 +23,28 @@ def convert_json_to_brat(jsonfile, outdir):
23 23

24 24
# Iterate over documents
25 25
for d in docs:
26-
if 'ner' not in d['metadata']:
27-
print 'No named entities found for', d['file']
28-
continue
29-
30-
# Output relevant annotations into a brat .ann file
31-
ners = d['metadata']['ner']
32 26
res_name = d['metadata']['resourceName']
33 27
if type(res_name) == list:
34 28
# Sometimes Tika returns this as something like
35 29
# "resourceName": ["2005_1725.pdf", "High Quality.joboptions"]
36 30
res_name = res_name[0]
31+
32+
# Output text into a .txt file
33+
text = d['content_ann_s']
34+
outfn = os.path.join(outdir, res_name[:-4] + '.txt')
35+
with io.open(outfn, 'w', encoding='utf8') as outf:
36+
print('Writing text to %s' % outfn)
37+
outf.write(text + '\n')
38+
39+
if 'ner' not in d['metadata']:
40+
print('No named entities found for %s' % d['file'])
41+
continue
42+
43+
# Output relevant annotations into a brat .ann file
44+
ners = d['metadata']['ner']
37 45
outfn = os.path.join(outdir, res_name[:-4] + '.ann')
38 46
outf = io.open(outfn, 'w', encoding='utf8')
39-
print 'Writing to', outfn
47+
print('Writing annotations to %s' % outfn)
40 48
for (t, n) in enumerate(ners):
41 49
outf.write('T%d\t%s %s %s\t%s\n' % \
42 50
(t+1, n['label'], n['begin'], n['end'], n['text']))

0 commit comments

Comments
 (0)