-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_psm_annotation.py
More file actions
executable file
·85 lines (77 loc) · 2.88 KB
/
extract_psm_annotation.py
File metadata and controls
executable file
·85 lines (77 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3
# utilities for dealing with LRLPs
import argparse
import codecs
import sys
import os
import re
import os.path
from zipfile import ZipFile as zf
import xml.etree.ElementTree as ET
import gzip
import os
scriptdir = os.path.dirname(os.path.abspath(__file__))
import datetime
from io import TextIOWrapper
# Scrape monolingual psms for posts and headlines (elsewhere)
def _attribute_value(node, name):
    """Return the ``value`` of the ``<attribute name=NAME>`` child of *node*.

    The literal string "None" is returned when the child element, or its
    ``value`` attribute, is absent, so that every output row keeps the same
    number of tab-separated columns.
    """
    attr = node.find("attribute[@name='%s']" % name)
    if attr is None:
        return "None"
    return attr.get("value", "None")


def _extract_psm_records(xobj):
    """Collect headline and post annotations from a parsed psm document.

    Returns a ``(headlines, posts)`` pair.  Each headline is a
    ``(begin_offset, char_length)`` tuple; each post is a
    ``(begin_offset, char_length, author, datetime)`` tuple.  Values are the
    raw attribute strings taken from the ``string`` elements directly under
    the document root.
    """
    headlines = [(x.get("begin_offset"), x.get("char_length"))
                 for x in xobj.findall("string[@type='headline']")]
    posts = [(x.get("begin_offset"),
              x.get("char_length"),
              _attribute_value(x, "author"),
              _attribute_value(x, "datetime"))
             for x in xobj.findall("string[@type='post']")]
    return headlines, posts


def main():
    """Extract headline/post annotations from psm files inside zip archives.

    Command-line arguments are read from ``sys.argv``:

    * ``--infile/-i``: one or more zip archives (default: binary stdin).
    * ``--outfile/-o``: destination for the extracted rows (default: stdout).

    For every archive member that lives directly in a ``psm`` directory and
    is at least 20 bytes long, one tab-separated line is written per
    annotation::

        headline <TAB> name <TAB> begin_offset <TAB> char_length
        post <TAB> name <TAB> begin_offset <TAB> char_length <TAB> author <TAB> datetime

    where ``name`` is the member's base name with ``.psm.xml`` stripped.
    A missing author or datetime is written as the string "None".

    Any exception raised while reading or converting a member is re-raised
    after the member's name has been reported on stderr.
    """
    parser = argparse.ArgumentParser(
        description="Extract and print psm annotation data from LRLP in a "
                    "form that is amenable to insertion into future xml",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Zip archives are binary: fall back to the byte stream behind stdin
    # rather than the text wrapper, which ZipFile cannot read.
    stdin_bytes = getattr(sys.stdin, "buffer", sys.stdin)
    parser.add_argument("--infile", "-i", nargs='+',
                        type=argparse.FileType('rb'),
                        default=[stdin_bytes],
                        help="input zip file(s) (each contains a multi file)")
    parser.add_argument("--outfile", "-o", type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="where to write extracted semantic info")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    outfile = args.outfile
    try:
        for infile in args.infile:
            with zf(infile) as archive:
                for info in archive.infolist():
                    # Members under 20 bytes are ignored (presumably empty
                    # or stub entries).
                    if info.file_size < 20:
                        continue
                    # Assume psm structure: only files directly inside
                    # "psm/" are of interest.
                    if os.path.dirname(info.filename) != 'psm':
                        continue
                    # GENRE/LANG/DATE info will be gleaned from filename
                    # later; assume a .psm.xml suffix and strip it off.
                    fname = os.path.basename(info.filename)
                    fname = fname.split(".psm.xml")[0]
                    try:
                        # Parse the raw bytes so the XML parser honours the
                        # document's own encoding instead of the locale's.
                        with archive.open(info, 'r') as ifh:
                            xobj = ET.parse(ifh)
                        headlines, posts = _extract_psm_records(xobj)
                        rows = [("headline", fname) + h for h in headlines]
                        rows += [("post", fname) + p for p in posts]
                        lines = ["\t".join(row) + "\n" for row in rows]
                    except Exception:
                        # Report on stderr: stdout may be the data stream.
                        sys.stderr.write(info.filename + "\n")
                        raise
                    outfile.writelines(lines)
    finally:
        # Release the handles argparse opened, leaving the standard
        # streams open for the rest of the process.
        for infile in args.infile:
            if infile is not sys.stdin and infile is not stdin_bytes:
                infile.close()
        if outfile is sys.stdout:
            outfile.flush()
        else:
            outfile.close()
# Script entry point: run the extractor only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()