elisatools/extract_psm_annotation.py at master · panx27/elisatools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3

# utilities for dealing with LRLPs
import argparse
import codecs
import sys
import os
import re
import os.path
from zipfile import ZipFile as zf
import xml.etree.ElementTree as ET
import gzip
import os
scriptdir = os.path.dirname(os.path.abspath(__file__))
import datetime
from io import TextIOWrapper

# Scrape monolingual psms for posts and headlines (elsewhere)

def _attribute_value(node, name):
  """Return the ``value`` of the ``attribute`` child of *node* named *name*.

  The string "None" is returned when *node* has no such child, so the caller
  always has something to print in that column.
  """
  child = node.find("attribute[@name='%s']" % name)
  if child is None:
    return "None"
  return child.get("value")


def _scan_psm(tree):
  """Collect headline and post records from a parsed psm document.

  Returns a ``(headlines, posts)`` pair.  Each headline is a
  ``(begin_offset, char_length)`` tuple and each post is a
  ``(begin_offset, char_length, author, datetime)`` tuple; every field is the
  attribute string exactly as it appears in the XML.
  """
  headlines = [(node.get("begin_offset"), node.get("char_length"))
               for node in tree.findall("string[@type='headline']")]
  posts = [(node.get("begin_offset"),
            node.get("char_length"),
            _attribute_value(node, "author"),
            _attribute_value(node, "datetime"))
           for node in tree.findall("string[@type='post']")]
  return headlines, posts


def main():
  """Extract headline/post annotations from the psm files of zip archives.

  Command-line arguments:
    --infile/-i   one or more zip archives (default: standard input).
    --outfile/-o  destination for the extracted rows (default: stdout).

  Every archive member that sits directly in a ``psm`` directory and is at
  least 20 bytes long is parsed as XML.  One tab-separated line is written
  per record:

    headline <TAB> name <TAB> begin_offset <TAB> char_length
    post <TAB> name <TAB> begin_offset <TAB> char_length <TAB> author <TAB> datetime

  where ``name`` is the member's base name up to ``.psm.xml``.  A post with
  no author or datetime attribute gets the string "None" in that column.

  Raises:
    Any exception raised while parsing a member is re-raised after the
    member's name has been written to stderr.
  """
  parser = argparse.ArgumentParser(
    description="Extract and print psm annotation data from LRLP "
                "in a form that is amenable to insertion into "
                "future xml",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  # ZipFile needs a binary stream, so default to stdin's underlying byte
  # buffer rather than the text-mode wrapper.
  parser.add_argument("--infile", "-i", nargs='+',
                      type=argparse.FileType('rb'),
                      default=[getattr(sys.stdin, "buffer", sys.stdin)],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outfile", "-o", type=argparse.FileType('w'),
                      default=sys.stdout,
                      help="where to write extracted semantic info")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  outfile = args.outfile

  for infile in args.infile:
    with zf(infile) as archive:
      for info in archive.infolist():
        # Skip directory entries and empty/placeholder files.
        if info.file_size < 20:
          continue
        # Assume psm structure
        if os.path.dirname(info.filename) != 'psm':
          continue
        try:
          # Hand the parser raw bytes so that the encoding declared in the
          # XML prolog is honoured instead of the locale's default encoding.
          with archive.open(info, 'r') as ifh:
            headlines, posts = _scan_psm(ET.parse(ifh))
        except Exception:
          # Name the offending member on stderr; stdout may be the data sink.
          sys.stderr.write(info.filename + "\n")
          raise

        # GENRE/LANG/DATE info will be gleaned from filename later.
        # assume psm.xml and strip it off
        fname = os.path.basename(info.filename).split(".psm.xml")[0]
        for headline in headlines:
          outfile.write("\t".join(("headline", fname) + headline) + "\n")
        for post in posts:
          outfile.write("\t".join(("post", fname) + post) + "\n")

  # Make sure everything reaches the destination even if it is never closed.
  outfile.flush()

# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
  main()