|
| 1 | + |
| 2 | +from __future__ import print_function, unicode_literals |
| 3 | +from lxml import etree |
| 4 | +from depccg import PyJaAStarParser |
| 5 | +import argparse |
| 6 | + |
def depccg2xml(tree, sid):
    """Convert one parse tree into a ``<ccg>`` XML element.

    Every node of ``tree`` becomes a ``<span>`` child of the returned
    element. Spans are appended in pre-order and numbered in that same
    order (``s{sid}_sp0``, ``s{sid}_sp1``, ...).

    Each span carries ``category``, ``begin``, ``end`` and ``id``
    attributes. Leaf spans additionally get ``terminal``
    (``s{sid}_{head_id}``); non-leaf spans get ``child`` (space-separated
    ids of their child spans) and ``rule`` (the node's ``op_string``).

    Args:
        tree: Root node of the parse. Nodes must support ``len()`` and
            expose ``cat``, ``start_of_span``, ``is_leaf`` and
            ``head_id``; non-leaf nodes also ``left_child``,
            ``is_unary``, ``op_string`` and, unless unary,
            ``right_child``.
        sid: Sentence index used as the prefix of every generated id.

    Returns:
        The ``<ccg>`` element, with ``id`` set to ``s{sid}_ccg0`` and
        ``root`` set to the id of the span created for ``tree``.
    """
    ccg = etree.Element("ccg")
    ccg.set("id", "s{}_ccg0".format(sid))

    def traverse(node, spid=0):
        """Emit the span for ``node`` and, recursively, its descendants.

        Returns a pair ``(last_spid, span_id)``: the highest span number
        consumed by this subtree and the id of the span for ``node``.
        """
        span_id = "s{}_sp{}".format(sid, spid)
        # The span is attached before recursing so that the children of
        # ``ccg`` end up in pre-order.
        span = etree.SubElement(ccg, "span")
        span.set("category", str(node.cat))
        span.set("begin", str(node.start_of_span))
        span.set("end", str(node.start_of_span + len(node)))
        span.set("id", span_id)

        if node.is_leaf:
            # NOTE(review): presumably refers to the id of the matching
            # token element in the input document -- confirm.
            span.set("terminal", "s{}_{}".format(sid, node.head_id))
            return spid, span_id

        last_spid, left_id = traverse(node.left_child, spid + 1)
        child_ids = [left_id]
        if not node.is_unary:
            # The right subtree continues numbering where the left one
            # stopped.
            last_spid, right_id = traverse(node.right_child, last_spid + 1)
            child_ids.append(right_id)
        span.set("child", " ".join(child_ids))
        span.set("rule", node.op_string)
        return last_spid, span_id

    _, root_id = traverse(tree)
    ccg.set("root", root_id)
    return ccg
| 31 | + |
| 32 | + |
# Command line: the model directory and the XML document to annotate.
# The text is passed as ``description``; the first positional parameter
# of ArgumentParser is ``prog``, which would put it in the usage line
# in place of the program name.
arg_parser = argparse.ArgumentParser(description="A* CCG parser")
arg_parser.add_argument("model", help="model directory")
arg_parser.add_argument("input", help="input xml")
args = arg_parser.parse_args()

xml_root = etree.parse(args.input).getroot()
# NOTE(review): assumes the sentence elements are the children of
# root[0][0], and that the first child of each sentence holds its tokens
# (each with a "surf" attribute) -- confirm against the input schema.
sents = xml_root[0][0]

parser = PyJaAStarParser(args.model)
res = parser.parse_doc(
    [[t.get("surf") for t in sent[0]] for sent in sents])

# Attach one <ccg> element to each sentence; the sentence's position in
# the document is used as the id prefix.
for i, (sent, tree) in enumerate(zip(sents, res)):
    sent.append(depccg2xml(tree, i))

# Write the annotated document to stdout.
print(etree.tostring(xml_root,
                     pretty_print=True).decode("utf-8"))
0 commit comments