Skip to content

Commit abb485b

Browse files
committed
added a script for depccg parsing
1 parent e332e91 commit abb485b

File tree

1 file changed

+49
-0
lines changed

1 file changed

+49
-0
lines changed

ja/rte.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
from __future__ import print_function, unicode_literals
3+
from lxml import etree
4+
from depccg import PyJaAStarParser
5+
import argparse
6+
7+
def depccg2xml(tree, sid):
    """Serialize one depccg parse tree as a flat ``<ccg>`` XML element.

    Every node of ``tree`` becomes a ``<span>`` child of the returned
    ``<ccg>`` element.  Spans are emitted in pre-order (a node first, then
    its left subtree, then its right subtree) and numbered consecutively
    in that order, so the root span is always ``s<sid>_sp0``.

    Args:
        tree: Root node of the parse.  Each node must expose ``cat``,
            ``start_of_span``, ``is_leaf`` and support ``len()``; leaves
            additionally expose ``head_id``; internal nodes expose
            ``left_child``, ``is_unary``, ``op_string`` and, unless unary,
            ``right_child``.
        sid: Sentence index, used as the prefix of every generated id.

    Returns:
        The ``<ccg>`` element with attributes ``id`` (``s<sid>_ccg0``) and
        ``root`` (id of the root span), containing one ``<span>`` per node.
    """
    # Create the container first so the closure below refers to a name
    # that is already bound when it is defined.
    ccg_elem = etree.Element("ccg")
    ccg_elem.set("id", "s{}_ccg0".format(sid))

    def traverse(node, spid=0):
        """Emit ``node`` and its subtree.

        Returns a pair ``(last_spid, span_id)``: the highest span number
        used inside this subtree and the id string given to ``node``.
        """
        span_id = "s{}_sp{}".format(sid, spid)
        span = etree.SubElement(ccg_elem, "span")
        span.set("category", str(node.cat))
        span.set("begin", str(node.start_of_span))
        span.set("end", str(node.start_of_span + len(node)))
        span.set("id", span_id)
        if node.is_leaf:
            # NOTE(review): presumably "s<sid>_<head_id>" is the id of the
            # token element this leaf covers -- confirm against the input.
            span.set("terminal", "s{}_{}".format(sid, node.head_id))
        else:
            # Children continue the numbering from the last id consumed so
            # far; the right subtree starts after the whole left subtree.
            spid, child_ids = traverse(node.left_child, spid + 1)
            if not node.is_unary:
                spid, right_id = traverse(node.right_child, spid + 1)
                child_ids += " " + right_id
            span.set("child", child_ids)
            span.set("rule", node.op_string)
        return spid, span_id

    _, root_id = traverse(tree)
    ccg_elem.set("root", root_id)
    return ccg_elem
31+
32+
33+
def main():
    """Parse every sentence of an input XML file and print the enriched XML.

    Reads the two positional command-line arguments ``model`` (model
    directory handed to ``PyJaAStarParser``) and ``input`` (path of the XML
    file), parses all sentences in one ``parse_doc`` call, appends the
    resulting ``<ccg>`` element to each sentence and writes the whole
    document, pretty-printed, to standard output.
    """
    arg_parser = argparse.ArgumentParser("A* CCG parser")
    arg_parser.add_argument("model", help="model directory")
    arg_parser.add_argument("input", help="input xml")
    args = arg_parser.parse_args()

    xml_root = etree.parse(args.input).getroot()
    # NOTE(review): assumes the sentence container is the first grandchild
    # of the root (e.g. root/document/sentences) -- confirm against the
    # format of the input files.
    sents = xml_root[0][0]

    # Kept under a separate name from the argparse parser above so neither
    # object is silently replaced by the other.
    ccg_parser = PyJaAStarParser(args.model)
    # The first child of each sentence is iterated as its token list; the
    # "surf" attribute of every token is what gets fed to the parser.
    trees = ccg_parser.parse_doc(
        [[token.get("surf") for token in sent[0]] for sent in sents])

    # The sentence index doubles as the id prefix of the generated spans.
    for i, (sent, tree) in enumerate(zip(sents, trees)):
        sent.append(depccg2xml(tree, i))

    print(etree.tostring(xml_root, pretty_print=True).decode("utf-8"))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)