Skip to content

Commit 5473abd

Browse files
committed
Add distance and direction features, delete full graph (unneeded), minor stuff
1 parent 9c2edda commit 5473abd

File tree

8 files changed

+212
-107
lines changed

8 files changed

+212
-107
lines changed

dependency-parser.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# !/bin/python
22
# -*- coding: utf-8 -*-
33

4-
import cProfile
5-
64
import time
75
import os
86
import codecs
@@ -33,7 +31,6 @@ def save(file_name, model):
3331

3432

3533
def train(args):
36-
3734
print "\tCreating feature map..."
3835
start = time.time()
3936

@@ -76,26 +73,33 @@ def train(args):
7673

7774
print "\t\tEpoch: " + str(epoch) + ", Smoothing coefficient: " + str(alpha)
7875

79-
total = 0
80-
correct = 0
76+
total = 0.0
77+
correct = 0.0
8178
errors = 0
8279

80+
correct_arcs = 0.0
81+
total_arcs = 0.0
82+
8383
for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
8484

8585
sparse_graph = Graph(sentence, "sparse", feat_map).heads # gold graph
8686
complete_sparse_graph = Graph(sentence, "complete-sparse", feat_map, weight_vector).heads # complete graph
8787

8888
# call the perceptron
89-
weight_vector, correct, errors = structured_perceptron(complete_sparse_graph, weight_vector, correct,
90-
errors, "train", sparse_graph, alpha)
89+
weight_vector, correct, errors, correct_arcs, total_arcs = structured_perceptron(complete_sparse_graph,
90+
weight_vector, correct,
91+
errors, correct_arcs,
92+
total_arcs, "train",
93+
sparse_graph, alpha)
9194

9295
total += 1
9396

9497
# print some information every 500 sentences
9598
if total % 500 == 0:
9699
stop2 = time.time()
97-
print "\t\t\tInstance Nr. " + str(total) + "\tCorrect: " + str(correct) + "\t(" \
98-
+ str((correct * 100) / total) + "%)\tErrors: " + str(errors) + "\t" + str(stop2-start2) + " sec"
100+
print "\t\t\tInstance Nr. {0}\tCorrect sentences: {1}\t({2}%)\tUAS: {3}%\tErrors: {4}\t{5} sec".format(
101+
total, correct, round((correct / total) * 100, 2), round((correct_arcs / total_arcs) * 100, 2),
102+
errors, round(stop2 - start2, 2))
99103
start2 = time.time()
100104

101105
# decrease alpha after every epoch if activated
@@ -115,7 +119,6 @@ def train(args):
115119

116120

117121
def test(args):
118-
119122
# load classifier vectors (model) and feature vector from file:
120123
print "\tLoading the model and the features from file '" + str(args.model) + "'"
121124
start = time.time()
@@ -152,7 +155,7 @@ def test(args):
152155
tmp_errors = errors
153156

154157
# call the perceptron
155-
predicted_graph, errors = structured_perceptron(full_graph, weight_vector, 0, errors, "test", None)
158+
predicted_graph, errors = structured_perceptron(full_graph, weight_vector, 0, errors, None, None, "test", None)
156159

157160
if tmp_errors == errors: # no error occurred during prediction
158161
write_graph_to_file(predicted_graph, args.out_file)

modules/cle.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def chu_liu_edmonds(graph):
4444

4545
# add arcs from inside cycle to outside cycle, then delete t_c
4646
if t_c in y:
47-
for arc in (arc for arc in y[t_c] if arc.dependent not in c):
47+
# for arc in (arc for arc in y[t_c] if arc.dependent not in c):
48+
for arc in y[t_c]:
4849
# dependents of t_c which are not in c
4950
for head in (head for head in g if head == arc.former_head):
5051
# heads in g_a which are former head of arc
@@ -73,7 +74,7 @@ def chu_liu_edmonds(graph):
7374
y[head_of_cycle] = new_dependents
7475

7576
# add arcs from inside cycle to inside cycle except the one pointing to cycle_start_node
76-
for head in (head for head in g_a if head in c):
77+
for head in c:
7778
# every head in g_a that is in c
7879
for arc in (arc for arc in g_a[head] if arc.dependent in c and not arc.dependent == cycle_start_node):
7980
# every dependent of head in g_a if it is in c but not cycle_start_node

modules/evaluation.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,52 @@
11
import codecs
22

33
from token import sentences
4-
from graphs import Graph
4+
from graphs import Graph, reverse_head_graph
55
from perceptron import make_graph_compareable
66

77

88
def evaluate(args):
9-
full_graphs_gold = {}
9+
sparse_graphs_gold = {}
10+
sparse_graphs_predicted = {}
1011

11-
total = 0
12-
correct = 0
12+
total = 0.0
13+
correct = 0.0
14+
15+
total_arcs = 0.0
16+
correct_arcs = 0.0
1317

1418
for sentence in sentences(codecs.open(args.gold, encoding='utf-8')):
15-
full_graph_gold = Graph(sentence, "full").heads
16-
full_graphs_gold[len(full_graphs_gold)] = full_graph_gold
17-
full_graphs_predicted = {}
19+
sparse_graph_gold = Graph(sentence, "sparse").heads
20+
sparse_graphs_gold[len(sparse_graphs_gold)] = sparse_graph_gold
1821
for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
19-
full_graph_predicted = Graph(sentence, "full").heads
20-
full_graphs_predicted[len(full_graphs_predicted)] = full_graph_predicted
22+
sparse_graph_predicted = Graph(sentence, "sparse").heads
23+
sparse_graphs_predicted[len(sparse_graphs_predicted)] = sparse_graph_predicted
2124

22-
if len(full_graphs_gold) == len(full_graphs_predicted):
23-
for gold_graph in sorted(full_graphs_gold.keys()):
25+
if len(sparse_graphs_gold) == len(sparse_graphs_predicted):
26+
for gold_graph in sorted(sparse_graphs_gold.keys()):
2427
total += 1
25-
if make_graph_compareable(full_graphs_gold[gold_graph]) == make_graph_compareable(
26-
full_graphs_predicted[gold_graph]):
28+
if make_graph_compareable(sparse_graphs_gold[gold_graph]) == make_graph_compareable(
29+
sparse_graphs_predicted[gold_graph]):
2730
correct += 1
31+
2832
else:
29-
print "Error in file length, Gold: " + str(len(full_graphs_gold)) + ", Predicted: " + str(
30-
len(full_graphs_predicted))
33+
print "Error in file length, Gold: " + str(len(sparse_graphs_gold)) + ", Predicted: " + str(
34+
len(sparse_graphs_predicted))
35+
36+
for predicted_graph in sorted(sparse_graphs_predicted.keys()):
37+
rev_predicted = reverse_head_graph(sparse_graphs_predicted[predicted_graph])
38+
rev_gold = reverse_head_graph(sparse_graphs_gold[predicted_graph])
39+
for dependent in rev_predicted:
40+
for arc in rev_predicted[dependent]:
41+
if arc.head == rev_gold[dependent][0].head:
42+
correct_arcs += 1
43+
total_arcs += 1
3144

3245
with open(args.out_file, "w") as out:
3346
print >> out, "Total: " + str(total)
3447
print >> out, "Correct: " + str(correct)
48+
print >> out, "%: " + str(round(correct/total, 2) * 100)
49+
print >> out, ""
50+
print >> out, "Total Arcs: " + str(total_arcs)
51+
print >> out, "Correct: " + str(correct_arcs)
52+
print >> out, "%: " + str(round(correct_arcs/total_arcs, 2) * 100)

modules/featmap.py

Lines changed: 95 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from token import sentences
22
import codecs
33

4-
def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos):
4+
5+
def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos, direction, distance):
56

67
# generator that yields features based on the following information:
78

@@ -13,62 +14,113 @@ def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos):
1314
# 6 = hlemma
1415
# 7 = dlemma
1516

16-
yield u'1:{0}'.format(hform)
17-
yield u'2:{0}'.format(hpos)
18-
yield u'3:{0}'.format(dform)
19-
yield u'4:{0}'.format(dpos)
20-
yield u'6:{0}'.format(hlemma)
21-
yield u'7:{0}'.format(dlemma)
22-
yield u'5:{0}'.format(bpos)
23-
24-
yield u'1,4:{0},{1}'.format(hform, dpos)
25-
yield u'2,3:{0},{1}'.format(hpos, dform)
26-
yield u'1,2:{0},{1}'.format(hform, hpos)
27-
yield u'3,4:{0},{1}'.format(dform, dpos)
28-
yield u'1,3:{0},{1}'.format(hform, dform)
29-
yield u'2,4:{0},{1}'.format(hpos, dpos)
30-
yield u'6,4:{0},{1}'.format(hlemma, dpos)
31-
yield u'2,7:{0},{1}'.format(hpos, dlemma)
32-
yield u'6,2:{0},{1}'.format(hlemma, hpos)
33-
yield u'7,4:{0},{1}'.format(dlemma, dpos)
34-
yield u'6,7:{0},{1}'.format(hlemma, dlemma)
35-
36-
yield u'1,2,3,4:{0},{1},{2},{3}'.format(hform, hpos, dform, dpos)
37-
yield u'2,3,4:{0},{1},{2}'.format(hpos, dform, dpos)
38-
yield u'1,3,4:{0},{1},{2}'.format(hform, dform, dpos)
39-
yield u'1,2,3:{0},{1},{2}'.format(hform, hpos, dform)
40-
yield u'1,2,4:{0},{1},{2}'.format(hform, hpos, dpos)
41-
yield u'2,5,4:{0},{1},{2}'.format(hpos, bpos, dpos)
42-
yield u'2,5,3:{0},{1},{2}'.format(hpos, bpos, dform)
43-
yield u'1,5,4:{0},{1},{2}'.format(hform, bpos, dpos)
44-
yield u'1,5,3:{0},{1},{2}'.format(hform, bpos, dform)
45-
46-
yield u'6,2,7,4:{0},{1},{2},{3}'.format(hlemma, hpos, dlemma, dpos)
47-
yield u'2,7,4:{0},{1},{2}'.format(hpos, dlemma, dpos)
48-
yield u'6,7,4:{0},{1},{2}'.format(hlemma, dlemma, dpos)
49-
yield u'6,2,7:{0},{1},{2}'.format(hlemma, hpos, dlemma)
50-
yield u'6,2,4:{0},{1},{2}'.format(hlemma, hpos, dpos)
51-
yield u'2,5,7:{0},{1},{2}'.format(hpos, bpos, dlemma)
52-
yield u'6,5,4:{0},{1},{2}'.format(hlemma, bpos, dpos)
53-
yield u'6,5,7:{0},{1},{2}'.format(hlemma, bpos, dlemma)
17+
yield u'1,dir,dist:{0},{1},{2}'.format(hform, direction, distance)
18+
yield u'2,dir,dist:{0},{1},{2}'.format(hpos, direction, distance)
19+
yield u'3,dir,dist:{0},{1},{2}'.format(dform, direction, distance)
20+
yield u'4,dir,dist:{0},{1},{2}'.format(dpos, direction, distance)
21+
yield u'6,dir,dist:{0},{1},{2}'.format(hlemma, direction, distance)
22+
yield u'7,dir,dist:{0},{1},{2}'.format(dlemma, direction, distance)
23+
yield u'5,dir,dist:{0},{1},{2}'.format(bpos, direction, distance)
24+
25+
yield u'1,4,dir,dist:{0},{1},{2},{3}'.format(hform, dpos, direction, distance)
26+
yield u'2,3,dir,dist:{0},{1},{2},{3}'.format(hpos, dform, direction, distance)
27+
yield u'1,2,dir,dist:{0},{1},{2},{3}'.format(hform, hpos, direction, distance)
28+
yield u'3,4,dir,dist:{0},{1},{2},{3}'.format(dform, dpos, direction, distance)
29+
yield u'1,3,dir,dist:{0},{1},{2},{3}'.format(hform, dform, direction, distance)
30+
yield u'2,4,dir,dist:{0},{1},{2},{3}'.format(hpos, dpos, direction, distance)
31+
yield u'6,4,dir,dist:{0},{1},{2},{3}'.format(hlemma, dpos, direction, distance)
32+
yield u'2,7,dir,dist:{0},{1},{2},{3}'.format(hpos, dlemma, direction, distance)
33+
yield u'6,2,dir,dist:{0},{1},{2},{3}'.format(hlemma, hpos, direction, distance)
34+
yield u'7,4,dir,dist:{0},{1},{2},{3}'.format(dlemma, dpos, direction, distance)
35+
yield u'6,7,dir,dist:{0},{1},{2},{3}'.format(hlemma, dlemma, direction, distance)
36+
37+
yield u'1,2,3,4,dir,dist:{0},{1},{2},{3},{4},{5}'.format(hform, hpos, dform, dpos, direction, distance)
38+
yield u'2,3,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, dform, dpos, direction, distance)
39+
yield u'1,3,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, dform, dpos, direction, distance)
40+
yield u'1,2,3,dir,dist:{0},{1},{2},{3},{4}'.format(hform, hpos, dform, direction, distance)
41+
yield u'1,2,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, hpos, dpos, direction, distance)
42+
yield u'2,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dpos, direction, distance)
43+
yield u'2,5,3,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dform, direction, distance)
44+
yield u'1,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, bpos, dpos, direction, distance)
45+
yield u'1,5,3,dir,dist:{0},{1},{2},{3},{4}'.format(hform, bpos, dform, direction, distance)
46+
47+
yield u'6,2,7,4,dir,dist:{0},{1},{2},{3},{4},{5}'.format(hlemma, hpos, dlemma, dpos, direction, distance)
48+
yield u'2,7,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, dlemma, dpos, direction, distance)
49+
yield u'6,7,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, dlemma, dpos, direction, distance)
50+
yield u'6,2,7,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dlemma, direction, distance)
51+
yield u'6,2,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dpos, direction, distance)
52+
yield u'2,5,7,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dlemma, direction, distance)
53+
yield u'6,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, bpos, dpos, direction, distance)
54+
yield u'6,5,7,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, bpos, dlemma, direction, distance)
55+
56+
57+
def give_distance(id1, id2, direction):
    """Return a bucketed distance label for the arc between two token ids.

    id1 is the head token id, id2 the dependent token id; `direction` is
    "right" or "left" as produced by give_direction, and orients the
    subtraction so a consistent arc always yields d >= 1.

    Buckets: exact "1".."5" for short arcs, then "6-10", "11-20", ">20".
    Returns "__ERROR__" (and prints a warning) when the offset is < 1,
    i.e. when direction and the ids disagree.
    """
    # orient the offset: a valid (direction, id) pair gives d >= 1
    d = id1 - id2 if direction == "right" else id2 - id1

    if d < 1:
        print("Error in distance computing, distance is too low.")
        return "__ERROR__"
    if d <= 5:
        # exact distance for short arcs (replaces the unrolled elif ladder)
        return str(d)
    if d <= 10:
        return "6-10"
    if d <= 20:
        return "11-20"
    return ">20"
85+
86+
87+
def give_direction(id1, id2):
    """Return the arc direction between head id1 and dependent id2.

    "right" when the dependent precedes the head (id2 < id1),
    otherwise "left".
    """
    if id1 > id2:
        return "right"
    return "left"
5494

5595

5696
def fm(infile):
97+
5798
# takes a file in conll06 format, returns a feature map
5899
feat_map = {} # featmap as dictionary {feature:index}
59100
index = 0 # index in featmap
101+
60102
for sentence in sentences(codecs.open(infile, encoding='utf-8')):
61103
for token1 in sentence:
62-
104+
direction = "left"
105+
distance = give_distance(0, token1.id, direction)
63106
# add root features
64-
for feature in give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form, token1.lemma, token1.pos, token1.rel):
107+
for feature in give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form, token1.lemma, token1.pos,
108+
token1.rel, direction, distance):
65109
if feature not in feat_map:
66110
feat_map[feature] = index
67111
index += 1
68112

69113
# add other features
70-
for token2 in sentence:
71-
for feature in give_features(token1.form, token1.lemma, token1.pos, token2.form, token2.lemma, token2.pos, token2.rel):
114+
for token2 in (token2 for token2 in sentence if token2.id != token1.id):
115+
116+
# direction
117+
direction = give_direction(token1.id, token2.id)
118+
119+
# distance
120+
distance = give_distance(token1.id, token2.id, direction)
121+
122+
for feature in give_features(token1.form, token1.lemma, token1.pos, token2.form, token2.lemma,
123+
token2.pos, token2.rel, direction, distance):
72124
if feature not in feat_map:
73125
feat_map[feature] = index
74126
index += 1

0 commit comments

Comments
 (0)