YNedderhoff
diff --git a/dependency-parser.py b/dependency-parser.py
Lines changed: 14 additions & 11 deletions
diff --git a/modules/cle.py b/modules/cle.py
Lines changed: 3 additions & 2 deletions
diff --git a/modules/evaluation.py b/modules/evaluation.py
Lines changed: 33 additions & 15 deletions
diff --git a/modules/featmap.py b/modules/featmap.py
Lines changed: 95 additions & 43 deletions
@@ -1,8 +1,6 @@
 # !/bin/python
 #  -*- coding: utf-8 -*-
 
-import cProfile
-
 import time
 import os
 import codecs
@@ -33,7 +31,6 @@ def save(file_name, model):
 
 
 def train(args):
-
     print "\tCreating feature map..."
     start = time.time()
 
@@ -76,26 +73,33 @@ def train(args):
 
         print "\t\tEpoch: " + str(epoch) + ", Smoothing coefficient: " + str(alpha)
 
-        total = 0
-        correct = 0
+        total = 0.0
+        correct = 0.0
         errors = 0
 
+        correct_arcs = 0.0
+        total_arcs = 0.0
+
         for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
 
             sparse_graph = Graph(sentence, "sparse", feat_map).heads  # gold graph
             complete_sparse_graph = Graph(sentence, "complete-sparse", feat_map, weight_vector).heads  # complete graph
 
             # call the perceptron
-            weight_vector, correct, errors = structured_perceptron(complete_sparse_graph, weight_vector, correct,
-                                                                   errors, "train", sparse_graph, alpha)
+            weight_vector, correct, errors, correct_arcs, total_arcs = structured_perceptron(complete_sparse_graph,
+                                                                                             weight_vector, correct,
+                                                                                             errors, correct_arcs,
+                                                                                             total_arcs, "train",
+                                                                                             sparse_graph, alpha)
 
             total += 1
 
             # print some information every 500 sentences
             if total % 500 == 0:
                 stop2 = time.time()
-                print "\t\t\tInstance Nr. " + str(total) + "\tCorrect: " + str(correct) + "\t(" \
-                      + str((correct * 100) / total) + "%)\tErrors: " + str(errors) + "\t" + str(stop2-start2) + " sec"
+                print "\t\t\tInstance Nr. {0}\tCorrect sentences: {1}\t({2}%)\tUAS: {3}%\tErrors: {4}\t{5} sec".format(
+                    total, correct, round((correct / total) * 100, 2), round((correct_arcs / total_arcs) * 100, 2),
+                    errors, round(stop2 - start2, 2))
                 start2 = time.time()
 
         # decrease alpha after every epoch if activated
@@ -115,7 +119,6 @@ def train(args):
 
 
 def test(args):
-
     # load classifier vectors (model) and feature vector from file:
     print "\tLoading the model and the features from file '" + str(args.model) + "'"
     start = time.time()
@@ -152,7 +155,7 @@ def test(args):
         tmp_errors = errors
 
         # call the perceptron
-        predicted_graph, errors = structured_perceptron(full_graph, weight_vector, 0, errors, "test", None)
+        predicted_graph, errors = structured_perceptron(full_graph, weight_vector, 0, errors, None, None, "test", None)
 
         if tmp_errors == errors:  # no error occured during prediction
             write_graph_to_file(predicted_graph, args.out_file)
 
@@ -44,7 +44,8 @@ def chu_liu_edmonds(graph):
 
         # add arcs from inside cycle to outside cycle, then delete t_c
         if t_c in y:
-            for arc in (arc for arc in y[t_c] if arc.dependent not in c):
+            # for arc in (arc for arc in y[t_c] if arc.dependent not in c):
+            for arc in y[t_c]:
                 # dependents of t_c which are not in c
                 for head in (head for head in g if head == arc.former_head):
                     # heads in g_a which are former head of arc
@@ -73,7 +74,7 @@ def chu_liu_edmonds(graph):
         y[head_of_cycle] = new_dependents
 
         # add arcs from inside cycle to inside cycle except the one pointing to cycle_start_node
-        for head in (head for head in g_a if head in c):
+        for head in c:
             # every head in g_a that is in c
             for arc in (arc for arc in g_a[head] if arc.dependent in c and not arc.dependent == cycle_start_node):
                 # every dependent of head in g_a if it is in c but not cycle_start_node
 
@@ -1,34 +1,52 @@
 import codecs
 
 from token import sentences
-from graphs import Graph
+from graphs import Graph, reverse_head_graph
 from perceptron import make_graph_compareable
 
 
 def evaluate(args):
+    """Compare predicted dependency graphs against gold ones and write scores.
+
+    Reads sentences from args.gold and args.in_file, counts the sentences
+    whose graphs compare equal and the arcs whose head equals the gold head,
+    and writes totals and percentages to args.out_file.
+    """
-    full_graphs_gold = {}
+    sparse_graphs_gold = {}
+    sparse_graphs_predicted = {}
 
-    total = 0
-    correct = 0
+    total = 0.0
+    correct = 0.0
+
+    total_arcs = 0.0
+    correct_arcs = 0.0
 
     for sentence in sentences(codecs.open(args.gold, encoding='utf-8')):
-        full_graph_gold = Graph(sentence, "full").heads
-        full_graphs_gold[len(full_graphs_gold)] = full_graph_gold
-    full_graphs_predicted = {}
+        sparse_graph_gold = Graph(sentence, "sparse").heads
+        sparse_graphs_gold[len(sparse_graphs_gold)] = sparse_graph_gold
     for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
-        full_graph_predicted = Graph(sentence, "full").heads
-        full_graphs_predicted[len(full_graphs_predicted)] = full_graph_predicted
+        sparse_graph_predicted = Graph(sentence, "sparse").heads
+        sparse_graphs_predicted[len(sparse_graphs_predicted)] = sparse_graph_predicted
 
-    if len(full_graphs_gold) == len(full_graphs_predicted):
-        for gold_graph in sorted(full_graphs_gold.keys()):
+    if len(sparse_graphs_gold) == len(sparse_graphs_predicted):
+        for gold_graph in sorted(sparse_graphs_gold.keys()):
             total += 1
-            if make_graph_compareable(full_graphs_gold[gold_graph]) == make_graph_compareable(
-                    full_graphs_predicted[gold_graph]):
+            if make_graph_compareable(sparse_graphs_gold[gold_graph]) == make_graph_compareable(
+                    sparse_graphs_predicted[gold_graph]):
                 correct += 1
+
     else:
-        print "Error in file length, Gold: " + str(len(full_graphs_gold)) + ", Predicted: " + str(
-            len(full_graphs_predicted))
+        print "Error in file length, Gold: " + str(len(sparse_graphs_gold)) + ", Predicted: " + str(
+            len(sparse_graphs_predicted))
+
+    for predicted_graph in sorted(sparse_graphs_predicted.keys()):
+        if predicted_graph not in sparse_graphs_gold:
+            # file lengths differ: there is no gold sentence to score this one against
+            continue
+        rev_predicted = reverse_head_graph(sparse_graphs_predicted[predicted_graph])
+        rev_gold = reverse_head_graph(sparse_graphs_gold[predicted_graph])
+        for dependent in rev_predicted:
+            for arc in rev_predicted[dependent]:
+                if arc.head == rev_gold[dependent][0].head:
+                    correct_arcs += 1
+                total_arcs += 1
 
     with open(args.out_file, "w") as out:
         print >> out, "Total: " + str(total)
         print >> out, "Correct: " + str(correct)
+        # scale to percent before rounding so two decimals survive; report 0.0 when nothing was counted
+        print >> out, "%: " + str(round(correct / total * 100, 2) if total else 0.0)
+        print >> out, ""
+        print >> out, "Total Arcs: " + str(total_arcs)
+        print >> out, "Correct: " + str(correct_arcs)
+        print >> out, "%: " + str(round(correct_arcs / total_arcs * 100, 2) if total_arcs else 0.0)
@@ -1,7 +1,8 @@
 from token import sentences
 import codecs
 
-def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos):
+
+def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos, direction, distance):
 
     # generator that yields features based on the following information:
 
@@ -13,62 +14,113 @@ def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos):
     # 6 = hlemma
     # 7 = dlemma
 
-    yield u'1:{0}'.format(hform)
-    yield u'2:{0}'.format(hpos)
-    yield u'3:{0}'.format(dform)
-    yield u'4:{0}'.format(dpos)
-    yield u'6:{0}'.format(hlemma)
-    yield u'7:{0}'.format(dlemma)
-    yield u'5:{0}'.format(bpos)
-
-    yield u'1,4:{0},{1}'.format(hform, dpos)
-    yield u'2,3:{0},{1}'.format(hpos, dform)
-    yield u'1,2:{0},{1}'.format(hform, hpos)
-    yield u'3,4:{0},{1}'.format(dform, dpos)
-    yield u'1,3:{0},{1}'.format(hform, dform)
-    yield u'2,4:{0},{1}'.format(hpos, dpos)
-    yield u'6,4:{0},{1}'.format(hlemma, dpos)
-    yield u'2,7:{0},{1}'.format(hpos, dlemma)
-    yield u'6,2:{0},{1}'.format(hlemma, hpos)
-    yield u'7,4:{0},{1}'.format(dlemma, dpos)
-    yield u'6,7:{0},{1}'.format(hlemma, dlemma)
-
-    yield u'1,2,3,4:{0},{1},{2},{3}'.format(hform, hpos, dform, dpos)
-    yield u'2,3,4:{0},{1},{2}'.format(hpos, dform, dpos)
-    yield u'1,3,4:{0},{1},{2}'.format(hform, dform, dpos)
-    yield u'1,2,3:{0},{1},{2}'.format(hform, hpos, dform)
-    yield u'1,2,4:{0},{1},{2}'.format(hform, hpos, dpos)
-    yield u'2,5,4:{0},{1},{2}'.format(hpos, bpos, dpos)
-    yield u'2,5,3:{0},{1},{2}'.format(hpos, bpos, dform)
-    yield u'1,5,4:{0},{1},{2}'.format(hform, bpos, dpos)
-    yield u'1,5,3:{0},{1},{2}'.format(hform, bpos, dform)
-
-    yield u'6,2,7,4:{0},{1},{2},{3}'.format(hlemma, hpos, dlemma, dpos)
-    yield u'2,7,4:{0},{1},{2}'.format(hpos, dlemma, dpos)
-    yield u'6,7,4:{0},{1},{2}'.format(hlemma, dlemma, dpos)
-    yield u'6,2,7:{0},{1},{2}'.format(hlemma, hpos, dlemma)
-    yield u'6,2,4:{0},{1},{2}'.format(hlemma, hpos, dpos)
-    yield u'2,5,7:{0},{1},{2}'.format(hpos, bpos, dlemma)
-    yield u'6,5,4:{0},{1},{2}'.format(hlemma, bpos, dpos)
-    yield u'6,5,7:{0},{1},{2}'.format(hlemma, bpos, dlemma)
+    yield u'1,dir,dist:{0},{1},{2}'.format(hform, direction, distance)
+    yield u'2,dir,dist:{0},{1},{2}'.format(hpos, direction, distance)
+    yield u'3,dir,dist:{0},{1},{2}'.format(dform, direction, distance)
+    yield u'4,dir,dist:{0},{1},{2}'.format(dpos, direction, distance)
+    yield u'6,dir,dist:{0},{1},{2}'.format(hlemma, direction, distance)
+    yield u'7,dir,dist:{0},{1},{2}'.format(dlemma, direction, distance)
+    yield u'5,dir,dist:{0},{1},{2}'.format(bpos, direction, distance)
+
+    yield u'1,4,dir,dist:{0},{1},{2},{3}'.format(hform, dpos, direction, distance)
+    yield u'2,3,dir,dist:{0},{1},{2},{3}'.format(hpos, dform, direction, distance)
+    yield u'1,2,dir,dist:{0},{1},{2},{3}'.format(hform, hpos, direction, distance)
+    yield u'3,4,dir,dist:{0},{1},{2},{3}'.format(dform, dpos, direction, distance)
+    yield u'1,3,dir,dist:{0},{1},{2},{3}'.format(hform, dform, direction, distance)
+    yield u'2,4,dir,dist:{0},{1},{2},{3}'.format(hpos, dpos, direction, distance)
+    yield u'6,4,dir,dist:{0},{1},{2},{3}'.format(hlemma, dpos, direction, distance)
+    yield u'2,7,dir,dist:{0},{1},{2},{3}'.format(hpos, dlemma, direction, distance)
+    yield u'6,2,dir,dist:{0},{1},{2},{3}'.format(hlemma, hpos, direction, distance)
+    yield u'7,4,dir,dist:{0},{1},{2},{3}'.format(dlemma, dpos, direction, distance)
+    yield u'6,7,dir,dist:{0},{1},{2},{3}'.format(hlemma, dlemma, direction, distance)
+
+    yield u'1,2,3,4,dir,dist:{0},{1},{2},{3},{4},{5}'.format(hform, hpos, dform, dpos, direction, distance)
+    yield u'2,3,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, dform, dpos, direction, distance)
+    yield u'1,3,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, dform, dpos, direction, distance)
+    yield u'1,2,3,dir,dist:{0},{1},{2},{3},{4}'.format(hform, hpos, dform, direction, distance)
+    yield u'1,2,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, hpos, dpos, direction, distance)
+    yield u'2,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dpos, direction, distance)
+    yield u'2,5,3,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dform, direction, distance)
+    yield u'1,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hform, bpos, dpos, direction, distance)
+    yield u'1,5,3,dir,dist:{0},{1},{2},{3},{4}'.format(hform, bpos, dform, direction, distance)
+
+    yield u'6,2,7,4,dir,dist:{0},{1},{2},{3},{4},{5}'.format(hlemma, hpos, dlemma, dpos, direction, distance)
+    yield u'2,7,4,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, dlemma, dpos, direction, distance)
+    yield u'6,7,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, dlemma, dpos, direction, distance)
+    yield u'6,2,7,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dlemma, direction, distance)
+    yield u'6,2,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dpos, direction, distance)
+    yield u'2,5,7,dir,dist:{0},{1},{2},{3},{4}'.format(hpos, bpos, dlemma, direction, distance)
+    yield u'6,5,4,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, bpos, dpos, direction, distance)
+    yield u'6,5,7,dir,dist:{0},{1},{2},{3},{4}'.format(hlemma, bpos, dlemma, direction, distance)
+
+
+def give_distance(id1, id2, direction):
+    """Return the bucketed distance between two token ids as a string.
+
+    The gap is id1 - id2 when direction is "right" and id2 - id1 for any
+    other direction. Gaps of 1 to 5 map to "1".."5"; larger gaps map to
+    "6-10", "11-20" or ">20". A gap below 1 prints an error message and
+    yields "__ERROR__".
+    """
+    gap = id1 - id2 if direction == "right" else id2 - id1
+
+    if gap < 1:
+        print "Error in distance computing, distance is too low."
+        return "__ERROR__"
+
+    # small gaps keep their exact value, larger ones share a bucket
+    for exact in (1, 2, 3, 4, 5):
+        if gap == exact:
+            return str(exact)
+    if 5 < gap <= 10:
+        return "6-10"
+    if 10 < gap <= 20:
+        return "11-20"
+    return ">20"
+
+
+def give_direction(id1, id2):
+    """Return "right" if id2 is smaller than id1, otherwise "left"."""
+    return "right" if id2 < id1 else "left"
 
 
 def fm(infile):
+
     # takes a file in conll06 format, returns a feature map
     feat_map = {}  # featmap as dictionary {feature:index}
     index = 0  # index in featmap
+
     for sentence in sentences(codecs.open(infile, encoding='utf-8')):
         for token1 in sentence:
-
+            direction = "left"
+            distance = give_distance(0, token1.id, direction)
             # add root features
-            for feature in give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form, token1.lemma, token1.pos, token1.rel):
+            for feature in give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form, token1.lemma, token1.pos,
+                                         token1.rel, direction, distance):
                 if feature not in feat_map:
                     feat_map[feature] = index
                     index += 1
 
             # add other features
-            for token2 in sentence:
-                for feature in give_features(token1.form, token1.lemma, token1.pos, token2.form, token2.lemma, token2.pos, token2.rel):
+            for token2 in (token2 for token2 in sentence if token2.id != token1.id):
+
+                # direction
+                direction = give_direction(token1.id, token2.id)
+
+                # distance
+                distance = give_distance(token1.id, token2.id, direction)
+
+                for feature in give_features(token1.form, token1.lemma, token1.pos, token2.form, token2.lemma,
+                                             token2.pos, token2.rel, direction, distance):
                     if feature not in feat_map:
                         feat_map[feature] = index
                         index += 1