#!/bin/python
# -*- coding: utf-8 -*-

# NOTE(review): this file was recovered from a rendered git diff; the import
# block below is the post-commit state with the diff markers stripped.

import cProfile  # kept for the commented-out profiling call in __main__

import time
import os
import codecs

import cPickle
import gzip

from modules.perceptron import structured_perceptron
from modules.token import sentences
from modules.featmap import fm
from modules.graphs import Graph, write_graph_to_file
from modules.evaluation import evaluate
def create_weight_vector(l):
    """Return the initial (all-zero) weight vector.

    Args:
        l: desired length of the vector, i.e. the number of features.

    Returns:
        list of float: a list of ``l`` zeros (``[0.0] * l``).
    """
    # Sequence repetition replaces the original append loop; same result,
    # built in one C-level step.
    return [0.0] * l
2820def load (file_name ):
2921 # load the model (weight vectors) from a file:
@@ -41,114 +33,87 @@ def save(file_name, model):
4133
4234
4335def train (args ):
44- start = time . time ()
36+
4537 print "\t Creating feature map..."
38+ start = time .time ()
4639
47- # feat map is a dictionary with every existing feature in the training data as keys,
48- # and unique indexes as values. Example: u'hpos,dform:VBD,way': 3781
4940 feat_map = fm (args .in_file )
50- rev_feat_map = reverse_feat_map ( feat_map )
41+
5142 stop = time .time ()
5243 print "\t \t Number of features: " + str (len (feat_map ))
5344 print "\t \t Done, " + str (stop - start ) + " sec"
5445
55- start = time .time ()
5646 print "\t Creating weight vector..."
47+ start = time .time ()
5748
58- weight_vector = create_weight_vector (len (feat_map ))
49+ weight_vector = [ 0.0 for i in xrange (len (feat_map ))]
5950
6051 stop = time .time ()
61- print "\t \t Number of features : " + str (len (feat_map ))
52+ print "\t \t Length of weight vector : " + str (len (weight_vector ))
6253 print "\t \t Done, " + str (stop - start ) + " sec"
6354
55+ print "\t Counting sentences..."
6456 start = time .time ()
65- print "\t Creating sparse graph representation of every sentence..."
66-
67- sparse_graphs = {}
6857
58+ sentence_count = 0
6959 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
70-
71- full_graph = FullGraph (sentence ).heads
72- sparse_graph = SparseGraph (sentence ).heads
73-
74- # Check if full_graph and sparse_graph ids match at every point
75- for full_head in full_graph :
76- for full_arc in full_graph [full_head ]:
77- counter = 0
78- for sparse_arc in sparse_graph [full_head ]:
79- if full_arc .dependent == sparse_arc .dependent :
80- counter += 1
81- if counter != 1 :
82- print "Error: The full and sparse graph representations do not match."
83-
84- # add feature vec to every graph
85- sparse_graph = add_feat_vec_to_sparse_graph (full_graph , sparse_graph , feat_map )
86-
87- # check if every feature vector is filled with the correct number of features.
88-
89- for head in sparse_graph :
90- for arc in sparse_graph [head ]:
91- if arc .feat_vec :
92- if arc .head == 0 :
93- if len (arc .feat_vec ) != 10 :
94- print "Length of arc feature vector is wrong."
95- print arc .feat_vec
96- else :
97- if len (arc .feat_vec ) != 20 :
98- print "Length of arc feature vector is wrong."
99- print arc .feat_vec
100- else :
101- print "Error: Feature vector is empty."
102-
103- sparse_graphs [len (sparse_graphs )] = sparse_graph
60+ sentence_count += 1
10461
10562 stop = time .time ()
106- print "\t \t Number of sentences: " + str (len (sparse_graphs ))
10763 print "\t \t Done, " + str (stop - start ) + " sec"
10864
65+ print "\t Start training, Total Instances: " + str (sentence_count )
10966 start = time .time ()
110- print "\t Start training, Total Instances: " + str (len (sparse_graphs ))
11167
11268 if args .decrease_alpha :
11369 print "\t \t Reduce smoothing coefficient activated."
11470
11571 alpha = 0.5 # smoothing coefficient for the weight adjustments
116- graph_ids = sparse_graphs .keys () # list of dict keys, needed when shuffeling tokens after every epoch
11772
11873 for epoch in range (1 , int (args .epochs ) + 1 ):
11974
75+ start2 = time .time ()
76+
12077 print "\t \t Epoch: " + str (epoch ) + ", Smoothing coefficient: " + str (alpha )
12178
12279 total = 0
12380 correct = 0
12481 errors = 0
12582
126- for graph_id in graph_ids :
127- weight_vector , correct , errors = structured_perceptron (deepcopy (sparse_graphs [graph_id ]), feat_map , rev_feat_map , weight_vector , correct , errors , "train" , alpha )
83+ for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
84+
85+ sparse_graph = Graph (sentence , "sparse" , feat_map ).heads # gold graph
86+ complete_sparse_graph = Graph (sentence , "complete-sparse" , feat_map , weight_vector ).heads # complete graph
87+
88+ # call the perceptron
89+ weight_vector , correct , errors = structured_perceptron (complete_sparse_graph , weight_vector , correct ,
90+ errors , "train" , sparse_graph , alpha )
91+
12892 total += 1
93+
94+ # print some information every 500 sentences
12995 if total % 500 == 0 :
96+ stop2 = time .time ()
13097 print "\t \t \t Instance Nr. " + str (total ) + "\t Correct: " + str (correct ) + "\t (" \
131- + str ((correct * 100 )/ total ) + "%)\t Errors: " + str (errors )
132- # print "\t\t\tCurrent weight vector:"
133- # print "\t\t\t" + str(weight_vector)
98+ + str ((correct * 100 ) / total ) + "%)\t Errors: " + str (errors ) + "\t " + str (stop2 - start2 ) + " sec"
99+ start2 = time .time ()
134100
135- if args .decrease_alpha : # decrease alpha after every epoch
101+ # decrease alpha after every epoch if activated
102+ if args .decrease_alpha :
136103 alpha /= 2
137104
138- if args .shuffle_sentences : # shuffle sentences after every epoch
139- random .shuffle (graph_ids )
140-
141105 stop = time .time ()
142106 print "\t \t Done, " + str (stop - start ) + " sec"
143107
144- start = time .time ()
145108 print "\t Saving the model and the features to file '" + str (args .model ) + "'..."
109+ start = time .time ()
146110
147111 save (args .model , [feat_map , weight_vector ])
148112
149113 stop = time .time ()
150114 print "\t \t Done, " + str (stop - start ) + " sec"
151115
116+
152117def test (args ):
153118
154119 # load classifier vectors (model) and feature vector from file:
@@ -157,47 +122,54 @@ def test(args):
157122
158123 model_list = load (args .model )
159124 feat_map = model_list [0 ]
160- rev_feat_map = reverse_feat_map (feat_map )
161125 weight_vector = model_list [1 ]
162126
163127 stop = time .time ()
164128 print "\t \t " + str (len (feat_map )) + " features loaded"
165129 print "\t \t Done, " + str (stop - start ) + " sec."
130+
131+ print "\t Counting sentences..."
166132 start = time .time ()
133+
167134 sentence_count = 0
168135 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
169136 sentence_count += 1
170137
138+ stop = time .time ()
139+ print "\t \t Done, " + str (stop - start ) + " sec"
140+
171141 print "\t Start annotating the test file, Total Instances: " + str (sentence_count )
142+ start = time .time ()
172143
173144 total = 0
174145 errors = 0
175146
176147 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
177148
178- # create complete, directed graph representation of sentence
179- full_graph = CompleteFullGraph (sentence ).heads
180-
181- # add feature vec
182- full_graph = add_feat_vec_to_full_graph (full_graph , feat_map )
149+ # complete graph in full arc representation
150+ full_graph = Graph (sentence , "complete-full" , feat_map , weight_vector ).heads
183151
184152 tmp_errors = errors
185153
186- predicted_graph , errors = structured_perceptron (deepcopy (full_graph ), feat_map , rev_feat_map , weight_vector , 0 , errors , "test" )
154+ # call the perceptron
155+ predicted_graph , errors = structured_perceptron (full_graph , weight_vector , 0 , errors , "test" , None )
187156
188157 if tmp_errors == errors : # no error occured during prediction
189158 write_graph_to_file (predicted_graph , args .out_file )
190- else : # a error occured during prediction
159+ else : # an error occured during prediction
191160 write_graph_to_file (full_graph , args .out_file , "error" )
192161
193162 total += 1
163+
164+ # print some information every 500 sentences
194165 if total % 500 == 0 :
195166 print "\t \t Instance Nr. " + str (total ) + "\t Errors: " + str (errors )
196167 # print "\t\t\tCurrent weight vector:"
197168 # print "\t\t\t" + str(weight_vector)
198169 stop = time .time ()
199170 print "\t \t Done, " + str (stop - start ) + " sec"
200171
172+
201173if __name__ == '__main__' :
202174
203175 t0 = time .time ()
@@ -217,8 +189,10 @@ def test(args):
217189 arg_par .add_argument ('-g' , '--gold' , dest = 'gold' , help = 'gold' , default = 'gold.conll06' )
218190 arg_par .add_argument ('-o' , '--output' , dest = 'out_file' , help = 'output file' , default = 'predicted.conll06' )
219191 arg_par .add_argument ('-e' , '--epochs' , dest = 'epochs' , help = 'epochs' , default = '10' )
220- arg_par .add_argument ('-decrease-alpha' , dest = 'decrease_alpha' , action = 'store_true' , help = 'decrease alpha' , default = False )
221- arg_par .add_argument ('-shuffle-sentences' , dest = 'shuffle_sentences' , action = 'store_true' , help = 'shuffle sentences' , default = False )
192+ arg_par .add_argument ('-decrease-alpha' , dest = 'decrease_alpha' , action = 'store_true' , help = 'decrease alpha' ,
193+ default = False )
194+ arg_par .add_argument ('-shuffle-sentences' , dest = 'shuffle_sentences' , action = 'store_true' , help = 'shuffle sentences' ,
195+ default = False )
222196
223197 arguments = arg_par .parse_args ()
224198
@@ -228,6 +202,7 @@ def test(args):
228202 if arguments .train :
229203 print "Running in training mode\n "
230204 train (arguments )
205+ # cProfile.run("train(arguments)")
231206
232207 elif arguments .test :
233208 print "Running in test mode\n "
@@ -236,6 +211,7 @@ def test(args):
236211 elif arguments .evaluate :
237212 print "Running in evaluation mode\n "
238213 evaluate (arguments )
214+
239215 """
240216 elif arguments.tag:
241217 print "Running in tag mode\n "
@@ -244,4 +220,4 @@ def test(args):
244220 """
245221
246222 t1 = time .time ()
247- print "\n \t Done. Total time: " + str (t1 - t0 ) + "sec.\n "
223+ print "\n \t Done. Total time: " + str (t1 - t0 ) + " sec.\n "