#!/bin/python
# -*- coding: utf-8 -*-

# NOTE(review): this file was recovered from a rendered git diff; the import
# block below is the post-commit state with the diff markers stripped.

import cProfile  # kept for the commented-out profiling call in __main__

import time
import os
import codecs

import cPickle
import gzip

from modules.perceptron import structured_perceptron
from modules.token import sentences
from modules.featmap import fm
from modules.graphs import Graph, write_graph_to_file
from modules.evaluation import evaluate
def create_weight_vector(l):
    """Return the initial (all-zero) weight vector.

    Args:
        l: desired length of the vector, i.e. the number of features.

    Returns:
        list of float: a list of ``l`` zeros (``[0.0] * l``).
    """
    # Sequence repetition replaces the original append loop; same result,
    # built in one C-level step.
    return [0.0] * l
2820def load (file_name ):
2921 # load the model (weight vectors) from a file:
@@ -41,114 +33,87 @@ def save(file_name, model):
4133
4234
4335def train (args ):
44- start = time . time ()
36+
4537 print "\t Creating feature map..."
38+ start = time .time ()
4639
47- # feat map is a dictionary with every existing feature in the training data as keys,
48- # and unique indexes as values. Example: u'hpos,dform:VBD,way': 3781
4940 feat_map = fm (args .in_file )
50- rev_feat_map = reverse_feat_map ( feat_map )
41+
5142 stop = time .time ()
5243 print "\t \t Number of features: " + str (len (feat_map ))
5344 print "\t \t Done, " + str (stop - start ) + " sec"
5445
55- start = time .time ()
5646 print "\t Creating weight vector..."
47+ start = time .time ()
5748
58- weight_vector = create_weight_vector (len (feat_map ))
49+ weight_vector = [ 0.0 for i in xrange (len (feat_map ))]
5950
6051 stop = time .time ()
61- print "\t \t Number of features : " + str (len (feat_map ))
52+ print "\t \t Length of weight vector : " + str (len (weight_vector ))
6253 print "\t \t Done, " + str (stop - start ) + " sec"
6354
55+ print "\t Counting sentences..."
6456 start = time .time ()
65- print "\t Creating sparse graph representation of every sentence..."
66-
67- sparse_graphs = {}
6857
58+ sentence_count = 0
6959 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
70-
71- full_graph = FullGraph (sentence ).heads
72- sparse_graph = SparseGraph (sentence ).heads
73-
74- # Check if full_graph and sparse_graph ids match at every point
75- for full_head in full_graph :
76- for full_arc in full_graph [full_head ]:
77- counter = 0
78- for sparse_arc in sparse_graph [full_head ]:
79- if full_arc .dependent == sparse_arc .dependent :
80- counter += 1
81- if counter != 1 :
82- print "Error: The full and sparse graph representations do not match."
83-
84- # add feature vec to every graph
85- sparse_graph = add_feat_vec_to_sparse_graph (full_graph , sparse_graph , feat_map )
86-
87- # check if every feature vector is filled with the correct number of features.
88-
89- for head in sparse_graph :
90- for arc in sparse_graph [head ]:
91- if arc .feat_vec :
92- if arc .head == 0 :
93- if len (arc .feat_vec ) != 10 :
94- print "Length of arc feature vector is wrong."
95- print arc .feat_vec
96- else :
97- if len (arc .feat_vec ) != 20 :
98- print "Length of arc feature vector is wrong."
99- print arc .feat_vec
100- else :
101- print "Error: Feature vector is empty."
102-
103- sparse_graphs [len (sparse_graphs )] = sparse_graph
60+ sentence_count += 1
10461
10562 stop = time .time ()
106- print "\t \t Number of sentences: " + str (len (sparse_graphs ))
10763 print "\t \t Done, " + str (stop - start ) + " sec"
10864
65+ print "\t Start training, Total Instances: " + str (sentence_count )
10966 start = time .time ()
110- print "\t Start training, Total Instances: " + str (len (sparse_graphs ))
11167
11268 if args .decrease_alpha :
11369 print "\t \t Reduce smoothing coefficient activated."
11470
11571 alpha = 0.5 # smoothing coefficient for the weight adjustments
116- graph_ids = sparse_graphs .keys () # list of dict keys, needed when shuffeling tokens after every epoch
11772
11873 for epoch in range (1 , int (args .epochs ) + 1 ):
11974
75+ start2 = time .time ()
76+
12077 print "\t \t Epoch: " + str (epoch ) + ", Smoothing coefficient: " + str (alpha )
12178
12279 total = 0
12380 correct = 0
12481 errors = 0
12582
126- for graph_id in graph_ids :
127- weight_vector , correct , errors = structured_perceptron (deepcopy (sparse_graphs [graph_id ]), feat_map , rev_feat_map , weight_vector , correct , errors , "train" , alpha )
83+ for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
84+
85+ sparse_graph = Graph (sentence , "sparse" , feat_map ).heads # gold graph
86+ complete_sparse_graph = Graph (sentence , "complete-sparse" , feat_map , weight_vector ).heads # complete graph
87+
88+ # call the perceptron
89+ weight_vector , correct , errors = structured_perceptron (complete_sparse_graph , weight_vector , correct ,
90+ errors , "train" , sparse_graph , alpha )
91+
12892 total += 1
93+
94+ # print some information every 500 sentences
12995 if total % 500 == 0 :
96+ stop2 = time .time ()
13097 print "\t \t \t Instance Nr. " + str (total ) + "\t Correct: " + str (correct ) + "\t (" \
131- + str ((correct * 100 )/ total ) + "%)\t Errors: " + str (errors )
132- # print "\t\t\tCurrent weight vector:"
133- # print "\t\t\t" + str(weight_vector)
98+ + str ((correct * 100 ) / total ) + "%)\t Errors: " + str (errors ) + "\t " + str (stop2 - start2 ) + " sec"
99+ start2 = time .time ()
134100
135- if args .decrease_alpha : # decrease alpha after every epoch
101+ # decrease alpha after every epoch if activated
102+ if args .decrease_alpha :
136103 alpha /= 2
137104
138- if args .shuffle_sentences : # shuffle sentences after every epoch
139- random .shuffle (graph_ids )
140-
141105 stop = time .time ()
142106 print "\t \t Done, " + str (stop - start ) + " sec"
143107
144- start = time .time ()
145108 print "\t Saving the model and the features to file '" + str (args .model ) + "'..."
109+ start = time .time ()
146110
147111 save (args .model , [feat_map , weight_vector ])
148112
149113 stop = time .time ()
150114 print "\t \t Done, " + str (stop - start ) + " sec"
151115
116+
152117def test (args ):
153118
154119 # load classifier vectors (model) and feature vector from file:
@@ -157,47 +122,54 @@ def test(args):
157122
158123 model_list = load (args .model )
159124 feat_map = model_list [0 ]
160- rev_feat_map = reverse_feat_map (feat_map )
161125 weight_vector = model_list [1 ]
162126
163127 stop = time .time ()
164128 print "\t \t " + str (len (feat_map )) + " features loaded"
165129 print "\t \t Done, " + str (stop - start ) + " sec."
130+
131+ print "\t Counting sentences..."
166132 start = time .time ()
133+
167134 sentence_count = 0
168135 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
169136 sentence_count += 1
170137
138+ stop = time .time ()
139+ print "\t \t Done, " + str (stop - start ) + " sec"
140+
171141 print "\t Start annotating the test file, Total Instances: " + str (sentence_count )
142+ start = time .time ()
172143
173144 total = 0
174145 errors = 0
175146
176147 for sentence in sentences (codecs .open (args .in_file , encoding = 'utf-8' )):
177148
178- # create complete, directed graph representation of sentence
179- full_graph = CompleteFullGraph (sentence ).heads
180-
181- # add feature vec
182- full_graph = add_feat_vec_to_full_graph (full_graph , feat_map )
149+ # complete graph in full arc representation
150+ full_graph = Graph (sentence , "complete-full" , feat_map , weight_vector ).heads
183151
184152 tmp_errors = errors
185153
186- predicted_graph , errors = structured_perceptron (deepcopy (full_graph ), feat_map , rev_feat_map , weight_vector , 0 , errors , "test" )
154+ # call the perceptron
155+ predicted_graph , errors = structured_perceptron (full_graph , weight_vector , 0 , errors , "test" , None )
187156
188157 if tmp_errors == errors : # no error occured during prediction
189158 write_graph_to_file (predicted_graph , args .out_file )
190- else : # a error occured during prediction
159+ else : # an error occured during prediction
191160 write_graph_to_file (full_graph , args .out_file , "error" )
192161
193162 total += 1
163+
164+ # print some information every 500 sentences
194165 if total % 500 == 0 :
195166 print "\t \t Instance Nr. " + str (total ) + "\t Errors: " + str (errors )
196167 # print "\t\t\tCurrent weight vector:"
197168 # print "\t\t\t" + str(weight_vector)
198169 stop = time .time ()
199170 print "\t \t Done, " + str (stop - start ) + " sec"
200171
172+
201173if __name__ == '__main__' :
202174
203175 t0 = time .time ()
@@ -217,8 +189,10 @@ def test(args):
217189 arg_par .add_argument ('-g' , '--gold' , dest = 'gold' , help = 'gold' , default = 'gold.conll06' )
218190 arg_par .add_argument ('-o' , '--output' , dest = 'out_file' , help = 'output file' , default = 'predicted.conll06' )
219191 arg_par .add_argument ('-e' , '--epochs' , dest = 'epochs' , help = 'epochs' , default = '10' )
220- arg_par .add_argument ('-decrease-alpha' , dest = 'decrease_alpha' , action = 'store_true' , help = 'decrease alpha' , default = False )
221- arg_par .add_argument ('-shuffle-sentences' , dest = 'shuffle_sentences' , action = 'store_true' , help = 'shuffle sentences' , default = False )
192+ arg_par .add_argument ('-decrease-alpha' , dest = 'decrease_alpha' , action = 'store_true' , help = 'decrease alpha' ,
193+ default = False )
194+ arg_par .add_argument ('-shuffle-sentences' , dest = 'shuffle_sentences' , action = 'store_true' , help = 'shuffle sentences' ,
195+ default = False )
222196
223197 arguments = arg_par .parse_args ()
224198
@@ -228,6 +202,7 @@ def test(args):
228202 if arguments .train :
229203 print "Running in training mode\n "
230204 train (arguments )
205+ # cProfile.run("train(arguments)")
231206
232207 elif arguments .test :
233208 print "Running in test mode\n "
@@ -236,6 +211,7 @@ def test(args):
236211 elif arguments .evaluate :
237212 print "Running in evaluation mode\n "
238213 evaluate (arguments )
214+
239215 """
240216 elif arguments.tag:
241217 print "Running in tag mode\n "
@@ -244,4 +220,4 @@ def test(args):
244220 """
245221
246222 t1 = time .time ()
247- print "\n \t Done. Total time: " + str (t1 - t0 ) + "sec.\n "
223+ print "\n \t Done. Total time: " + str (t1 - t0 ) + " sec.\n "