
Commit cbf947a

Optimize performance at some points, extend feature set
1 parent a4f6ea0 commit cbf947a

17 files changed: +456 additions, -627 deletions

create_screens.sh

Lines changed: 0 additions & 7 deletions
This file was deleted.

create_screens_english.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+screen -dmS "cle_1" ./start-cle_automated_english.sh 1 m_e-1 p_e-1 e_e-1
+screen -dmS "cle_2" ./start-cle_automated_english.sh 10 m_e-10 p_e-10 e_e-10
+screen -dmS "cle_3" ./start-cle_automated_english.sh 10 m_e-10_da p_e-10_da e_e-10_da -decrease-alpha
+#screen -dmS "cle_4" ./start-cle_automated_english.sh 10 m_e-10_ss p_e-10_ss e_e-10_ss -shuffle-sentences
+#screen -dmS "cle_5" ./start-cle_automated_english.sh 10 m_e-10_da_ss p_e-10_da_ss e_e-10_da_ss -decrease-alpha -shuffle-sentences

create_screens_german.sh

Lines changed: 2 additions & 2 deletions
@@ -3,5 +3,5 @@
 screen -dmS "cle_1" ./start-cle_automated_german.sh 1 m_e-1 p_e-1 e_e-1
 screen -dmS "cle_2" ./start-cle_automated_german.sh 10 m_e-10 p_e-10 e_e-10
 screen -dmS "cle_3" ./start-cle_automated_german.sh 10 m_e-10_da p_e-10_da e_e-10_da -decrease-alpha
-screen -dmS "cle_4" ./start-cle_automated_german.sh 10 m_e-10_ss p_e-10_ss e_e-10_ss -shuffle-sentences
-screen -dmS "cle_5" ./start-cle_automated_german.sh 10 m_e-10_da_ss p_e-10_da_ss e_e-10_da_ss -decrease-alpha -shuffle-sentences
+#screen -dmS "cle_4" ./start-cle_automated_german.sh 10 m_e-10_ss p_e-10_ss e_e-10_ss -shuffle-sentences
+#screen -dmS "cle_5" ./start-cle_automated_german.sh 10 m_e-10_da_ss p_e-10_da_ss e_e-10_da_ss -decrease-alpha -shuffle-sentences

dependency-parser.py

Lines changed: 56 additions & 80 deletions
@@ -1,29 +1,21 @@
 # !/bin/python
 # -*- coding: utf-8 -*-
 
+import cProfile
+
 import time
 import os
 import codecs
-import random
 
 import cPickle
 import gzip
 
-from copy import deepcopy
-
 from modules.perceptron import structured_perceptron
 from modules.token import sentences
-from modules.featmap import fm, add_feat_vec_to_sparse_graph, add_feat_vec_to_full_graph, reverse_feat_map
-from modules.graphs import CompleteFullGraph, FullGraph, SparseGraph, write_graph_to_file
+from modules.featmap import fm
+from modules.graphs import Graph, write_graph_to_file
 from modules.evaluation import evaluate
 
-def create_weight_vector(l):
-    # returns a list of length len(l) filled with 0.0
-    w = []
-    for i in range(l):
-        w.append(0.0)
-    return w
-
 
 def load(file_name):
     # load the model (weight vectors) from a file:
@@ -41,114 +33,87 @@ def save(file_name, model):
 
 
 def train(args):
-    start = time.time()
+
     print "\tCreating feature map..."
+    start = time.time()
 
-    # feat map is a dictionary with every existing feature in the training data as keys,
-    # and unique indexes as values. Example: u'hpos,dform:VBD,way': 3781
    feat_map = fm(args.in_file)
-    rev_feat_map = reverse_feat_map(feat_map)
+
     stop = time.time()
     print "\t\tNumber of features: " + str(len(feat_map))
     print "\t\tDone, " + str(stop - start) + " sec"
 
-    start = time.time()
     print "\tCreating weight vector..."
+    start = time.time()
 
-    weight_vector = create_weight_vector(len(feat_map))
+    weight_vector = [0.0 for i in xrange(len(feat_map))]
 
     stop = time.time()
-    print "\t\tNumber of features: " + str(len(feat_map))
+    print "\t\tLength of weight vector: " + str(len(weight_vector))
     print "\t\tDone, " + str(stop - start) + " sec"
 
+    print "\tCounting sentences..."
     start = time.time()
-    print "\tCreating sparse graph representation of every sentence..."
-
-    sparse_graphs = {}
 
+    sentence_count = 0
     for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
-
-        full_graph = FullGraph(sentence).heads
-        sparse_graph = SparseGraph(sentence).heads
-
-        # Check if full_graph and sparse_graph ids match at every point
-        for full_head in full_graph:
-            for full_arc in full_graph[full_head]:
-                counter = 0
-                for sparse_arc in sparse_graph[full_head]:
-                    if full_arc.dependent == sparse_arc.dependent:
-                        counter += 1
-                if counter != 1:
-                    print "Error: The full and sparse graph representations do not match."
-
-        # add feature vec to every graph
-        sparse_graph = add_feat_vec_to_sparse_graph(full_graph, sparse_graph, feat_map)
-
-        # check if every feature vector is filled with the correct number of features.
-
-        for head in sparse_graph:
-            for arc in sparse_graph[head]:
-                if arc.feat_vec:
-                    if arc.head == 0:
-                        if len(arc.feat_vec) != 10:
-                            print "Length of arc feature vector is wrong."
-                            print arc.feat_vec
-                    else:
-                        if len(arc.feat_vec) != 20:
-                            print "Length of arc feature vector is wrong."
-                            print arc.feat_vec
-                else:
-                    print "Error: Feature vector is empty."
-
-        sparse_graphs[len(sparse_graphs)] = sparse_graph
+        sentence_count += 1
 
     stop = time.time()
-    print "\t\tNumber of sentences: " + str(len(sparse_graphs))
     print "\t\tDone, " + str(stop - start) + " sec"
 
+    print "\tStart training, Total Instances: " + str(sentence_count)
     start = time.time()
-    print "\tStart training, Total Instances: " + str(len(sparse_graphs))
 
     if args.decrease_alpha:
         print "\t\tReduce smoothing coefficient activated."
 
     alpha = 0.5  # smoothing coefficient for the weight adjustments
-    graph_ids = sparse_graphs.keys()  # list of dict keys, needed when shuffeling tokens after every epoch
 
     for epoch in range(1, int(args.epochs) + 1):
 
+        start2 = time.time()
+
         print "\t\tEpoch: " + str(epoch) + ", Smoothing coefficient: " + str(alpha)
 
         total = 0
         correct = 0
         errors = 0
 
-        for graph_id in graph_ids:
-            weight_vector, correct, errors = structured_perceptron(deepcopy(sparse_graphs[graph_id]), feat_map, rev_feat_map, weight_vector, correct, errors, "train", alpha)
+        for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
+
+            sparse_graph = Graph(sentence, "sparse", feat_map).heads  # gold graph
+            complete_sparse_graph = Graph(sentence, "complete-sparse", feat_map, weight_vector).heads  # complete graph
+
+            # call the perceptron
+            weight_vector, correct, errors = structured_perceptron(complete_sparse_graph, weight_vector, correct,
+                                                                   errors, "train", sparse_graph, alpha)
+
             total += 1
+
+            # print some information every 500 sentences
            if total % 500 == 0:
+                stop2 = time.time()
                 print "\t\t\tInstance Nr. " + str(total) + "\tCorrect: " + str(correct) + "\t(" \
-                      + str((correct*100)/total) + "%)\tErrors: " + str(errors)
-                # print "\t\t\tCurrent weight vector:"
-                # print "\t\t\t" + str(weight_vector)
+                      + str((correct * 100) / total) + "%)\tErrors: " + str(errors) + "\t" + str(stop2-start2) + " sec"
+                start2 = time.time()
 
-        if args.decrease_alpha:  # decrease alpha after every epoch
+        # decrease alpha after every epoch if activated
+        if args.decrease_alpha:
             alpha /= 2
 
-        if args.shuffle_sentences:  # shuffle sentences after every epoch
-            random.shuffle(graph_ids)
-
     stop = time.time()
     print "\t\tDone, " + str(stop - start) + " sec"
 
-    start = time.time()
     print "\tSaving the model and the features to file '" + str(args.model) + "'..."
+    start = time.time()
 
     save(args.model, [feat_map, weight_vector])
 
     stop = time.time()
     print "\t\tDone, " + str(stop - start) + " sec"
 
+
 def test(args):
 
     # load classifier vectors (model) and feature vector from file:
@@ -157,47 +122,54 @@ def test(args):
 
     model_list = load(args.model)
     feat_map = model_list[0]
-    rev_feat_map = reverse_feat_map(feat_map)
     weight_vector = model_list[1]
 
     stop = time.time()
     print "\t\t" + str(len(feat_map)) + " features loaded"
     print "\t\tDone, " + str(stop - start) + " sec."
+
+    print "\tCounting sentences..."
     start = time.time()
+
     sentence_count = 0
     for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
         sentence_count += 1
 
+    stop = time.time()
+    print "\t\tDone, " + str(stop - start) + " sec"
+
     print "\tStart annotating the test file, Total Instances: " + str(sentence_count)
+    start = time.time()
 
     total = 0
     errors = 0
 
     for sentence in sentences(codecs.open(args.in_file, encoding='utf-8')):
 
-        # create complete, directed graph representation of sentence
-        full_graph = CompleteFullGraph(sentence).heads
-
-        # add feature vec
-        full_graph = add_feat_vec_to_full_graph(full_graph, feat_map)
+        # complete graph in full arc representation
+        full_graph = Graph(sentence, "complete-full", feat_map, weight_vector).heads
 
         tmp_errors = errors
 
-        predicted_graph, errors = structured_perceptron(deepcopy(full_graph), feat_map, rev_feat_map, weight_vector, 0, errors, "test")
+        # call the perceptron
+        predicted_graph, errors = structured_perceptron(full_graph, weight_vector, 0, errors, "test", None)
 
         if tmp_errors == errors:  # no error occured during prediction
             write_graph_to_file(predicted_graph, args.out_file)
-        else:  # a error occured during prediction
+        else:  # an error occured during prediction
             write_graph_to_file(full_graph, args.out_file, "error")
 
         total += 1
+
+        # print some information every 500 sentences
         if total % 500 == 0:
             print "\t\tInstance Nr. " + str(total) + "\tErrors: " + str(errors)
             # print "\t\t\tCurrent weight vector:"
             # print "\t\t\t" + str(weight_vector)
     stop = time.time()
     print "\t\tDone, " + str(stop - start) + " sec"
 
+
 if __name__ == '__main__':
 
     t0 = time.time()
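
Aside (not from the commit): the sentence-counting passes added to train() and test() could each be collapsed into a generator expression. A sketch assuming the sentences generator from modules.token and a hypothetical input path:

import codecs
from modules.token import sentences  # repo module, as imported by the parser

in_file = 'train.conll06'  # hypothetical path
sentence_count = sum(1 for _ in sentences(codecs.open(in_file, encoding='utf-8')))
print "\tTotal Instances: " + str(sentence_count)
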
@@ -217,8 +189,10 @@ def test(args):
     arg_par.add_argument('-g', '--gold', dest='gold', help='gold', default='gold.conll06')
     arg_par.add_argument('-o', '--output', dest='out_file', help='output file', default='predicted.conll06')
     arg_par.add_argument('-e', '--epochs', dest='epochs', help='epochs', default='10')
-    arg_par.add_argument('-decrease-alpha', dest='decrease_alpha', action='store_true', help='decrease alpha', default=False)
-    arg_par.add_argument('-shuffle-sentences', dest='shuffle_sentences', action='store_true', help='shuffle sentences', default=False)
+    arg_par.add_argument('-decrease-alpha', dest='decrease_alpha', action='store_true', help='decrease alpha',
+                         default=False)
+    arg_par.add_argument('-shuffle-sentences', dest='shuffle_sentences', action='store_true', help='shuffle sentences',
+                         default=False)
 
     arguments = arg_par.parse_args()
 
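A note on the reformatted flags (this hunk only rewraps them): argparse's action='store_true' already implies default=False, so the explicit default is redundant, and single-dash long options such as -decrease-alpha are legal. A minimal sketch:

import argparse

arg_par = argparse.ArgumentParser()
# store_true implies default=False; the explicit default could be dropped
arg_par.add_argument('-decrease-alpha', dest='decrease_alpha', action='store_true', help='decrease alpha')

print arg_par.parse_args(['-decrease-alpha']).decrease_alpha  # True
print arg_par.parse_args([]).decrease_alpha                   # False
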
@@ -228,6 +202,7 @@ def test(args):
     if arguments.train:
         print "Running in training mode\n"
         train(arguments)
+        # cProfile.run("train(arguments)")
 
     elif arguments.test:
         print "Running in test mode\n"
@@ -236,6 +211,7 @@ def test(args):
     elif arguments.evaluate:
         print "Running in evaluation mode\n"
         evaluate(arguments)
+
     """
     elif arguments.tag:
         print "Running in tag mode\n"
@@ -244,4 +220,4 @@ def test(args):
     """
 
     t1 = time.time()
-    print "\n\tDone. Total time: " + str(t1 - t0) + "sec.\n"
+    print "\n\tDone. Total time: " + str(t1 - t0) + " sec.\n"
