Skip to content

Commit 907066a

Browse files
committed
Adjust feature set
1 parent 73f2a8e commit 907066a

File tree

5 files changed

+179
-88
lines changed

5 files changed

+179
-88
lines changed

modules/featmap.py

Lines changed: 135 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,60 +2,95 @@
22
import codecs
33

44

5-
def give_features(hform, hlemma, hpos, dform, dlemma, dpos, hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos,
                  direction, distance):
    """Generate the feature strings describing one head -> dependent arc.

    Each yielded feature has the shape

        <ids>,15,16:<values>,<direction>,<distance>

    where <ids> is a comma separated list naming the attributes that are
    combined and <values> are the corresponding attribute values:

         1 = hform     2 = hpos      3 = dform     4 = dpos
         5 = hlemma    6 = dlemma
         7 = hrform    8 = hrpos     9 = hlform   10 = hlpos
        11 = drform   12 = drpos    13 = dlform   14 = dlpos
        15 = direction              16 = distance

    The h*/d* arguments belong to the head and the dependent. The hr*/hl*/
    dr*/dl* arguments are presumably the attributes of the tokens directly to
    the right/left of the head and of the dependent (that is what
    give_surrounding_information appears to compute -- confirm against it).

    A neighbour attribute equal to "__NULL__" means "no such neighbour": no
    feature mentioning that attribute is produced. The three-way neighbour
    features require BOTH neighbours to be present.
    """
    null = "__NULL__"

    def feature(ids, *values):
        # Every feature is conjoined with direction (15) and distance (16).
        # '{0}'.format(value) renders each value exactly like a positional
        # field of one big format string would.
        joined = u','.join(u'{0}'.format(value) for value in values)
        return u'{0},15,16:{1},{2},{3}'.format(ids, joined, direction, distance)

    # single attributes of head and dependent
    yield feature(u'1', hform)
    yield feature(u'2', hpos)
    yield feature(u'3', dform)
    yield feature(u'4', dpos)
    yield feature(u'5', hlemma)
    yield feature(u'6', dlemma)

    # pairs of attributes
    yield feature(u'1,4', hform, dpos)
    yield feature(u'2,3', hpos, dform)
    yield feature(u'1,2', hform, hpos)
    yield feature(u'3,4', dform, dpos)
    yield feature(u'1,3', hform, dform)
    yield feature(u'2,4', hpos, dpos)
    yield feature(u'5,4', hlemma, dpos)
    yield feature(u'2,6', hpos, dlemma)
    yield feature(u'5,2', hlemma, hpos)
    yield feature(u'6,4', dlemma, dpos)
    yield feature(u'5,6', hlemma, dlemma)

    # form based combinations of three and four attributes
    yield feature(u'1,2,3,4', hform, hpos, dform, dpos)
    yield feature(u'2,3,4', hpos, dform, dpos)
    yield feature(u'1,3,4', hform, dform, dpos)
    yield feature(u'1,2,3', hform, hpos, dform)
    yield feature(u'1,2,4', hform, hpos, dpos)

    # lemma based combinations of three and four attributes
    yield feature(u'5,2,6,4', hlemma, hpos, dlemma, dpos)
    yield feature(u'2,6,4', hpos, dlemma, dpos)
    yield feature(u'5,6,4', hlemma, dlemma, dpos)
    yield feature(u'5,2,6', hlemma, hpos, dlemma)
    yield feature(u'5,2,4', hlemma, hpos, dpos)

    # neighbour attributes, alone and paired with the token they surround
    if hrform != null:
        yield feature(u'7', hrform)
        yield feature(u'1,7', hform, hrform)
    if hrpos != null:
        yield feature(u'8', hrpos)
        yield feature(u'2,8', hpos, hrpos)
    if hlform != null:
        yield feature(u'9', hlform)
        yield feature(u'9,1', hlform, hform)
    if hlpos != null:
        yield feature(u'10', hlpos)
        yield feature(u'10,2', hlpos, hpos)
    if drform != null:
        yield feature(u'11', drform)
        yield feature(u'3,11', dform, drform)
    if drpos != null:
        yield feature(u'12', drpos)
        yield feature(u'4,12', dpos, drpos)
    if dlform != null:
        yield feature(u'13', dlform)
        yield feature(u'13,3', dlform, dform)
    if dlpos != null:
        yield feature(u'14', dlpos)
        yield feature(u'14,4', dlpos, dpos)

    # left neighbour + token + right neighbour; both sides must exist.
    # (Comparing against the full "__NULL__" sentinel on BOTH sides is the
    # fix: a truncated "__NULL" comparison is always true and lets features
    # containing the sentinel through.)
    if hlform != null and hrform != null:
        yield feature(u'9,1,7', hlform, hform, hrform)
    if hlpos != null and hrpos != null:
        yield feature(u'10,2,8', hlpos, hpos, hrpos)
    if dlform != null and drform != null:
        yield feature(u'13,3,11', dlform, dform, drform)
    if dlpos != null and drpos != null:
        yield feature(u'14,4,12', dlpos, dpos, drpos)
5591

5692

5793
def give_distance(id1, id2, direction):
58-
5994
if direction == "right":
6095
d = id1 - id2
6196
else:
@@ -80,7 +115,7 @@ def give_distance(id1, id2, direction):
80115
distance = "11-20"
81116
else:
82117
distance = ">20"
83-
118+
84119
return distance
85120

86121

@@ -93,34 +128,82 @@ def give_direction(id1, id2):
93128
return direction
94129

95130

96-
def fm(infile):
131+
def give_surrounding_information(sentence, id1, id2):
    """Return form and pos of the tokens surrounding two token ids.

    sentence -- indexable sequence of tokens exposing .form and .pos
    id1      -- id of the first token (callers pass the head's id)
    id2      -- id of the second token (callers pass the dependent's id)

    Returns the 8-tuple
        (hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos)
    i.e. right form/pos and left form/pos for id1, followed by the same for
    id2. A neighbour that does not exist is reported as "__NULL__".
    """
    hrform, hrpos, hlform, hlpos = _neighbour_attributes(sentence, id1)
    drform, drpos, dlform, dlpos = _neighbour_attributes(sentence, id2)
    return hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos


def _neighbour_attributes(sentence, token_id):
    """Return (right form, right pos, left form, left pos) around token_id."""
    null = "__NULL__"
    right_form = right_pos = left_form = left_pos = null
    size = len(sentence)

    # NOTE(review): assumes token ids are 1-based positions in `sentence`
    # with 0 reserved for the artificial root, so the token with id i lives
    # at sentence[i - 1] -- confirm against the token reader.

    # Right neighbour lives at index token_id; it exists for the root (id 0)
    # and for every token except the last one.
    if 0 <= token_id < size:
        right = sentence[token_id]
        right_form, right_pos = right.form, right.pos

    # Left neighbour lives at index token_id - 2; the root and the first
    # token have none. The explicit lower bound keeps a negative index from
    # silently wrapping around to the end of the sentence.
    if 2 <= token_id <= size:
        left = sentence[token_id - 2]
        left_form, left_pos = left.form, left.pos

    # Bounds are checked instead of special-casing ids, so a one-token
    # sentence (where id 1 is also the last id) no longer raises IndexError.
    return right_form, right_pos, left_form, left_pos
97173

174+
175+
def fm(infile):
98176
# takes a file in conll06 format, returns a feature map
99177
feat_map = {} # featmap as dictionary {feature:index}
100178
index = 0 # index in featmap
101179

102180
for sentence in sentences(codecs.open(infile, encoding='utf-8')):
181+
103182
for token1 in sentence:
104183
direction = "left"
105184
distance = give_distance(0, token1.id, direction)
185+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = "__NULL__"
186+
106187
# add root features
107188
for feature in give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form, token1.lemma, token1.pos,
108-
token1.rel, direction, distance):
189+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos, direction,
190+
distance):
109191
if feature not in feat_map:
110192
feat_map[feature] = index
111193
index += 1
112194

113195
# add other features
114196
for token2 in (token2 for token2 in sentence if token2.id != token1.id):
115197

116-
# direction
117198
direction = give_direction(token1.id, token2.id)
118-
119-
# distance
120199
distance = give_distance(token1.id, token2.id, direction)
200+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = give_surrounding_information(sentence,
201+
token1.id,
202+
token2.id)
121203

122204
for feature in give_features(token1.form, token1.lemma, token1.pos, token2.form, token2.lemma,
123-
token2.pos, token2.rel, direction, distance):
205+
token2.pos, hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos,
206+
direction, distance):
124207
if feature not in feat_map:
125208
feat_map[feature] = index
126209
index += 1

modules/graphs.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
from featmap import give_features, give_distance, give_direction
1+
from featmap import give_features, give_distance, give_direction, give_surrounding_information
22
import codecs
33

44

55
class Arc:
66
def __init__(self, mode="sparse", head=None, dependent=None, head_form=None, dependent_form=None, head_lemma=None,
7-
dependent_lemma=None, head_pos=None, dependent_pos=None, dependent_rel=None, s=0.0):
7+
dependent_lemma=None, head_pos=None, dependent_pos=None, s=0.0):
88
self.head = head
99
self.dependent = dependent
1010
self.score = s
@@ -23,8 +23,6 @@ def __init__(self, mode="sparse", head=None, dependent=None, head_form=None, dep
2323
self.head_pos = head_pos
2424
self.dependent_pos = dependent_pos
2525

26-
self.rel = dependent_rel
27-
2826

2927
class ManualSparseGraph: # heads has the same structure as in the other graph classes, but for addition of arcs
3028
def __init__(self):
@@ -54,12 +52,13 @@ def __init__(self, tokens, mode="sparse", feat_map=None, weight_vector=None):
5452
if token1.head == 0:
5553
direction = "left"
5654
distance = give_distance(0, token1.id, direction)
57-
55+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = "__NULL__"
5856
new_arc = Arc("sparse", 0, token1.id)
5957
if feat_map is not None:
6058
new_arc.feat_vec = [f for f in (feat_map[feature] for feature in
6159
give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form,
62-
token1.lemma, token1.pos, token1.rel, direction,
60+
token1.lemma, token1.pos, hrform, hrpos, hlform,
61+
hlpos, drform, drpos, dlform, dlpos, direction,
6362
distance) if feature in feat_map)]
6463
self.heads[0].append(new_arc)
6564

@@ -68,14 +67,18 @@ def __init__(self, tokens, mode="sparse", feat_map=None, weight_vector=None):
6867
for token2 in (token2 for token2 in tokens if token2.head == token1.id):
6968
direction = give_direction(token1.id, token2.id)
7069
distance = give_distance(token1.id, token2.id, direction)
70+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = give_surrounding_information(tokens,
71+
token1.id,
72+
token2.id)
7173

7274
new_arc = Arc("sparse", token1.id, token2.id)
7375
if feat_map is not None:
7476
new_arc.feat_vec = [f for f in (feat_map[feature] for feature in
7577
give_features(token1.form, token1.lemma, token1.pos,
76-
token2.form,
77-
token2.lemma, token2.pos, token2.rel, direction,
78-
distance) if feature in feat_map)]
78+
token2.form, token2.lemma, token2.pos, hrform,
79+
hrpos, hlform, hlpos, drform, drpos, dlform,
80+
dlpos, direction, distance) if
81+
feature in feat_map)]
7982
dependents.append(new_arc)
8083
if dependents:
8184
self.heads[token1.id] = dependents
@@ -88,17 +91,19 @@ def __init__(self, tokens, mode="sparse", feat_map=None, weight_vector=None):
8891

8992
direction = "left"
9093
distance = give_distance(0, token1.id, direction)
94+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = "__NULL__"
9195

9296
if mode == "complete-sparse":
9397
new_arc = Arc("sparse", 0, token1.id)
9498
else:
9599
new_arc = Arc("full", 0, token1.id, "__ROOT__", token1.form, "__ROOT__", token1.lemma, "__ROOT__",
96-
token1.pos, token1.rel)
100+
token1.pos)
97101

98102
new_arc.feat_vec = [f for f in (feat_map[feature] for feature in
99103
give_features("__ROOT__", "__ROOT__", "__ROOT__", token1.form,
100-
token1.lemma, token1.pos, token1.rel, direction, distance)
101-
if feature in feat_map)]
104+
token1.lemma, token1.pos, hrform, hrpos, hlform, hlpos,
105+
drform, drpos, dlform, dlpos, direction, distance) if
106+
feature in feat_map)]
102107
for feature in new_arc.feat_vec:
103108
new_arc.score += weight_vector[feature]
104109
self.heads[0].append(new_arc)
@@ -109,16 +114,20 @@ def __init__(self, tokens, mode="sparse", feat_map=None, weight_vector=None):
109114

110115
direction = give_direction(token1.id, token2.id)
111116
distance = give_distance(token1.id, token2.id, direction)
117+
hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos = give_surrounding_information(tokens,
118+
token1.id,
119+
token2.id)
112120

113121
if mode == "complete-sparse":
114122
new_arc = Arc("sparse", token1.id, token2.id)
115123
else:
116124
new_arc = Arc("full", token1.id, token2.id, token1.form, token2.form, token1.lemma,
117-
token2.lemma, token1.pos, token2.pos, token2.rel)
125+
token2.lemma, token1.pos, token2.pos)
118126

119127
new_arc.feat_vec = [f for f in (feat_map[feature] for feature in
120128
give_features(token1.form, token1.lemma, token1.pos, token2.form,
121-
token2.lemma, token2.pos, token2.rel, direction,
129+
token2.lemma, token2.pos, hrform, hrpos, hlform,
130+
hlpos, drform, drpos, dlform, dlpos, direction,
122131
distance) if feature in feat_map)]
123132
for feature in new_arc.feat_vec:
124133
new_arc.score += weight_vector[feature]

modules/token.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ def __init__(self, line):
1414
self.head = None
1515
else:
1616
self.head = int(entries[6])
17-
self.rel = entries[7].rstrip()
1817

1918

2019
def sentences(file_stream):

0 commit comments

Comments
 (0)