22import codecs
33
44
def give_features(hform, hlemma, hpos, dform, dlemma, dpos, hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos,
                  direction, distance):
    """Yield feature strings describing one head/dependent pair.

    Every feature has the shape ``<ids>:<values>``, where ``<ids>`` is a
    comma-separated list of the numeric codes below and ``<values>`` are the
    corresponding argument values in the same order. Direction and distance
    are appended to every feature.

        1 = hform     2 = hpos      3 = dform     4 = dpos
        5 = hlemma    6 = dlemma
        7 = hrform    8 = hrpos     9 = hlform   10 = hlpos
       11 = drform   12 = drpos    13 = dlform   14 = dlpos
       15 = direction              16 = distance

    The neighbour arguments (codes 7-14) use the string ``"__NULL__"`` to
    signal "no such neighbour"; features involving a missing neighbour are
    not produced.

    The order of the yielded features is fixed, so callers that assign
    indices by first appearance get a stable numbering.
    """
    null = "__NULL__"  # sentinel for a missing neighbour

    # --- unigram features ---
    yield u'1,15,16:{0},{1},{2}'.format(hform, direction, distance)
    yield u'2,15,16:{0},{1},{2}'.format(hpos, direction, distance)
    yield u'3,15,16:{0},{1},{2}'.format(dform, direction, distance)
    yield u'4,15,16:{0},{1},{2}'.format(dpos, direction, distance)
    yield u'5,15,16:{0},{1},{2}'.format(hlemma, direction, distance)
    yield u'6,15,16:{0},{1},{2}'.format(dlemma, direction, distance)

    # --- bigram features ---
    yield u'1,4,15,16:{0},{1},{2},{3}'.format(hform, dpos, direction, distance)
    yield u'2,3,15,16:{0},{1},{2},{3}'.format(hpos, dform, direction, distance)
    yield u'1,2,15,16:{0},{1},{2},{3}'.format(hform, hpos, direction, distance)
    yield u'3,4,15,16:{0},{1},{2},{3}'.format(dform, dpos, direction, distance)
    yield u'1,3,15,16:{0},{1},{2},{3}'.format(hform, dform, direction, distance)
    yield u'2,4,15,16:{0},{1},{2},{3}'.format(hpos, dpos, direction, distance)
    yield u'5,4,15,16:{0},{1},{2},{3}'.format(hlemma, dpos, direction, distance)
    yield u'2,6,15,16:{0},{1},{2},{3}'.format(hpos, dlemma, direction, distance)
    yield u'5,2,15,16:{0},{1},{2},{3}'.format(hlemma, hpos, direction, distance)
    yield u'6,4,15,16:{0},{1},{2},{3}'.format(dlemma, dpos, direction, distance)
    yield u'5,6,15,16:{0},{1},{2},{3}'.format(hlemma, dlemma, direction, distance)

    # --- form/pos combinations of head and dependent ---
    yield u'1,2,3,4,15,16:{0},{1},{2},{3},{4},{5}'.format(hform, hpos, dform, dpos, direction, distance)
    yield u'2,3,4,15,16:{0},{1},{2},{3},{4}'.format(hpos, dform, dpos, direction, distance)
    yield u'1,3,4,15,16:{0},{1},{2},{3},{4}'.format(hform, dform, dpos, direction, distance)
    yield u'1,2,3,15,16:{0},{1},{2},{3},{4}'.format(hform, hpos, dform, direction, distance)
    yield u'1,2,4,15,16:{0},{1},{2},{3},{4}'.format(hform, hpos, dpos, direction, distance)

    # --- lemma/pos combinations of head and dependent ---
    yield u'5,2,6,4,15,16:{0},{1},{2},{3},{4},{5}'.format(hlemma, hpos, dlemma, dpos, direction, distance)
    yield u'2,6,4,15,16:{0},{1},{2},{3},{4}'.format(hpos, dlemma, dpos, direction, distance)
    yield u'5,6,4,15,16:{0},{1},{2},{3},{4}'.format(hlemma, dlemma, dpos, direction, distance)
    yield u'5,2,6,15,16:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dlemma, direction, distance)
    yield u'5,2,4,15,16:{0},{1},{2},{3},{4}'.format(hlemma, hpos, dpos, direction, distance)

    # --- neighbour features: emitted only when the neighbour exists ---
    if hrform != null:
        yield u'7,15,16:{0},{1},{2}'.format(hrform, direction, distance)
        yield u'1,7,15,16:{0},{1},{2},{3}'.format(hform, hrform, direction, distance)
    if hrpos != null:
        yield u'8,15,16:{0},{1},{2}'.format(hrpos, direction, distance)
        yield u'2,8,15,16:{0},{1},{2},{3}'.format(hpos, hrpos, direction, distance)
    if hlform != null:
        yield u'9,15,16:{0},{1},{2}'.format(hlform, direction, distance)
        yield u'9,1,15,16:{0},{1},{2},{3}'.format(hlform, hform, direction, distance)
    if hlpos != null:
        yield u'10,15,16:{0},{1},{2}'.format(hlpos, direction, distance)
        yield u'10,2,15,16:{0},{1},{2},{3}'.format(hlpos, hpos, direction, distance)
    if drform != null:
        yield u'11,15,16:{0},{1},{2}'.format(drform, direction, distance)
        yield u'3,11,15,16:{0},{1},{2},{3}'.format(dform, drform, direction, distance)
    if drpos != null:
        yield u'12,15,16:{0},{1},{2}'.format(drpos, direction, distance)
        yield u'4,12,15,16:{0},{1},{2},{3}'.format(dpos, drpos, direction, distance)
    if dlform != null:
        yield u'13,15,16:{0},{1},{2}'.format(dlform, direction, distance)
        yield u'13,3,15,16:{0},{1},{2},{3}'.format(dlform, dform, direction, distance)
    if dlpos != null:
        yield u'14,15,16:{0},{1},{2}'.format(dlpos, direction, distance)
        yield u'14,4,15,16:{0},{1},{2},{3}'.format(dlpos, dpos, direction, distance)

    # --- left/token/right trigrams: both neighbours must exist ---
    # Both sides are compared against the full sentinel; a truncated
    # "__NULL" comparison would never filter out a missing right neighbour.
    if hlform != null and hrform != null:
        yield u'9,1,7,15,16:{0},{1},{2},{3},{4}'.format(hlform, hform, hrform, direction, distance)
    if hlpos != null and hrpos != null:
        yield u'10,2,8,15,16:{0},{1},{2},{3},{4}'.format(hlpos, hpos, hrpos, direction, distance)
    if dlform != null and drform != null:
        yield u'13,3,11,15,16:{0},{1},{2},{3},{4}'.format(dlform, dform, drform, direction, distance)
    if dlpos != null and drpos != null:
        yield u'14,4,12,15,16:{0},{1},{2},{3},{4}'.format(dlpos, dpos, drpos, direction, distance)
5591
5692
5793def give_distance (id1 , id2 , direction ):
58-
5994 if direction == "right" :
6095 d = id1 - id2
6196 else :
@@ -80,7 +115,7 @@ def give_distance(id1, id2, direction):
80115 distance = "11-20"
81116 else :
82117 distance = ">20"
83-
118+
84119 return distance
85120
86121
@@ -93,34 +128,82 @@ def give_direction(id1, id2):
93128 return direction
94129
95130
96- def fm (infile ):
def _neighbours(sentence, token_id):
    # Return (right_form, right_pos, left_form, left_pos) for a 1-based token id.
    null = "__NULL__"
    right_form = right_pos = left_form = left_pos = null

    # The token with id i sits at sentence[i - 1], so its right neighbour is
    # sentence[i] and its left neighbour is sentence[i - 2]. Id 0 is the
    # artificial root, whose right neighbour is the first token.
    if 0 <= token_id < len(sentence):
        right = sentence[token_id]
        right_form, right_pos = right.form, right.pos
    if 2 <= token_id <= len(sentence) + 1:
        left = sentence[token_id - 2]
        left_form, left_pos = left.form, left.pos

    return right_form, right_pos, left_form, left_pos


def give_surrounding_information(sentence, id1, id2):
    """Return form and pos of the tokens adjacent to the head and the dependent.

    sentence -- sequence of tokens exposing ``.form`` and ``.pos``; the token
                with id ``i`` is expected at ``sentence[i - 1]``
    id1      -- id of the head (0 stands for the artificial root)
    id2      -- id of the dependent

    Returns the 8-tuple
    ``(hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos)``.
    A neighbour that does not exist (left of the first token, right of the
    last token, anything around a token in a one-token sentence) is reported
    as ``"__NULL__"`` instead of raising IndexError.
    """
    hrform, hrpos, hlform, hlpos = _neighbours(sentence, id1)
    drform, drpos, dlform, dlpos = _neighbours(sentence, id2)
    return hrform, hrpos, hlform, hlpos, drform, drpos, dlform, dlpos
97173
174+
175+ def fm (infile ):
98176 # takes a file in conll06 format, returns a feature map
99177 feat_map = {} # featmap as dictionary {feature:index}
100178 index = 0 # index in featmap
101179
102180 for sentence in sentences (codecs .open (infile , encoding = 'utf-8' )):
181+
103182 for token1 in sentence :
104183 direction = "left"
105184 distance = give_distance (0 , token1 .id , direction )
185+ hrform , hrpos , hlform , hlpos , drform , drpos , dlform , dlpos = "__NULL__"
186+
106187 # add root features
107188 for feature in give_features ("__ROOT__" , "__ROOT__" , "__ROOT__" , token1 .form , token1 .lemma , token1 .pos ,
108- token1 .rel , direction , distance ):
189+ hrform , hrpos , hlform , hlpos , drform , drpos , dlform , dlpos , direction ,
190+ distance ):
109191 if feature not in feat_map :
110192 feat_map [feature ] = index
111193 index += 1
112194
113195 # add other features
114196 for token2 in (token2 for token2 in sentence if token2 .id != token1 .id ):
115197
116- # direction
117198 direction = give_direction (token1 .id , token2 .id )
118-
119- # distance
120199 distance = give_distance (token1 .id , token2 .id , direction )
200+ hrform , hrpos , hlform , hlpos , drform , drpos , dlform , dlpos = give_surrounding_information (sentence ,
201+ token1 .id ,
202+ token2 .id )
121203
122204 for feature in give_features (token1 .form , token1 .lemma , token1 .pos , token2 .form , token2 .lemma ,
123- token2 .pos , token2 .rel , direction , distance ):
205+ token2 .pos , hrform , hrpos , hlform , hlpos , drform , drpos , dlform , dlpos ,
206+ direction , distance ):
124207 if feature not in feat_map :
125208 feat_map [feature ] = index
126209 index += 1
0 commit comments