11from token import sentences
22import codecs
33
# Feature templates, in the exact order the features are emitted.
# Each entry lists the slot ids combined in one feature:
#   1 = hform, 2 = hpos, 3 = dform, 4 = dpos, 5 = bpos, 6 = hlemma, 7 = dlemma
_FEATURE_TEMPLATES = (
    # single slots
    (u'1',), (u'2',), (u'3',), (u'4',), (u'6',), (u'7',), (u'5',),
    # slot pairs
    (u'1', u'4'), (u'2', u'3'), (u'1', u'2'), (u'3', u'4'),
    (u'1', u'3'), (u'2', u'4'), (u'6', u'4'), (u'2', u'7'),
    (u'6', u'2'), (u'7', u'4'), (u'6', u'7'),
    # form-based combinations
    (u'1', u'2', u'3', u'4'), (u'2', u'3', u'4'), (u'1', u'3', u'4'),
    (u'1', u'2', u'3'), (u'1', u'2', u'4'), (u'2', u'5', u'4'),
    (u'2', u'5', u'3'), (u'1', u'5', u'4'), (u'1', u'5', u'3'),
    # lemma-based combinations
    (u'6', u'2', u'7', u'4'), (u'2', u'7', u'4'), (u'6', u'7', u'4'),
    (u'6', u'2', u'7'), (u'6', u'2', u'4'), (u'2', u'5', u'7'),
    (u'6', u'5', u'4'), (u'6', u'5', u'7'),
)


def give_features(hform, hlemma, hpos, dform, dlemma, dpos, bpos, direction, distance):
    """Generate the feature strings for one pair of tokens.

    Every feature has the shape ``<slot ids>,dir,dist:<slot values>,<direction>,<distance>``,
    where the slot ids and values are joined with commas. The features are
    yielded in the fixed order given by ``_FEATURE_TEMPLATES``.

    The h*/d* arguments are presumably the head and dependent token
    attributes -- confirm against the callers.
    """
    slot_values = {
        u'1': hform,
        u'2': hpos,
        u'3': dform,
        u'4': dpos,
        u'5': bpos,
        u'6': hlemma,
        u'7': dlemma,
    }

    for slots in _FEATURE_TEMPLATES:
        arguments = [slot_values[slot] for slot in slots]
        arguments.append(direction)
        arguments.append(distance)

        # Values are passed as format arguments rather than concatenated, so
        # they are rendered exactly as str.format would render them.
        label = u','.join(slots) + u',dir,dist:'
        placeholders = u','.join(u'{%d}' % i for i in range(len(arguments)))
        yield (label + placeholders).format(*arguments)
55+
56+
def give_distance(id1, id2, direction):
    """Return the distance between two token ids as a bucket label.

    The raw distance is ``id1 - id2`` when ``direction`` is "right" and
    ``id2 - id1`` for any other value of ``direction``.

    Returns one of "1", "2", "3", "4", "5", "6-10", "11-20" or ">20".
    If the raw distance is below 1, an error message is printed and
    "__ERROR__" is returned instead.
    """
    if direction == "right":
        d = id1 - id2
    else:
        d = id2 - id1

    if d < 1:
        # Single-argument parenthesised form: valid on Python 2 and Python 3
        # (the bare ``print "..."`` statement is a SyntaxError on Python 3).
        print("Error in distance computing, distance is too low.")
        return "__ERROR__"

    # Small distances get their own label.
    exact_labels = {1: "1", 2: "2", 3: "3", 4: "4", 5: "5"}
    if d in exact_labels:
        return exact_labels[d]

    # Larger distances are grouped into coarse buckets.
    if 5 < d <= 10:
        return "6-10"
    if 10 < d <= 20:
        return "11-20"
    return ">20"
85+
86+
def give_direction(id1, id2):
    """Return "right" when ``id2`` is smaller than ``id1``, otherwise "left".

    Equal ids yield "left".
    """
    return "right" if id2 < id1 else "left"
5494
5595
5696def fm (infile ):
97+
5798 # takes a file in conll06 format, returns a feature map
5899 feat_map = {} # featmap as dictionary {feature:index}
59100 index = 0 # index in featmap
101+
60102 for sentence in sentences (codecs .open (infile , encoding = 'utf-8' )):
61103 for token1 in sentence :
62-
104+ direction = "left"
105+ distance = give_distance (0 , token1 .id , direction )
63106 # add root features
64- for feature in give_features ("__ROOT__" , "__ROOT__" , "__ROOT__" , token1 .form , token1 .lemma , token1 .pos , token1 .rel ):
107+ for feature in give_features ("__ROOT__" , "__ROOT__" , "__ROOT__" , token1 .form , token1 .lemma , token1 .pos ,
108+ token1 .rel , direction , distance ):
65109 if feature not in feat_map :
66110 feat_map [feature ] = index
67111 index += 1
68112
69113 # add other features
70- for token2 in sentence :
71- for feature in give_features (token1 .form , token1 .lemma , token1 .pos , token2 .form , token2 .lemma , token2 .pos , token2 .rel ):
114+ for token2 in (token2 for token2 in sentence if token2 .id != token1 .id ):
115+
116+ # direction
117+ direction = give_direction (token1 .id , token2 .id )
118+
119+ # distance
120+ distance = give_distance (token1 .id , token2 .id , direction )
121+
122+ for feature in give_features (token1 .form , token1 .lemma , token1 .pos , token2 .form , token2 .lemma ,
123+ token2 .pos , token2 .rel , direction , distance ):
72124 if feature not in feat_map :
73125 feat_map [feature ] = index
74126 index += 1
0 commit comments