# Source: kaiser/target_identifier.py (machinereading/kaiser on GitHub, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

# coding: utf-8

# In[1]:


import os
import json
from collections import Counter
import jpype
import sys
sys.path.append('../')


# In[2]:


# Korean FrameNet interface. The argument 1.1 is presumably the data version
# (it matches the 'targetdic-1.1.json' file name loaded by targetIdentifier)
# -- TODO confirm. NOTE(review): `kfn` is not referenced by the code visible
# in this file.
from kaiser.koreanframenet import koreanframenet
kfn = koreanframenet.interface(1.1)
# Single module-level Kkma tagger instance; targetIdentifier.targetize() calls
# kkma.pos() on every token it analyses.
from konlpy.tag import Kkma
kkma = Kkma()

# Directory that contains this module; the bundled 'data/' folder is resolved
# relative to it by targetIdentifier.__init__.
try:
    target_dir = os.path.dirname(os.path.abspath(__file__))
except (NameError, OSError):
    # NameError: __file__ is undefined when the code runs interactively (this
    # module is a notebook export). OSError: the working directory could not
    # be resolved. In both cases fall back to the current directory instead of
    # using a bare `except`, which would also swallow KeyboardInterrupt and
    # SystemExit.
    target_dir = '.'


# In[27]:


class targetIdentifier():
    """Find frame-evoking targets (lexical units) in pre-tokenized text.

    Each token is analysed with the module-level Kkma tagger, candidate
    ``(lemma, pos)`` pairs are derived from its morphemes, and the candidates
    are looked up in the target dictionary loaded from
    ``data/targetdic-1.1.json`` (resolved relative to ``target_dir``).

    Args:
        srl: Stored on the instance; not read by any method of this class.
        language: Stored on the instance; not read by any method of this class.
        only_lu: When true, only lexical units found in the target dictionary
            can be returned. When false, every candidate additionally votes
            for a synthetic ``"<candidate>.<pos>"`` unit.
    """

    # Kkma tags whose presence makes a token count as verbal/adjectival
    # (per the Kkma tag set: XSV verb-deriving suffix, VV verb, VA adjective).
    _PREDICATE_TAGS = ('XSV', 'VV', 'VA')
    # Kkma tag -> part-of-speech suffix used in lexical-unit names.
    # Only common nouns (NNG) are mapped; proper nouns (NNP) are not targets.
    _POS_BY_TAG = {'VV': 'v', 'VA': 'a', 'NNG': 'n'}
    # Tags that, besides any tag starting with 'J', mark a noun token as
    # carrying attached functional morphemes.
    _ATTACHED_TAGS = ('VCP', 'EFN')

    def __init__(self, srl='framenet', language='ko', only_lu=True):
        self.srl = srl
        self.language = language
        self.only_lu = only_lu

        dic_path = os.path.join(target_dir, 'data', 'targetdic-1.1.json')
        # Explicit UTF-8: without it the platform's locale encoding is used,
        # which breaks on non-ASCII dictionary entries on some systems.
        with open(dic_path, 'r', encoding='utf-8') as f:
            # Keys are lexical-unit names ending in '.<pos>'; values are
            # containers tested with `in` (presumably lists of lemma forms --
            # TODO confirm against the data file).
            self.targetdic = json.load(f)

    @staticmethod
    def _shares_first_char(morph, word):
        """Return True when both strings are non-empty and start alike."""
        return bool(morph) and bool(word) and morph[0] == word[0]

    def targetize(self, word):
        """Return candidate ``(lemma, pos)`` targets for a single token.

        Args:
            word: One whitespace-delimited token.

        Returns:
            A list of ``(morpheme_text, pos)`` tuples with ``pos`` in
            ``'v'``, ``'a'`` or ``'n'``, in morpheme order. Empty when no
            morpheme qualifies.
        """
        # Kkma runs inside a JVM; attach the calling thread before using it.
        jpype.attachThreadToJVM()
        morps = kkma.pos(word)
        candidates = []

        if any(tag in self._PREDICATE_TAGS for _, tag in morps):
            for index, (morph, tag) in enumerate(morps):
                if tag in self._POS_BY_TAG:
                    # Verb, adjective or common noun that starts the token.
                    if self._shares_first_char(morph, word):
                        candidates.append((morph, self._POS_BY_TAG[tag]))
                elif tag == 'XSV' and index > 0:
                    if self._shares_first_char(morph, word):
                        candidates.append((morph, 'v'))
                    # A verb-deriving suffix also forms a verb stem together
                    # with the morpheme that precedes it.
                    joined = morps[index - 1][0] + morph
                    if self._shares_first_char(joined, word):
                        candidates.append((joined, 'v'))
            return candidates

        # No verbal morpheme: only common nouns can be targets.
        has_attached = any(
            tag.startswith('J') or tag in self._ATTACHED_TAGS
            for _, tag in morps
        )
        for morph, tag in morps:
            if tag != 'NNG':
                continue
            if has_attached:
                # Noun followed by attached morphemes: it must start the token.
                if self._shares_first_char(morph, word):
                    candidates.append((morph, 'n'))
            elif morph == word:
                # Bare noun: the morpheme must be the whole token.
                candidates.append((morph, 'n'))
        return candidates

    def get_lu_by_token(self, token):
        """Return the most frequently matched lexical unit for ``token``.

        Every candidate from :meth:`targetize` votes for each dictionary
        entry that contains it and whose ``.<pos>`` suffix equals the
        candidate's pos. When ``only_lu`` is false the candidate also votes
        for ``"<candidate>.<pos>"`` itself.

        Returns:
            The lexical-unit name with the most votes (first seen wins ties),
            or ``False`` when there are no votes.
        """
        votes = []
        for target_candi, word_pos in self.targetize(token):
            for lu, forms in self.targetdic.items():
                if target_candi in forms and lu.split('.')[-1] == word_pos:
                    votes.append(lu)
            if not self.only_lu:
                votes.append(target_candi + '.' + word_pos)

        if not votes:
            return False
        return Counter(votes).most_common()[0][0]

    def target_id(self, input_conll):
        """Build one ``[tokens, lus]`` instance per token that evokes an LU.

        Args:
            input_conll: Sequence whose first element is the token list.

        Returns:
            A list of ``[tokens, lus]`` pairs. ``lus`` has the same length as
            ``tokens`` and holds ``'_'`` everywhere except the target
            position, which holds the lexical-unit name. The same ``tokens``
            object is referenced by every instance.
        """
        tokens = input_conll[0]
        result = []
        for idx, token in enumerate(tokens):
            lu = self.get_lu_by_token(token)
            if lu:
                lus = ['_'] * len(tokens)
                lus[idx] = lu
                result.append([tokens, lus])
        return result

    def pred_id(self, input_conll):
        """Build one ``[tokens, lus]`` instance per predicate token.

        A token is a predicate when :meth:`targetize` yields at least one
        candidate with pos ``'v'`` or ``'a'``. Exactly one instance is emitted
        per such token, even if several candidates qualify, so the result
        contains no duplicate instances.

        Args:
            input_conll: Sequence whose first element is the token list.

        Returns:
            A list of ``[tokens, lus]`` pairs where ``lus`` holds ``'PRED'``
            at the predicate position and ``'_'`` elsewhere.
        """
        tokens = input_conll[0]
        result = []
        for idx, token in enumerate(tokens):
            candidates = self.targetize(token)
            if any(word_pos in ('v', 'a') for _, word_pos in candidates):
                lus = ['_'] * len(tokens)
                lus[idx] = 'PRED'
                result.append([tokens, lus])
        return result


# In[35]:


# text = '애플은 스티브 잡스와 스티브 워즈니악과 론 웨인이 1976년에 설립한 컴퓨터 회사이다.'
# text = '헤밍웨이는 1899년 7월 21일 미국 일리노이에서 태어났고 62세에 자살로 사망했다.'
# text = '헤밍웨이는 풀린 파이퍼와 이혼한 뒤 마사 겔혼과 재혼하였다'
# text = '애플은 스티브 잡스와 스티브 워즈니악과 론 웨인이 1976년에 설립한 회사이다.'
# text = '헤밍웨이는 태어났고 마사 겔혼과 이혼하였다.'
# text = '헤밍웨이는 풀린 파이퍼와 이혼한 뒤 마사 겔혼과 재혼하였다'
# text = '멜 깁슨이 출연한 영화를 제작한 사람은 누구인가?'
# targetid = targetIdentifier(only_lu=False)

# i = [text.split(' ')]
# # d = targetid.pred_id(i)
# # print(d)
# # print('')
# d = targetid.target_id(i)
# from pprint import pprint
# pprint(d)