-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathtokens.js
More file actions
200 lines (172 loc) · 4.86 KB
/
tokens.js
File metadata and controls
200 lines (172 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import { getSimilarity } from './util'
/**
 * A processed text: an ordered list of Token objects plus the raw entity,
 * sentence and noun-chunk ranges it was built from.
 *
 * Tokens are reachable through `doc.tokens`, by numeric index (`doc[0]`) and
 * by iterating the Doc itself.
 */
export class Doc {
  /**
   * @param {Array} words - One text per token.
   * @param {Array} spaces - Per word, truthy when the token is followed by a space.
   * @param {Object} [attrs] - Optional raw data: `doc`, `tokens`, `ents`,
   *   `sents`, `noun_chunks`, `model` and `api`.
   */
  constructor(words, spaces, attrs = {}) {
    const { doc, tokens, ents, sents, noun_chunks: nounChunks, model, api } = attrs
    this._doc = doc || {}
    this._tokens = tokens || []
    this._ents = ents || []
    this._sents = sents || []
    this._chunks = nounChunks || []
    this._model = model
    this._api = api

    const rawTokens = this._tokens
    this.tokens = words.map((word, index) => new Token(this, word, spaces[index], rawTokens[index]))
    // Mirror every token onto a numeric property so `doc[n]` works.
    for (const [index, token] of this.tokens.entries()) {
      this[index] = token
    }

    const { cats, is_tagged: tagged, is_parsed: parsed, is_nered: nered, is_sentenced: sentenced } = this._doc
    this.cats = cats
    this.isTagged = tagged
    this.isParsed = parsed
    this.isNered = nered
    this.isSentenced = sentenced
  }

  /** @returns {string} Same value as `text`. */
  inspect() {
    return this.text
  }

  /** @returns {string} Every token's `textWithWs`, concatenated in order. */
  get text() {
    const pieces = Array.from(this.tokens, (token) => token.textWithWs)
    return pieces.reduce((joined, piece) => joined + piece, '')
  }

  /** @returns {number} Number of tokens. */
  get length() {
    const { length } = this.tokens
    return length
  }

  /** @returns {Span[]} A new labelled Span per raw entity range. */
  get ents() {
    return this._ents.map((ent) => new Span(this, ent.start, ent.end, ent.label))
  }

  /** @returns {Span[]} A new Span per raw sentence range. */
  get sents() {
    return this._sents.map((sent) => new Span(this, sent.start, sent.end))
  }

  /** @returns {Span[]} A new Span per raw noun-chunk range. */
  get nounChunks() {
    return this._chunks.map((chunk) => new Span(this, chunk.start, chunk.end))
  }

  /** Yields tokens in order, stopping at the first undefined slot. */
  *[Symbol.iterator]() {
    for (let position = 0; this.tokens[position] !== undefined; position += 1) {
      yield this.tokens[position]
    }
  }

  /** @returns {string} Same value as `text`. */
  toString() {
    return this.text
  }

  /**
   * Call `func` once per token (token is the only argument) and collect the results.
   * @param {Function} func
   * @returns {Array}
   */
  map(func) {
    return Array.from(this, (token) => func(token))
  }

  /**
   * @param {number} [start]
   * @param {number} [end]
   * @returns {Span} Span over this doc built from the given bounds.
   */
  slice(start, end) {
    const span = new Span(this, start, end)
    return span
  }

  /**
   * Pass this doc's api, model and text, plus `obj.text`, to `getSimilarity`.
   * @param {{text: string}} obj
   * @returns {Promise<*>} Whatever `getSimilarity` resolves with.
   */
  async similarity(obj) {
    const result = await getSimilarity(this._api, this._model, this.text, obj.text)
    return result
  }
}
/**
 * Turn a slice index that may be negative, fractional or missing into an
 * offset between 0 and `size`, using the same rules as Array.prototype.slice.
 * `fallback` is returned when `index` is undefined.
 */
function resolveSliceIndex(index, size, fallback) {
  if (index === undefined) {
    return fallback
  }
  const whole = Math.trunc(Number(index)) || 0 // NaN and -0 become 0
  return whole < 0 ? Math.max(size + whole, 0) : Math.min(whole, size)
}

/**
 * A contiguous run of tokens taken from a document.
 *
 * Tokens are reachable through `span.tokens`, by numeric index (`span[0]`)
 * and by iterating the Span itself.
 */
export class Span {
  /**
   * @param {Object} doc - Iterable document the tokens are taken from.
   * @param {number} [start] - First token offset in `doc` (Array slice rules).
   * @param {number} [end] - Offset one past the last token (Array slice rules).
   * @param {string} [label] - Optional label, e.g. for entity spans.
   */
  constructor(doc, start, end, label) {
    this.doc = doc
    this.start = start
    this.end = end
    this._label = label
    this.tokens = [...this.doc].slice(this.start, this.end)
    for (let i = 0; i < this.tokens.length; i++) {
      // Each index mirrors the token at the same position in `tokens`.
      this[i] = this.tokens[i]
    }
  }

  /** @returns {string} The tokens' `textWithWs` joined in order, then trimmed. */
  get text() {
    let text = ''
    for (const token of this.tokens) {
      text += token.textWithWs
    }
    return text.trim()
  }

  /** @returns {number} Number of tokens in the span. */
  get length() {
    return this.tokens.length
  }

  /**
   * The label given at construction, or else the label of a document entity
   * covering exactly the same `start`/`end` range.
   * @returns {string|undefined} undefined when no label can be found.
   */
  get label() {
    if (this._label) {
      return this._label
    }
    // Manually check if span is an entity
    const entities = this.doc.ents || []
    const match = entities.find((ent) => ent.start === this.start && ent.end === this.end)
    // Read the stored label directly: going through the matching entity's own
    // `label` getter would search the entities again and never terminate when
    // that entity has an empty label.
    return match ? match._label : undefined
  }

  /** Yields tokens in order, stopping at the first undefined slot. */
  *[Symbol.iterator]() {
    let i = 0
    while (this.tokens[i] !== undefined) {
      yield this.tokens[i]
      ++i
    }
  }

  /**
   * Take a sub-span. `start` and `end` are relative to this span and follow
   * Array slice rules (negative values count from the end, a missing `end`
   * means "to the end of the span").
   *
   * The result is attached to this span's document, with offsets translated
   * to document positions, so its `label` and `similarity` keep working.
   *
   * @param {number} [start]
   * @param {number} [end]
   * @returns {Span}
   */
  slice(start, end) {
    const size = this.tokens.length
    // `this.start` may itself be negative or undefined; normalise it first.
    const base = resolveSliceIndex(this.start, this.doc.length, 0)
    const from = resolveSliceIndex(start, size, 0)
    const to = Math.max(resolveSliceIndex(end, size, size), from)
    return new Span(this.doc, base + from, base + to)
  }

  /** @returns {string} Same value as `text`. */
  toString() {
    return this.text
  }

  /** @returns {string} Same value as `text`. */
  inspect() {
    return this.text
  }

  /**
   * Pass the document's api and model, this span's text and `obj.text` to
   * `getSimilarity`.
   * @param {{text: string}} obj
   * @returns {Promise<*>} Whatever `getSimilarity` resolves with.
   */
  async similarity(obj) {
    return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text)
  }
}
// Pairs of [Token property, key in the raw attrs object], in assignment order.
const TOKEN_ATTR_MAP = [
  ['orth', 'orth'],
  ['i', 'i'],
  ['entType', 'ent_type'],
  ['entIob', 'ent_iob'],
  ['lemma', 'lemma'],
  ['norm', 'norm'],
  ['lower', 'lower'],
  ['shape', 'shape'],
  ['prefix', 'prefix'],
  ['suffix', 'suffix'],
  ['pos', 'pos'],
  ['tag', 'tag'],
  ['dep', 'dep'],
  ['isAlpha', 'is_alpha'],
  ['isAscii', 'is_ascii'],
  ['isDigit', 'is_digit'],
  ['isLower', 'is_lower'],
  ['isUpper', 'is_upper'],
  ['isTitle', 'is_title'],
  ['isPunct', 'is_punct'],
  ['isLeftPunct', 'is_left_punct'],
  ['isRightPunct', 'is_right_punct'],
  ['isSpace', 'is_space'],
  ['isBracket', 'is_bracket'],
  ['isCurrency', 'is_currency'],
  ['likeUrl', 'like_url'],
  ['likeNum', 'like_num'],
  ['likeEmail', 'like_email'],
  ['isOov', 'is_oov'],
  ['isStop', 'is_stop'],
  ['isSentStart', 'is_sent_start'],
  ['_head', 'head'],
]

/**
 * A single token of a document, carrying its text, trailing whitespace and
 * the attributes copied from the raw attrs object.
 */
export class Token {
  /**
   * @param {Object} doc - Document this token belongs to.
   * @param {string} word - Token text.
   * @param {*} space - Truthy when the token is followed by a single space.
   * @param {Object} [attrs] - Raw snake_case attributes; missing keys yield
   *   undefined properties.
   */
  constructor(doc, word, space, attrs = {}) {
    const trailing = space ? ' ' : ''
    this.doc = doc
    this.whitespace = trailing
    this.text = word
    this.textWithWs = word + trailing
    for (const [property, rawKey] of TOKEN_ATTR_MAP) {
      this[property] = attrs[rawKey]
    }
  }

  /** @returns {number} Length of the token text. */
  get length() {
    const { length } = this.text
    return length
  }

  /** @returns {*} The entry of the owning document stored under the raw `head` value. */
  get head() {
    const { doc, _head: headIndex } = this
    return doc[headIndex]
  }

  /** @returns {string} The token text. */
  toString() {
    return this.text
  }

  /** @returns {string} The token text. */
  inspect() {
    return this.text
  }

  /**
   * Pass the document's api and model, this token's text and `obj.text` to
   * `getSimilarity`.
   * @param {{text: string}} obj
   * @returns {Promise<*>} Whatever `getSimilarity` resolves with.
   */
  async similarity(obj) {
    const { _api: api, _model: model } = this.doc
    const result = await getSimilarity(api, model, this.text, obj.text)
    return result
  }
}