
Commit 004d867

operations: nlp: Add sklearn NLP operations
1 parent dabfb17 commit 004d867

5 files changed: +511 -3 lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
 - Tutorial for using NLP operations with models
-- Operations plugin for NLP
+- Operations plugin for NLP wrapping spacy and scikit functions
 - Support for default value in a Definition
 - Transformers Question Answering model
 - Source for reading images in directories

operations/nlp/dffml_operations_nlp/operations.py

Lines changed: 285 additions & 0 deletions
@@ -3,11 +3,25 @@
 import spacy
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
+from sklearn.feature_extraction.text import (
+    CountVectorizer,
+    TfidfVectorizer,
+)
 
 from dffml.df.base import op
 from dffml.df.types import Definition
 
 
+def _load_model(spacy_model: str):
+    try:
+        nlp = spacy.load(spacy_model)
+    except OSError:
+        raise Exception(
+            f"Can't find model {spacy_model}. Try running `python -m spacy download {spacy_model}`"
+        )
+    return nlp
+
+
 @op
 async def remove_stopwords(
     text: str, custom_stop_words: List[str] = None
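The `_load_model` helper above centralizes spaCy model loading for every operation added in this commit. A minimal sketch of calling it directly, assuming the `en_core_web_sm` model (an illustrative choice, not mandated by the commit) has already been downloaded:

    # Assumes: python -m spacy download en_core_web_sm
    from dffml_operations_nlp.operations import _load_model

    nlp = _load_model("en_core_web_sm")  # loaded spaCy Language pipeline
    doc = nlp("DFFML wraps spaCy and scikit-learn text processing.")
    print([token.text for token in doc])
    # An unknown model name raises the Exception defined above, with a
    # hint to run `python -m spacy download <model>`.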
@@ -51,6 +65,277 @@ async def remove_stopwords(
     return " ".join(clean_tokens)
 
 
+@op
+async def pos_tagger(
+    text: str, spacy_model: str, tag_type: str = "fine_grained"
+) -> List[str]:
+    """
+    Assigns part-of-speech tags to text.
+
+    Parameters
+    ----------
+    text : str
+        Text to be tagged.
+
+    spacy_model: str
+        A spacy model with tagger and parser.
+
+    Returns
+    -------
+    result: list
+        A list containing tuples of word and their respective pos tag.
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    pos_tags = []
+    if tag_type == "fine_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.tag_))
+    elif tag_type == "coarse_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.pos_))
+    return pos_tags
+
+
+@op
+async def lemmatizer(text: str, spacy_model: str) -> List[str]:
+    """
+    Reduce words in the text to their dictionary form (lemma)
+
+    Parameters
+    ----------
+    text : str
+        String to lemmatize.
+
+    spacy_model: str
+        Spacy model to be used for lemmatization.
+
+    Returns
+    -------
+    result: list
+        A list containing base form of the words.
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    lemma = []
+    for word in doc:
+        lemma.append(word.lemma_)
+    return lemma
+
+
+@op
+async def get_similarity(text_1: str, text_2: str, spacy_model: str) -> float:
+    """
+    Calculates similarity between two text strings as a score between 0 and 1.
+
+    Parameters
+    ----------
+    text_1 : str
+        First string to compare.
+
+    text_2 : str
+        Second string to compare.
+
+    spacy_model: str
+        Spacy model to be used for extracting word vectors which are used for calculating similarity.
+
+    Returns
+    -------
+    result: float
+        A similarity score between 0 and 1.
+    """
+    nlp = _load_model(spacy_model)
+    text_1_doc = nlp(text_1)
+    text_2_doc = nlp(text_2)
+    return text_1_doc.similarity(text_2_doc)
+
+
+@op
+async def get_noun_chunks(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the noun chunks from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract noun chunks from.
+
+    spacy_model: str
+        A spacy model with the capability of parsing.
+
+    Returns
+    -------
+    result: list
+        A list containing noun chunks.
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    noun_chunks = list(text_doc.noun_chunks)
+    return noun_chunks
+
+
+@op
+async def get_sentences(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the sentences from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract sentences from.
+
+    spacy_model: str
+        A spacy model with the capability of parsing. Sentence
+        boundaries are calculated from the syntactic dependency parse.
+
+    Returns
+    -------
+    result: list
+        A list containing sentences.
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    sentences = list(text_doc.sents)
+    return sentences
+
+
+@op
+async def count_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    analyzer: str = "word",
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    get_feature_names: bool = False,
+) -> List[int]:
+    """
+    Converts a collection of text documents to a matrix of token counts using sklearn CountVectorizer's `fit_transform` method.
+    For details on parameters check https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True` return feature names using get_feature_names method of CountVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing token counts and feature names if `get_feature_names` is `True`.
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = CountVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+    )
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
+@op
+async def tfidf_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    analyzer: str = "word",
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    norm: str = "l2",
+    use_idf: bool = True,
+    smooth_idf: bool = True,
+    sublinear_tf: bool = False,
+    get_feature_names: bool = False,
+) -> List[float]:
+    """
+    Convert a collection of raw documents to a matrix of TF-IDF features using sklearn TfidfVectorizer's `fit_transform` method.
+    For details on parameters check https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True` return feature names using get_feature_names method of TfidfVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing TF-IDF features and feature names if `get_feature_names` is `True`.
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = TfidfVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        analyzer=analyzer,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+        norm=norm,
+        use_idf=use_idf,
+        smooth_idf=smooth_idf,
+        sublinear_tf=sublinear_tf,
+    )
+
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
 # Definitions
 text_def = Definition(name="text_def", primitive="str")
 max_len_def = Definition(name="max_len_def", primitive="int")
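Because each of the new operations is an async function registered with `@op`, they can also be awaited directly outside of a dataflow. A quick usage sketch; the sample sentences and the `en_core_web_sm` model name are illustrative assumptions, not part of the commit:

    import asyncio

    from dffml_operations_nlp.operations import pos_tagger, count_vectorizer

    async def main():
        # Fine-grained part-of-speech tags via spaCy
        tags = await pos_tagger(
            text="The quick brown fox jumps over the lazy dog",
            spacy_model="en_core_web_sm",
        )
        print(tags)

        # Token counts via sklearn's CountVectorizer
        counts, names = await count_vectorizer(
            text=["the cat sat", "the cat ran away"],
            get_feature_names=True,
        )
        print(names)
        print(counts)

    asyncio.run(main())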

operations/nlp/setup.py

Lines changed: 7 additions & 0 deletions
@@ -13,6 +13,13 @@
     "dffml.operation": [
         f"remove_stopwords = {common.IMPORT_NAME}.operations:remove_stopwords",
         f"get_embedding = {common.IMPORT_NAME}.operations:get_embedding",
+        f"pos_tagger = {common.IMPORT_NAME}.operations:pos_tagger",
+        f"lemmatizer = {common.IMPORT_NAME}.operations:lemmatizer",
+        f"get_similarity = {common.IMPORT_NAME}.operations:get_similarity",
+        f"get_noun_chunks = {common.IMPORT_NAME}.operations:get_noun_chunks",
+        f"get_sentences = {common.IMPORT_NAME}.operations:get_sentences",
+        f"count_vectorizer = {common.IMPORT_NAME}.operations:count_vectorizer",
+        f"tfidf_vectorizer = {common.IMPORT_NAME}.operations:tfidf_vectorizer",
     ]
 }
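The entries above register every new operation under the `dffml.operation` entry point group, which is how DFFML discovers plugin operations at runtime. A small check that the registrations are visible once the plugin is pip-installed (a sketch using `pkg_resources`; the module name follows the plugin layout shown in this commit):

    # List the NLP operations this plugin registers under "dffml.operation"
    import pkg_resources

    for entry_point in pkg_resources.iter_entry_points("dffml.operation"):
        if "dffml_operations_nlp" in entry_point.module_name:
            print(entry_point.name, "->", entry_point.module_name)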

operations/nlp/setup_common.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 AUTHOR_NAME = "0dust"
 AUTHOR_EMAIL = "[email protected]"
 # Install dffml if it is not installed in development mode
-INSTALL_REQUIRES = ["spacy>=2.3.0"] + (
+INSTALL_REQUIRES = ["spacy>=2.3.0", "scikit-learn>=0.21.2"] + (
     ["dffml>=0.3.7"]
     if not any(
         list(
