import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
+from sklearn.feature_extraction.text import (
+    CountVectorizer,
+    TfidfVectorizer,
+)

from dffml.df.base import op
from dffml.df.types import Definition


+def _load_model(spacy_model: str):
+    try:
+        nlp = spacy.load(spacy_model)
+    except OSError:
+        raise Exception(
+            f"Can't find model {spacy_model}. Try running `python -m spacy download {spacy_model}`"
+        )
+    return nlp
+
+
@op
async def remove_stopwords(
    text: str, custom_stop_words: List[str] = None
@@ -51,6 +65,277 @@ async def remove_stopwords( |
    return " ".join(clean_tokens)


+@op
+async def pos_tagger(
+    text: str, spacy_model: str, tag_type: str = "fine_grained"
+) -> List[str]:
+    """
+    Assigns part-of-speech tags to text.
+
+    Parameters
+    ----------
+    text : str
+        Text to be tagged.
+
+    spacy_model: str
+        A spacy model with tagger and parser.
+
+    tag_type: str
+        Either "fine_grained" (uses `token.tag_`) or "coarse_grained"
+        (uses `token.pos_`). Defaults to "fine_grained".
+
+    Returns
+    -------
+    result: list
+        A list containing tuples of words and their respective POS tags.
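+
+    Examples
+    --------
+    A minimal sketch of the spaCy calls this operation wraps; it assumes the
+    `en_core_web_sm` model has already been downloaded.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # Fine grained tags come from `token.tag_`, coarse grained from `token.pos_`
+    >>> fine = [(token.text, token.tag_) for token in nlp("She sells seashells")]
+    >>> coarse = [(token.text, token.pos_) for token in nlp("She sells seashells")]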
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    pos_tags = []
+    if tag_type == "fine_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.tag_))
+    elif tag_type == "coarse_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.pos_))
+    return pos_tags
+
+
+@op
+async def lemmatizer(text: str, spacy_model: str) -> List[str]:
+    """
+    Reduces words in the text to their dictionary form (lemma).
+
+    Parameters
+    ----------
+    text : str
+        String to lemmatize.
+
+    spacy_model: str
+        Spacy model to be used for lemmatization.
+
+    Returns
+    -------
+    result: list
+        A list containing the base form (lemma) of each word.
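+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `token.lemma_` holds the dictionary form of each token
+    >>> lemmas = [token.lemma_ for token in nlp("The striped bats were hanging")]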
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    lemma = []
+    for word in doc:
+        lemma.append(word.lemma_)
+    return lemma
+
+
+@op
+async def get_similarity(text_1: str, text_2: str, spacy_model: str) -> float:
+    """
+    Calculates the similarity between two text strings as a score between 0 and 1.
+
+    Parameters
+    ----------
+    text_1 : str
+        First string to compare.
+
+    text_2 : str
+        Second string to compare.
+
+    spacy_model: str
+        Spacy model used to extract the word vectors from which the similarity is calculated.
+
+    Returns
+    -------
+    result: float
+        A similarity score between 0 and 1.
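+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes a model that ships
+    with word vectors (for example `en_core_web_md`) is installed, since the
+    similarity is computed from those vectors.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_md")
+    >>> # `Doc.similarity` compares the averaged word vectors of the two documents
+    >>> score = nlp("I like cats").similarity(nlp("I like dogs"))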
+    """
+    nlp = _load_model(spacy_model)
+    text_1_doc = nlp(text_1)
+    text_2_doc = nlp(text_2)
+    return text_1_doc.similarity(text_2_doc)
+
+
+@op
+async def get_noun_chunks(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the noun chunks from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract noun chunks from.
+
+    spacy_model: str
+        A spacy model with parsing capability.
+
+    Returns
+    -------
+    result: list
+        A list containing noun chunks.
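+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model (which includes a parser) is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `Doc.noun_chunks` yields spans such as "The quick brown fox"
+    >>> chunks = list(nlp("The quick brown fox jumps over the lazy dog").noun_chunks)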
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    noun_chunks = list(text_doc.noun_chunks)
+    return noun_chunks
+
+
+@op
+async def get_sentences(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the sentences from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract sentences from.
+
+    spacy_model: str
+        A spacy model with parsing capability. Sentence
+        boundaries are calculated from the syntactic dependency parse.
+
+    Returns
+    -------
+    result: list
+        A list containing sentences.
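+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model (which includes a parser) is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `Doc.sents` yields one span per sentence
+    >>> sentences = list(nlp("First sentence. Second sentence.").sents)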
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    sentences = list(text_doc.sents)
+    return sentences
+
+
+@op
+async def count_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    analyzer: str = "word",
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    get_feature_names: bool = False,
+) -> List[int]:
+    """
+    Converts a collection of text documents to a matrix of token counts using sklearn CountVectorizer's `fit_transform` method.
+    For details on the remaining parameters see
+    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True`, also return the feature names using the `get_feature_names` method of CountVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing the token count matrix and, if `get_feature_names` is `True`, the feature names.
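+
+    Examples
+    --------
+    A minimal sketch of the sklearn calls this operation wraps; the documents
+    below are only illustrative.
+
+    >>> from sklearn.feature_extraction.text import CountVectorizer
+    >>> vectorizer = CountVectorizer(ngram_range=(1, 1))
+    >>> # Rows are documents, columns are token counts per vocabulary entry
+    >>> counts = vectorizer.fit_transform(["dffml is fun", "dffml is flexible"]).toarray()
+    >>> names = vectorizer.get_feature_names()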
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = CountVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+    )
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
+@op
+async def tfidf_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    analyzer: str = "word",
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    norm: str = "l2",
+    use_idf: bool = True,
+    smooth_idf: bool = True,
+    sublinear_tf: bool = False,
+    get_feature_names: bool = False,
+) -> List[float]:
+    """
+    Converts a collection of raw documents to a matrix of TF-IDF features using sklearn TfidfVectorizer's `fit_transform` method.
+    For details on the remaining parameters see
+    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True`, also return the feature names using the `get_feature_names` method of TfidfVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing the TF-IDF feature matrix and, if `get_feature_names` is `True`, the feature names.
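+
+    Examples
+    --------
+    A minimal sketch of the sklearn calls this operation wraps; the documents
+    below are only illustrative.
+
+    >>> from sklearn.feature_extraction.text import TfidfVectorizer
+    >>> vectorizer = TfidfVectorizer(norm="l2", use_idf=True)
+    >>> # Rows are documents, columns are TF-IDF weights per vocabulary entry
+    >>> weights = vectorizer.fit_transform(["dffml is fun", "dffml is flexible"]).toarray()
+    >>> names = vectorizer.get_feature_names()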
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = TfidfVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        analyzer=analyzer,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+        norm=norm,
+        use_idf=use_idf,
+        smooth_idf=smooth_idf,
+        sublinear_tf=sublinear_tf,
+    )
+
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
# Definitions
text_def = Definition(name="text_def", primitive="str")
max_len_def = Definition(name="max_len_def", primitive="int")