import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
+from sklearn.feature_extraction.text import (
+    CountVectorizer,
+    TfidfVectorizer,
+)

from dffml.df.base import op
from dffml.df.types import Definition


+def _load_model(spacy_model: str):
+    try:
+        nlp = spacy.load(spacy_model)
+    except OSError:
+        raise Exception(
+            f"Can't find model {spacy_model}. Try running `python -m spacy download {spacy_model}`"
+        )
+    return nlp
+
+
@op
async def remove_stopwords(
    text: str, custom_stop_words: List[str] = None
@@ -51,6 +65,277 @@ async def remove_stopwords( |
    return " ".join(clean_tokens)


+@op
+async def pos_tagger(
+    text: str, spacy_model: str, tag_type: str = "fine_grained"
+) -> List[str]:
+    """
+    Assigns part-of-speech tags to text.
+
+    Parameters
+    ----------
+    text : str
+        Text to be tagged.
+
+    spacy_model: str
+        A spacy model with tagger and parser.
+
+    tag_type: str
+        Either "fine_grained" (uses `token.tag_`) or "coarse_grained"
+        (uses `token.pos_`). Defaults to "fine_grained".
+
+    Returns
+    -------
+    result: list
+        A list containing tuples of words and their respective POS tags.
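+
+    Examples
+    --------
+    A minimal sketch of the spaCy calls this operation wraps; it assumes the
+    `en_core_web_sm` model has already been downloaded.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # Fine grained tags come from `token.tag_`, coarse grained from `token.pos_`
+    >>> fine = [(token.text, token.tag_) for token in nlp("She sells seashells")]
+    >>> coarse = [(token.text, token.pos_) for token in nlp("She sells seashells")]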
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    pos_tags = []
+    if tag_type == "fine_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.tag_))
+    elif tag_type == "coarse_grained":
+        for token in doc:
+            pos_tags.append((token.text, token.pos_))
+    return pos_tags
+
+
+@op
+async def lemmatizer(text: str, spacy_model: str) -> List[str]:
+    """
+    Reduces words in the text to their dictionary form (lemma).
+
+    Parameters
+    ----------
+    text : str
+        String to lemmatize.
+
+    spacy_model: str
+        Spacy model to be used for lemmatization.
+
+    Returns
+    -------
+    result: list
+        A list containing the base form (lemma) of each word.
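+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `token.lemma_` holds the dictionary form of each token
+    >>> lemmas = [token.lemma_ for token in nlp("The striped bats were hanging")]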
+    """
+    nlp = _load_model(spacy_model)
+    doc = nlp(text)
+    lemma = []
+    for word in doc:
+        lemma.append(word.lemma_)
+    return lemma
+
+
+@op
+async def get_similarity(text_1: str, text_2: str, spacy_model: str) -> float:
+    """
+    Calculates the similarity between two text strings as a score between 0 and 1.
+
+    Parameters
+    ----------
+    text_1 : str
+        First string to compare.
+
+    text_2 : str
+        Second string to compare.
+
+    spacy_model: str
+        Spacy model used to extract the word vectors from which the similarity is calculated.
+
+    Returns
+    -------
+    result: float
+        A similarity score between 0 and 1.
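+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes a model that ships
+    with word vectors (for example `en_core_web_md`) is installed, since the
+    similarity is computed from those vectors.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_md")
+    >>> # `Doc.similarity` compares the averaged word vectors of the two documents
+    >>> score = nlp("I like cats").similarity(nlp("I like dogs"))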
+    """
+    nlp = _load_model(spacy_model)
+    text_1_doc = nlp(text_1)
+    text_2_doc = nlp(text_2)
+    return text_1_doc.similarity(text_2_doc)
+
+
+@op
+async def get_noun_chunks(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the noun chunks from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract noun chunks from.
+
+    spacy_model: str
+        A spacy model with parsing capability.
+
+    Returns
+    -------
+    result: list
+        A list containing noun chunks.
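+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model (which includes a parser) is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `Doc.noun_chunks` yields spans such as "The quick brown fox"
+    >>> chunks = list(nlp("The quick brown fox jumps over the lazy dog").noun_chunks)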
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    noun_chunks = list(text_doc.noun_chunks)
+    return noun_chunks
+
+
+@op
+async def get_sentences(text: str, spacy_model: str) -> List[str]:
+    """
+    Extracts the sentences from text.
+
+    Parameters
+    ----------
+    text : str
+        String to extract sentences from.
+
+    spacy_model: str
+        A spacy model with parsing capability. Sentence
+        boundaries are calculated from the syntactic dependency parse.
+
+    Returns
+    -------
+    result: list
+        A list containing sentences.
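+
+    Examples
+    --------
+    A minimal sketch of the underlying spaCy call; assumes the
+    `en_core_web_sm` model (which includes a parser) is installed.
+
+    >>> import spacy
+    >>> nlp = spacy.load("en_core_web_sm")
+    >>> # `Doc.sents` yields one span per sentence
+    >>> sentences = list(nlp("First sentence. Second sentence.").sents)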
+    """
+    nlp = _load_model(spacy_model)
+    text_doc = nlp(text)
+    sentences = list(text_doc.sents)
+    return sentences
+
+
+@op
+async def count_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    analyzer: str = "word",
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    get_feature_names: bool = False,
+) -> List[int]:
+    """
+    Converts a collection of text documents to a matrix of token counts using sklearn CountVectorizer's `fit_transform` method.
+    For details on the remaining parameters see
+    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True`, also return the feature names using the `get_feature_names` method of CountVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing the token count matrix and, if `get_feature_names` is `True`, the feature names.
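+
+    Examples
+    --------
+    A minimal sketch of the sklearn calls this operation wraps; the documents
+    below are only illustrative.
+
+    >>> from sklearn.feature_extraction.text import CountVectorizer
+    >>> vectorizer = CountVectorizer(ngram_range=(1, 1))
+    >>> # Rows are documents, columns are token counts per vocabulary entry
+    >>> counts = vectorizer.fit_transform(["dffml is fun", "dffml is flexible"]).toarray()
+    >>> names = vectorizer.get_feature_names()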
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = CountVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        analyzer=analyzer,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+    )
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
+@op
+async def tfidf_vectorizer(
+    text: List[str],
+    encoding: str = "utf-8",
+    decode_error: str = "strict",
+    strip_accents: str = None,
+    lowercase: bool = True,
+    # preprocessor=None,
+    # tokenizer=None,
+    analyzer: str = "word",
+    stop_words: str = None,
+    token_pattern: str = "(?u)\\b\\w\\w+\\b",
+    ngram_range: List[int] = None,
+    max_df: float = 1.0,
+    min_df: float = 1,
+    max_features: int = None,
+    vocabulary: dict = None,
+    binary: bool = False,
+    norm: str = "l2",
+    use_idf: bool = True,
+    smooth_idf: bool = True,
+    sublinear_tf: bool = False,
+    get_feature_names: bool = False,
+) -> List[float]:
+    """
+    Converts a collection of raw documents to a matrix of TF-IDF features using sklearn TfidfVectorizer's `fit_transform` method.
+    For details on the remaining parameters see
+    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
+    Parameters specific to this operation are described below.
+
+    Parameters
+    ----------
+    text : list
+        A list of strings.
+
+    get_feature_names: bool
+        If `True`, also return the feature names using the `get_feature_names` method of TfidfVectorizer.
+
+    Returns
+    -------
+    result: list
+        A list containing the TF-IDF feature matrix and, if `get_feature_names` is `True`, the feature names.
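+
+    Examples
+    --------
+    A minimal sketch of the sklearn calls this operation wraps; the documents
+    below are only illustrative.
+
+    >>> from sklearn.feature_extraction.text import TfidfVectorizer
+    >>> vectorizer = TfidfVectorizer(norm="l2", use_idf=True)
+    >>> # Rows are documents, columns are TF-IDF weights per vocabulary entry
+    >>> weights = vectorizer.fit_transform(["dffml is fun", "dffml is flexible"]).toarray()
+    >>> names = vectorizer.get_feature_names()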
+    """
+    if ngram_range is None:
+        ngram_range = (1, 1)
+    else:
+        ngram_range = tuple(ngram_range)
+    vectorizer = TfidfVectorizer(
+        encoding=encoding,
+        decode_error=decode_error,
+        strip_accents=strip_accents,
+        lowercase=lowercase,
+        analyzer=analyzer,
+        stop_words=stop_words,
+        token_pattern=token_pattern,
+        ngram_range=ngram_range,
+        max_df=max_df,
+        min_df=min_df,
+        max_features=max_features,
+        vocabulary=vocabulary,
+        binary=binary,
+        norm=norm,
+        use_idf=use_idf,
+        smooth_idf=smooth_idf,
+        sublinear_tf=sublinear_tf,
+    )
+
+    names = None
+    X = vectorizer.fit_transform(text).toarray()
+    if get_feature_names:
+        names = vectorizer.get_feature_names()
+    return [X, names]
+
+
# Definitions
text_def = Definition(name="text_def", primitive="str")
max_len_def = Definition(name="max_len_def", primitive="int")