Merge branch 'main' into 0.3.0

CalebCourier · CalebCourier · commit 5bce4deb88b1 · 2023-12-05T10:33:08.000-08:00
diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
@@ -33,18 +33,18 @@ jobs:
         uses: actions/cache@v3
         with:
             path: ~/.cache/pypoetry
-            key: poetry-cache-${{ runner.os }}-${{ steps.setup_python.outputs.python-version }}-${{ env.POETRY_VERSION }
+            key: poetry-cache-${{ runner.os }}-${{ steps.setup_python.outputs.python-version }}-${{ env.POETRY_VERSION }}
       - name: Install Poetry
         uses: snok/install-poetry@v1
       - name: Install dependencies
         run: poetry install --with docs
       - name: Build
-        run: mkdocs build
+        run: poetry run mkdocs build
       - name: Upload artifact
         uses: actions/upload-pages-artifact@v2
         with:
           # Upload build folder
           path: 'site'
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v2
+        uses: actions/deploy-pages@v2
diff --git a/.github/workflows/scripts/run_notebooks.sh b/.github/workflows/scripts/run_notebooks.sh
@@ -9,7 +9,7 @@ cd docs/examples
 # Function to process a notebook
 process_notebook() {
     notebook="$1"
-    invalid_notebooks=("valid_chess_moves.ipynb" "translation_with_quality_check.ipynb" "llamaindex-output-parsing.ipynb")
+    invalid_notebooks=("valid_chess_moves.ipynb" "translation_with_quality_check.ipynb" "llamaindex-output-parsing.ipynb" "competitors_check.ipynb")
     if [[ ! " ${invalid_notebooks[@]} " =~ " ${notebook} " ]]; then
         echo "Processing $notebook..."
         poetry run jupyter nbconvert --to notebook --execute "$notebook"
diff --git a/docs/api_reference/validators.md b/docs/api_reference/validators.md
@@ -11,7 +11,6 @@
             - "!validate"
             - "!register_validator"
             - "!PydanticReAsk"
-            - "!Filter"
             - "!Refrain"
             - "!ValidationResult"
             - "!PassResult"
diff --git a/docs/examples/competitors_check.ipynb b/docs/examples/competitors_check.ipynb
diff --git a/guardrails/utils/openai_utils/v0.py b/guardrails/utils/openai_utils/v0.py
@@ -84,8 +84,8 @@ def construct_nonchat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
@@ -152,8 +152,8 @@ def construct_chat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
@@ -296,8 +296,8 @@ async def construct_chat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
diff --git a/guardrails/utils/openai_utils/v1.py b/guardrails/utils/openai_utils/v1.py
@@ -76,8 +76,8 @@ def construct_nonchat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
@@ -140,8 +140,8 @@ def construct_chat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
@@ -298,8 +298,8 @@ async def construct_chat_response(
     ) -> LLMResponse:
         """Construct an LLMResponse from an OpenAI response.
 
-        Splits execution based on whether the `stream` parameter
-        is set in the kwargs.
+        Splits execution based on whether the `stream` parameter is set
+        in the kwargs.
         """
         if stream:
             # If stream is defined and set to True,
diff --git a/guardrails/validators/__init__.py b/guardrails/validators/__init__.py
@@ -13,6 +13,7 @@
 )
 from guardrails.validators.bug_free_python import BugFreePython
 from guardrails.validators.bug_free_sql import BugFreeSQL
+from guardrails.validators.competitor_check import CompetitorCheck
 from guardrails.validators.detect_secrets import DetectSecrets, detect_secrets
 from guardrails.validators.endpoint_is_reachable import EndpointIsReachable
 from guardrails.validators.ends_with import EndsWith
@@ -75,6 +76,7 @@
     "PIIFilter",
     "SimilarToList",
     "DetectSecrets",
+    "CompetitorCheck",
     # Validator helpers
     "detect_secrets",
     "AnalyzerEngine",
diff --git a/guardrails/validators/competitor_check.py b/guardrails/validators/competitor_check.py
@@ -0,0 +1,174 @@
+import re
+from typing import Callable, Dict, List, Optional
+
+from guardrails.logger import logger
+from guardrails.validator_base import (
+    FailResult,
+    PassResult,
+    ValidationResult,
+    Validator,
+    register_validator,
+)
+
+
+# nltk is optional at import time; ``CompetitorCheck.validate`` raises an
+# ImportError when it is missing.
+try:
+    import nltk  # type: ignore
+except ImportError:
+    nltk = None  # type: ignore
+
+if nltk is not None:
+    # Make sure the "punkt" sentence tokenizer data is available, downloading
+    # it when the local lookup fails.
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+
+# spacy is optional at import time; ``CompetitorCheck.__init__`` raises an
+# ImportError when it is missing.
+try:
+    import spacy
+except ImportError:
+    spacy = None  # type: ignore
+
+
+@register_validator(name="competitor-check", data_type="string")
+class CompetitorCheck(Validator):
+    """Validates that LLM-generated text is not naming any competitors from a
+    given list.
+
+    In order to use this validator you need to provide an extensive list of the
+    competitors you want to avoid naming including all common variations.
+
+    A sentence is flagged only when a competitor name is found in it by a
+    case-insensitive whole-name search AND that name also occurs inside one of
+    the named entities the spaCy pipeline reports for the sentence.
+
+    Args:
+        competitors (List[str]): List of competitors you want to avoid naming
+        on_fail (Optional[Callable]): Passed through to ``Validator.__init__``.
+
+    Raises:
+        ImportError: If spacy is not installed.
+    """
+
+    def __init__(
+        self,
+        competitors: List[str],
+        on_fail: Optional[Callable] = None,
+    ):
+        super().__init__(competitors=competitors, on_fail=on_fail)
+        self._competitors = competitors
+        model = "en_core_web_trf"
+        if spacy is None:
+            raise ImportError(
+                "You must install spacy in order to use the CompetitorCheck validator."
+            )
+
+        # Download the spaCy model on first use when it is not installed.
+        if not spacy.util.is_package(model):
+            logger.info(
+                f"Spacy model {model} not installed. "
+                "Download should start now and take a few minutes."
+            )
+            spacy.cli.download(model)  # type: ignore
+
+        self.nlp = spacy.load(model)
+
+    @staticmethod
+    def _find_names(names: List[str], text: str) -> List[str]:
+        """Returns the entries of ``names`` that occur in ``text`` as whole
+        names, ignoring case.
+
+        Args:
+            names (list): Candidate names, matched literally.
+            text (str): The text to search.
+
+        Returns:
+            list: The matching names, in the order they appear in ``names``.
+        """
+
+        found = []
+        for name in names:
+            if not name:
+                # An empty name would match at any position; ignore it.
+                continue
+            # Lookarounds instead of ``\b``: ``\b`` needs a word character on
+            # one side, so names that start or end with punctuation
+            # ("Yahoo!", "Apple Inc.") could never match before a space.
+            pattern = rf"(?<!\w){re.escape(name)}(?!\w)"
+            if re.search(pattern, text, flags=re.IGNORECASE):
+                found.append(name)
+        return found
+
+    def exact_match(self, text: str, competitors: List[str]) -> List[str]:
+        """Performs exact match to find competitors from a list in a given
+        text.
+
+        The match is case-insensitive and only accepts whole names, i.e. the
+        name must not be directly preceded or followed by a word character.
+
+        Args:
+            text (str): The text to search for competitors.
+            competitors (list): A list of competitor entities to match.
+
+        Returns:
+            list: A list of matched entities.
+        """
+
+        return self._find_names(competitors, text)
+
+    def perform_ner(self, text: str, nlp) -> List[str]:
+        """Performs named entity recognition on text using a provided NLP
+        model.
+
+        Args:
+            text (str): The text to perform named entity recognition on.
+            nlp: The NLP model to use for entity recognition.
+
+        Returns:
+            entities: A list of entities found.
+        """
+
+        return [ent.text for ent in nlp(text).ents]
+
+    def is_entity_in_list(self, entities: List[str], competitors: List[str]) -> List:
+        """Checks if any entity from a list is present in a given list of
+        competitors.
+
+        Args:
+            entities (list): A list of entities to check
+            competitors (list): A list of competitor names to match
+
+        Returns:
+            List: List of found competitors. A competitor is listed once for
+                every entity that contains it.
+        """
+
+        found_competitors = []
+        for entity in entities:
+            found_competitors.extend(self._find_names(competitors, entity))
+        return found_competitors
+
+    def validate(
+        self, value: str, metadata: Optional[Dict] = None
+    ) -> ValidationResult:
+        """Checks a text to find competitors' names in it.
+
+        While running, store sentences naming competitors and generate a fixed output
+        filtering out all flagged sentences.
+
+        Args:
+            value (str): The value to be validated.
+            metadata (Dict, optional): Additional metadata. Not read by this
+                validator. Defaults to None.
+
+        Returns:
+            ValidationResult: ``PassResult`` when no sentence is flagged,
+                otherwise a ``FailResult`` whose ``fix_value`` is the remaining
+                sentences joined with single spaces.
+
+        Raises:
+            ImportError: If nltk is not installed.
+        """
+
+        if nltk is None:
+            raise ImportError(
+                "`nltk` library is required for `competitor-check` validator. "
+                "Please install it with `poetry add nltk`."
+            )
+        sentences = nltk.sent_tokenize(value)
+        flagged_sentences = []
+        filtered_sentences = []
+        list_of_competitors_found = []
+
+        for sentence in sentences:
+            # Cheap textual search first; the NER model only runs on sentences
+            # that mention at least one competitor name.
+            entities = self.exact_match(sentence, self._competitors)
+            found_competitors = []
+            if entities:
+                ner_entities = self.perform_ner(sentence, self.nlp)
+                found_competitors = self.is_entity_in_list(ner_entities, entities)
+
+            if found_competitors:
+                flagged_sentences.append((found_competitors, sentence))
+                # One entry (a list of names) per flagged sentence.
+                list_of_competitors_found.append(found_competitors)
+                logger.debug(f"Found: {found_competitors} named in '{sentence}'")
+            else:
+                filtered_sentences.append(sentence)
+
+        if not flagged_sentences:
+            return PassResult()
+
+        return FailResult(
+            error_message=(
+                f"Found the following competitors: {list_of_competitors_found}. "
+                "Please avoid naming those competitors next time"
+            ),
+            fix_value=" ".join(filtered_sentences),
+        )
diff --git a/guardrails/validators/extractive_summary.py b/guardrails/validators/extractive_summary.py
@@ -73,7 +73,7 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:
         except ImportError:
             raise ImportError(
                 "`thefuzz` library is required for `extractive-summary` validator. "
-                "Please install it with `pip install thefuzz`."
+                "Please install it with `poetry add thefuzz`."
             )
 
         # Split the value into sentences.
diff --git a/guardrails/validators/is_high_quality_translation.py b/guardrails/validators/is_high_quality_translation.py
@@ -45,7 +45,7 @@ def __init__(self, *args, **kwargs):
         except ImportError:
             raise ImportError(
                 "`is-high-quality-translation` validator requires the `inspiredco`"
-                "package. Please install it with `pip install inspiredco`."
+                "package. Please install it with `poetry add inspiredco`."
             )
 
     def validate(self, value: Any, metadata: Dict) -> ValidationResult:
diff --git a/guardrails/validators/is_profanity_free.py b/guardrails/validators/is_profanity_free.py
@@ -31,7 +31,7 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:
         except ImportError:
             raise ImportError(
                 "`is-profanity-free` validator requires the `alt-profanity-check`"
-                "package. Please install it with `pip install profanity-check`."
+                "package. Please install it with `poetry add profanity-check`."
             )
 
         prediction = predict([value])
diff --git a/guardrails/validators/provenance.py b/guardrails/validators/provenance.py
@@ -182,7 +182,7 @@ def validate_each_sentence(
         if nltk is None:
             raise ImportError(
                 "`nltk` library is required for `provenance-v0` validator. "
-                "Please install it with `pip install nltk`."
+                "Please install it with `poetry add nltk`."
             )
         # Split the value into sentences using nltk sentence tokenizer.
         sentences = nltk.sent_tokenize(value)
@@ -542,7 +542,7 @@ def validate_each_sentence(
         if nltk is None:
             raise ImportError(
                 "`nltk` library is required for `provenance-v0` validator. "
-                "Please install it with `pip install nltk`."
+                "Please install it with `poetry add nltk`."
             )
         # Split the value into sentences using nltk sentence tokenizer.
         sentences = nltk.sent_tokenize(value)
diff --git a/guardrails/validators/remove_redundant_sentences.py b/guardrails/validators/remove_redundant_sentences.py
@@ -45,7 +45,7 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:
         except ImportError:
             raise ImportError(
                 "`thefuzz` library is required for `remove-redundant-sentences` "
-                "validator. Please install it with `pip install thefuzz`."
+                "validator. Please install it with `poetry add thefuzz`."
             )
 
         # Split the value into sentences.
diff --git a/guardrails/validators/similar_to_document.py b/guardrails/validators/similar_to_document.py
@@ -53,7 +53,7 @@ def __init__(
         if not _HAS_NUMPY:
             raise ImportError(
                 f"The {self.__class__.__name__} validator requires the numpy package.\n"
-                "`pip install numpy` to install it."
+                "`poetry add numpy` to install it."
             )
 
         self.client = OpenAIClient()
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -60,6 +60,7 @@ nav:
     - 'Detect and limit hallucinations in generated text': examples/provenance.ipynb
     - 'Check whether a value is similar to a set of other values': examples/value_within_distribution.ipynb
     - 'Using GuardrailsOutputParser in LlamaIndex': examples/llamaindex-output-parsing.ipynb
+    - 'Check if a competitor is named': examples/competitors_check.ipynb
   - 'Integrations':
     - 'Azure OpenAI': integrations/azure_openai.ipynb
     - 'OpenAI Functions': integrations/openai_functions.ipynb
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ manifest-ml = {version = "^0.1.8", optional = true}
 inspiredco = {version = "^0.0.2", optional = true}
 presidio_analyzer = {version = "^2.2.33", optional = true}
 presidio_anonymizer = {version = "^2.2.33", optional = true}
+spacy = {version = "^3.7.2", optional = true}
 
 [tool.poetry.extras]
 sql = ["sqlvalidator", "sqlalchemy", "sqlglot"]
@@ -52,6 +53,7 @@ detect-secrets = ["detect-secrets"]
 manifest = ["manifest-ml"]
 critique = ["inspiredco"]
 pii = ["presidio_analyzer", "presidio_anonymizer"]
+competitor-check = ["spacy"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.3"
diff --git a/tests/unit_tests/validators/test_competitor_check.py b/tests/unit_tests/validators/test_competitor_check.py
diff --git a/tests/unit_tests/validators/test_regex_match.py b/tests/unit_tests/validators/test_regex_match.py

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ def validate(self, value: Any, metadata: Dict) -> ValidationResult:`
`73`	`73`	`except ImportError:`
`74`	`74`	`raise ImportError(`
`75`	`75`	"`thefuzz` library is required for `extractive-summary` validator. "
`76`		- "Please install it with `pip install thefuzz`."
	`76`	+ "Please install it with `poetry add thefuzz`."
`77`	`77`	`)`
`78`	`78`
`79`	`79`	`# Split the value into sentences.`
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def __init__(self, *args, **kwargs):`
`45`	`45`	`except ImportError:`
`46`	`46`	`raise ImportError(`
`47`	`47`	"`is-high-quality-translation` validator requires the `inspiredco`"
`48`		- "package. Please install it with `pip install inspiredco`."
	`48`	+ "package. Please install it with `poetry add inspiredco`."
`49`	`49`	`)`
`50`	`50`
`51`	`51`	`def validate(self, value: Any, metadata: Dict) -> ValidationResult:`