Merge pull request #122 from 2Toad/jp-issue-120

JasonPierce · web-flow · commit 303974703b65 · 2024-10-22T22:34:55.000-05:00
Fixes #120: Words added to whitelist are still being censored
diff --git a/src/benchmark/benchmark.ts b/src/benchmark/benchmark.ts
@@ -110,7 +110,7 @@ suite
   .on("cycle", (event: Event) => {
     console.log(String(event.target));
   })
-  .on("complete", () => {
-    console.log(`Fastest: ${suite.filter("fastest").map("name")}`);
+  .on("complete", function () {
+    console.log(`Fastest: ${this.filter("fastest").map("name")[0]}`);
   })
   .run({ async: true });
diff --git a/src/profanity.ts b/src/profanity.ts
@@ -5,13 +5,10 @@ import { profaneWords } from "./data";
 
 export class Profanity {
   options: ProfanityOptions;
-
   whitelist: List;
 
   private blacklist: List;
-
   private removed: List;
-
   private regexes: Map<string, RegExp>;
 
   constructor(options?: ProfanityOptions | Partial<ProfanityOptions>) {
@@ -23,6 +20,13 @@ export class Profanity {
     this.regexes = new Map<string, RegExp>();
   }
 
+  /**
+   * Checks if the given text contains any profanity.
+   * @param text - The text to check for profanity.
+   * @param languages - Optional array of language codes to use for profanity detection.
+   *                    If not provided, uses the languages specified in the options.
+   * @returns True if profanity is found, false otherwise.
+   */
   exists(text: string, languages?: string[]): boolean {
     if (typeof text !== "string") {
       return false;
@@ -34,49 +38,26 @@ export class Profanity {
     const lowercaseText = text.toLowerCase();
 
     let match: RegExpExecArray | null;
-    do {
-      match = regex.exec(lowercaseText);
-      if (match !== null) {
-        const matchStart = match.index;
-        const matchEnd = matchStart + match[0].length;
-
-        // Check if the matched word is part of a whitelisted word
-        let isWhitelisted = false;
-        this.whitelist.words.forEach((whitelistedWord) => {
-          const whitelistedIndex = lowercaseText.indexOf(whitelistedWord, Math.max(0, matchStart - whitelistedWord.length + 1));
-          if (whitelistedIndex !== -1) {
-            const whitelistedEnd = whitelistedIndex + whitelistedWord.length;
-
-            if (this.options.wholeWord) {
-              // For whole word matching, ensure the whitelisted word exactly matches the profane word
-              // and is not part of a hyphenated or underscore-separated word
-              if (
-                matchStart === whitelistedIndex &&
-                matchEnd === whitelistedEnd &&
-                (matchStart === 0 || !/[\w-_]/.test(lowercaseText[matchStart - 1])) &&
-                // eslint-disable-next-line security/detect-object-injection
-                (matchEnd === lowercaseText.length || !/[\w-_]/.test(lowercaseText[matchEnd]))
-              ) {
-                isWhitelisted = true;
-              }
-            } else {
-              // For partial matching, check if the profane word is contained within the whitelisted word
-              if ((matchStart >= whitelistedIndex && matchStart < whitelistedEnd) || (matchEnd > whitelistedIndex && matchEnd <= whitelistedEnd)) {
-                isWhitelisted = true;
-              }
-            }
-          }
-        });
+    while ((match = regex.exec(lowercaseText)) !== null) {
+      const matchStart = match.index;
+      const matchEnd = matchStart + match[0].length;
 
-        if (!isWhitelisted) {
-          return true;
-        }
+      if (!this.isWhitelisted(matchStart, matchEnd, lowercaseText)) {
+        return true;
       }
-    } while (match !== null);
+    }
 
     return false;
   }
 
+  /**
+   * Censors profanity in the given text.
+   * @param text - The text to censor.
+   * @param censorType - The type of censoring to apply. Defaults to CensorType.Word.
+   * @param languages - Optional array of language codes to use for profanity detection.
+   *                    If not provided, uses the languages specified in the options.
+   * @returns The censored text.
+   */
   censor(text: string, censorType: CensorType = CensorType.Word, languages?: string[]): string {
     if (typeof text !== "string") {
       return text;
@@ -87,45 +68,37 @@ export class Profanity {
 
     const lowercaseText = text.toLowerCase();
 
-    switch (censorType) {
-      case CensorType.Word:
-        return text.replace(regex, (match) => {
-          const underscore = match.includes("_") ? "_" : "";
-          return this.options.grawlix + underscore;
-        });
-      case CensorType.FirstChar: {
-        return this.replaceProfanity(text, lowercaseText, (word) => this.options.grawlixChar + word.slice(1), regex);
-      }
-      case CensorType.FirstVowel:
-      case CensorType.AllVowels: {
-        const vowelRegex = new RegExp("[aeiou]", censorType === CensorType.FirstVowel ? "i" : "ig");
-        return this.replaceProfanity(text, lowercaseText, (word) => word.replace(vowelRegex, this.options.grawlixChar), regex);
-      }
-      default:
-        throw new Error(`Invalid replacement type: "${censorType}"`);
-    }
-  }
-
-  private replaceProfanity(text: string, lowercaseText: string, replacer: (word: string) => string, regex: RegExp): string {
-    let result = text;
-    let offset = 0;
-
-    let match: RegExpExecArray | null;
-    do {
-      match = regex.exec(lowercaseText);
-      if (match !== null) {
-        const matchStart = match.index;
-        const matchEnd = matchStart + match[0].length;
-        const originalWord = text.slice(matchStart + offset, matchEnd + offset);
-        const censoredWord = replacer(originalWord);
-        result = result.slice(0, matchStart + offset) + censoredWord + result.slice(matchEnd + offset);
-        offset += censoredWord.length - originalWord.length;
-      }
-    } while (match !== null);
-
-    return result;
+    return this.replaceProfanity(
+      text,
+      lowercaseText,
+      (word, start, end) => {
+        if (this.isWhitelisted(start, end, lowercaseText)) {
+          return word;
+        }
+        switch (censorType) {
+          case CensorType.Word: {
+            const underscore = word.includes("_") ? "_" : "";
+            return this.options.grawlix + underscore;
+          }
+          case CensorType.FirstChar:
+            return this.options.grawlixChar + word.slice(1);
+          case CensorType.FirstVowel:
+          case CensorType.AllVowels: {
+            const vowelRegex = new RegExp("[aeiou]", censorType === CensorType.FirstVowel ? "i" : "ig");
+            return word.replace(vowelRegex, this.options.grawlixChar);
+          }
+          default:
+            throw new Error(`Invalid replacement type: "${censorType}"`);
+        }
+      },
+      regex,
+    );
   }
 
+  /**
+   * Adds words to the profanity blacklist.
+   * @param words - An array of words to add to the blacklist.
+   */
   addWords(words: string[]): void {
     const removedWords: string[] = [];
     const blacklistWords: string[] = [];
@@ -147,6 +120,10 @@ export class Profanity {
     }
   }
 
+  /**
+   * Removes words from the profanity blacklist.
+   * @param words - An array of words to remove from the blacklist.
+   */
   removeWords(words: string[]): void {
     const blacklistedWords: string[] = [];
     const removeWords: string[] = [];
@@ -168,6 +145,72 @@ export class Profanity {
     }
   }
 
+  /**
+   * Checks whether the match at [matchStart, matchEnd) is covered by a whitelisted word found in `text`.
+   * @param matchStart - Index of the first character of the match in `text`.
+   * @param matchEnd - Index just past the last character of the match (exclusive).
+   * @param text - The lowercase text being checked; whitelist words are located in it with `indexOf`.
+   * @returns True on an exact, delimiter-bounded hit (`wholeWord` on) or on any overlap (`wholeWord` off); false otherwise.
+   */
+  private isWhitelisted(matchStart: number, matchEnd: number, text: string): boolean {
+    for (const whitelistedWord of this.whitelist.words) {
+      const whitelistedIndex = text.indexOf(whitelistedWord, Math.max(0, matchStart - whitelistedWord.length + 1)); // earliest start that can still cover matchStart
+      if (whitelistedIndex !== -1) { // only the first occurrence from that position is examined
+        const whitelistedEnd = whitelistedIndex + whitelistedWord.length;
+
+        if (this.options.wholeWord) {
+          if (
+            matchStart === whitelistedIndex &&
+            matchEnd === whitelistedEnd &&
+            (matchStart === 0 || !/[\w-_]/.test(text[matchStart - 1])) && // no word character, "-" or "_" directly before
+            (matchEnd === text.length || !/[\w-_]/.test(text[matchEnd])) // ...and none directly after
+          ) {
+            return true;
+          }
+        } else {
+          if (
+            (matchStart >= whitelistedIndex && matchStart < whitelistedEnd) || // match starts inside the whitelisted occurrence
+            (matchEnd > whitelistedIndex && matchEnd <= whitelistedEnd) || // match ends inside it
+            (whitelistedIndex >= matchStart && whitelistedEnd <= matchEnd) // occurrence lies entirely inside the match
+          ) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Replaces every regex match in the text with the replacer's output, leaving unmatched text untouched.
+   * @param text - The original text; each matched word is sliced from it at the match's own indices.
+   * @param lowercaseText - The lowercase version of the text the regex runs against (assumed index-aligned with `text`).
+   * @param replacer - Receives the original word plus its start/end indices in `lowercaseText`; returns the replacement.
+   * @param regex - The regular expression used to find profane words (iterated with `exec`, so expected to be global).
+   * @returns The text with profanity replaced.
+   */
+  private replaceProfanity(
+    text: string,
+    lowercaseText: string,
+    replacer: (word: string, start: number, end: number) => string,
+    regex: RegExp,
+  ): string {
+    let result = text;
+    let offset = 0; // how far positions in `result` have shifted relative to `text`
+
+    let match: RegExpExecArray | null;
+    while ((match = regex.exec(lowercaseText)) !== null) {
+      const matchStart = match.index;
+      const matchEnd = matchStart + match[0].length;
+      const originalWord = text.slice(matchStart, matchEnd); // `text` is never modified, so no offset here
+      const censoredWord = replacer(originalWord, matchStart, matchEnd);
+      result = result.slice(0, matchStart + offset) + censoredWord + result.slice(matchEnd + offset); // offset applies to `result` only
+      offset += censoredWord.length - originalWord.length;
+    }
+
+    return result;
+  }
+
   /**
    * Determines the list of languages to use, either from the provided list or falling back to default languages.
    * @param languages - An optional list of languages to use.
diff --git a/tests/profanity.spec.ts b/tests/profanity.spec.ts
@@ -81,29 +81,39 @@ describe("Profanity", () => {
       it("should detect custom added words (wholeWord = true)", () => {
         customProfanity.addWords(["cucumber", "banana"]);
         expect(customProfanity.exists("I love cucumbers")).to.be.false;
+        expect(customProfanity.censor("I love cucumbers")).to.equal("I love cucumbers");
         expect(customProfanity.exists("I love cucumber")).to.be.true;
+        expect(customProfanity.censor("I love cucumber")).to.equal(`I love ${customProfanity.options.grawlix}`);
         expect(customProfanity.exists("Bananas are yellow")).to.be.false;
+        expect(customProfanity.censor("Bananas are yellow")).to.equal("Bananas are yellow");
         expect(customProfanity.exists("This banana is yellow")).to.be.true;
+        expect(customProfanity.censor("This banana is yellow")).to.equal(`This ${customProfanity.options.grawlix} is yellow`);
       });
 
       it("should detect custom added words (wholeWord = false)", () => {
         const customProfanityPartial = new Profanity({ wholeWord: false });
         customProfanityPartial.addWords(["cucumber", "banana"]);
         expect(customProfanityPartial.exists("I love cucumbers")).to.be.true;
+        expect(customProfanityPartial.censor("I love cucumbers")).to.equal(`I love ${customProfanityPartial.options.grawlix}s`);
         expect(customProfanityPartial.exists("Bananas are yellow")).to.be.true;
+        expect(customProfanityPartial.censor("Bananas are yellow")).to.equal(`${customProfanityPartial.options.grawlix}s are yellow`);
       });
 
       it("should not detect removed words", () => {
         customProfanity.removeWords(["butt", "arse"]);
         expect(customProfanity.exists("Don't be a butt")).to.be.false;
+        expect(customProfanity.censor("Don't be a butt")).to.equal("Don't be a butt");
         expect(customProfanity.exists("You're an arse")).to.be.false;
+        expect(customProfanity.censor("You're an arse")).to.equal("You're an arse");
       });
 
       it("should handle adding and removing words in sequence", () => {
         customProfanity.addWords(["test"]);
         expect(customProfanity.exists("test")).to.be.true;
+        expect(customProfanity.censor("test")).to.equal(customProfanity.options.grawlix);
         customProfanity.removeWords(["test"]);
         expect(customProfanity.exists("test")).to.be.false;
+        expect(customProfanity.censor("test")).to.equal("test");
       });
     });
   });
@@ -116,15 +126,24 @@ describe("Profanity", () => {
     });
 
     describe("wholeWord = true", () => {
+      it("should whitelist a word", () => {
+        customProfanity.whitelist.addWords(["butt"]);
+        expect(customProfanity.exists("Don't be a butt")).to.be.false;
+        expect(customProfanity.censor("Don't be a butt")).to.equal("Don't be a butt");
+      });
+
       it("should whitelist multiple words", () => {
         customProfanity.whitelist.addWords(["butt", "arse"]);
         expect(customProfanity.exists("Should we censor the word butt or arse?")).to.be.false;
+        expect(customProfanity.censor("Should we censor the word butt or arse?")).to.equal("Should we censor the word butt or arse?");
       });
 
       it("should only whitelist exact whole words", () => {
         customProfanity.whitelist.addWords(["but"]);
         expect(customProfanity.exists("Don't be a but")).to.be.false;
+        expect(customProfanity.censor("Don't be a but")).to.equal("Don't be a but");
         expect(customProfanity.exists("Don't be a butt")).to.be.true;
+        expect(customProfanity.censor("Don't be a butt")).to.equal("Don't be a @#$%&!");
       });
 
       describe("Hyphenated and underscore-separated words", () => {
@@ -134,10 +153,12 @@ describe("Profanity", () => {
 
         it("should detect profanity in hyphenated words when part is whitelisted", () => {
           expect(customProfanity.exists("Don't be a butt-head")).to.be.true;
+          expect(customProfanity.censor("Don't be a butt-head")).to.equal(`Don't be a ${customProfanity.options.grawlix}-head`);
         });
 
         it("should detect profanity in underscore-separated words when part is whitelisted", () => {
           expect(customProfanity.exists("Don't be a butt_head")).to.be.true;
+          expect(customProfanity.censor("Don't be a butt_head")).to.equal(`Don't be a ${customProfanity.options.grawlix}_head`);
         });
       });
     });
@@ -161,22 +182,27 @@ describe("Profanity", () => {
 
         it("should detect 'arse' as profanity", () => {
           expect(customProfanityPartial.exists("what an arse")).to.be.true;
+          expect(customProfanityPartial.censor("what an arse")).to.equal(`what an ${customProfanityPartial.options.grawlix}`);
         });
 
         it("should not detect 'arsenic' as profanity due to whitelist", () => {
           expect(customProfanityPartial.exists("dedicated arsenic")).to.be.false;
+          expect(customProfanityPartial.censor("dedicated arsenic")).to.equal("dedicated arsenic");
         });
 
         it("should not detect 'class' as profanity due to whitelist", () => {
           expect(customProfanityPartial.exists("dedicated class person")).to.be.false;
+          expect(customProfanityPartial.censor("dedicated class person")).to.equal("dedicated class person");
         });
 
         it("should not detect 'classic' as profanity due to whitelist", () => {
           expect(customProfanityPartial.exists("dedicated classic")).to.be.false;
+          expect(customProfanityPartial.censor("dedicated classic")).to.equal("dedicated classic");
         });
 
         it("should not detect 'password' as profanity due to whitelist", () => {
           expect(customProfanityPartial.exists("dedicated password")).to.be.false;
+          expect(customProfanityPartial.censor("dedicated password")).to.equal("dedicated password");
         });
       });
     });
@@ -199,19 +225,23 @@ describe("Profanity", () => {
     it("should not detect whitelisted words", () => {
       customProfanity.whitelist.addWords(["classic", "assembly"]);
       expect(customProfanity.exists("That's a classic movie")).to.be.false;
+      expect(customProfanity.censor("That's a classic movie")).to.equal("That's a classic movie");
       expect(customProfanity.exists("The assembly line is efficient")).to.be.false;
+      expect(customProfanity.censor("The assembly line is efficient")).to.equal("The assembly line is efficient");
     });
 
     it("should detect profanity after removing from whitelist", () => {
       customProfanity.whitelist.addWords(["classic"]);
       customProfanity.whitelist.removeWords(["classic"]);
       expect(customProfanity.exists("That's a classic butt movie")).to.be.true;
+      expect(customProfanity.censor("That's a classic butt movie")).to.equal(`That's a classic ${customProfanity.options.grawlix} movie`);
     });
 
     it("should handle adding and removing words from whitelist in sequence", () => {
       customProfanity.whitelist.addWords(["test"]);
       customProfanity.addWords(["test"]);
       expect(customProfanity.exists("test")).to.be.false;
+      expect(customProfanity.censor("test")).to.equal("test");
     });
   });