import { describe, expect, test } from "vitest";
import { normalizeString } from "./normalizeString";

/**
 * Behavioural spec for `normalizeString`.
 *
 * The expectations below pin the following contract:
 *  - Latin diacritical marks are removed (é → e, ñ → n, ç → c);
 *  - the result is lowercased;
 *  - runs of whitespace (spaces, tabs, line breaks, NBSP) collapse to a single
 *    space and leading/trailing whitespace is trimmed;
 *  - the invisible characters U+200B, U+2028, U+2029 and U+0000 act as
 *    separators and come out as a single space;
 *  - everything else (Cyrillic, CJK, ł, symbols, emoji, digits, punctuation)
 *    passes through unchanged apart from lowercasing.
 */
describe("normalizeString", () => {
  test("removes diacritical marks", () => {
    expect(normalizeString("Crème brûlée")).toBe("creme brulee");
    expect(normalizeString(" niñO")).toBe("nino");
    expect(normalizeString("café")).toBe("cafe");
    expect(normalizeString("naïve")).toBe("naive");
    expect(normalizeString("résumé")).toBe("resume");
    expect(normalizeString("Zürich")).toBe("zurich");
    expect(normalizeString("Москва")).toBe("москва"); // Cyrillic remains (lowercased)
    expect(normalizeString("北京")).toBe("北京"); // Chinese remains
  });

  test("converts to lowercase", () => {
    expect(normalizeString("HELLO WORLD")).toBe("hello world");
    expect(normalizeString("CamelCase")).toBe("camelcase");
    expect(normalizeString("UPPERCASE")).toBe("uppercase");
    expect(normalizeString("MiXeD CaSe")).toBe("mixed case");
  });

  test("normalizes spaces", () => {
    // Inputs deliberately contain runs of whitespace so that collapsing, not
    // just trimming, is exercised.
    expect(normalizeString("  multiple   spaces  ")).toBe("multiple spaces");
    expect(normalizeString("\t\ttabs\t\there\t")).toBe("tabs here");
    expect(normalizeString("line\nbreaks\r\nhere")).toBe("line breaks here");
    expect(normalizeString("   leading and trailing   ")).toBe(
      "leading and trailing"
    );
  });

  test("handles special characters", () => {
    expect(normalizeString("user@example.com")).toBe("user@example.com");
    expect(normalizeString("price: $99.99")).toBe("price: $99.99");
    expect(normalizeString("100%")).toBe("100%");
    expect(normalizeString("C++ & Java")).toBe("c++ & java");
    expect(normalizeString("hello-world_test")).toBe("hello-world_test");
  });

  test("handles empty and whitespace-only strings", () => {
    expect(normalizeString("")).toBe("");
    expect(normalizeString("   ")).toBe("");
    expect(normalizeString("\t\n\r")).toBe("");
    expect(normalizeString("\u00A0")).toBe(""); // non-breaking space
  });

  test("handles unicode normalization", () => {
    // Compatibility characters are expected to survive untouched. The ligature
    // is written as an escape so the single code point U+FB01 cannot be
    // silently replaced by the two ASCII letters "fi".
    expect(normalizeString("\uFB01")).toBe("\uFB01"); // ligature remains
    expect(normalizeString("½")).toBe("½"); // fraction remains
    expect(normalizeString("™")).toBe("™"); // trademark remains
    expect(normalizeString("😀")).toBe("😀"); // emoji remains
  });

  test("handles combined diacritics", () => {
    expect(normalizeString("àáâãäå")).toBe("aaaaaa");
    expect(normalizeString("èéêë")).toBe("eeee");
    expect(normalizeString("ìíîï")).toBe("iiii");
    expect(normalizeString("òóôõö")).toBe("ooooo");
    expect(normalizeString("ùúûü")).toBe("uuuu");
    expect(normalizeString("ýÿ")).toBe("yy");
    expect(normalizeString("ñ")).toBe("n");
    expect(normalizeString("ç")).toBe("c");
  });

  test("handles mixed content", () => {
    expect(normalizeString(" José's Café - 50% OFF! ")).toBe(
      "jose's cafe - 50% off!"
    );
    expect(normalizeString("Björk & Sigur Rós")).toBe("bjork & sigur ros");
    expect(normalizeString("Düsseldorf → München")).toBe(
      "dusseldorf → munchen"
    );
    expect(normalizeString(" Naïve\tRésumé\n2024 ")).toBe(
      "naive resume 2024"
    );
  });

  test("preserves numbers and punctuation", () => {
    expect(normalizeString("123.456")).toBe("123.456");
    expect(normalizeString("test@example.com")).toBe("test@example.com");
    expect(normalizeString("hello, world!")).toBe("hello, world!");
    expect(normalizeString("question?")).toBe("question?");
    expect(normalizeString("[brackets] {braces} (parens)")).toBe(
      "[brackets] {braces} (parens)"
    );
  });

  test("handles very long strings", () => {
    const longString = "À".repeat(1000) + " " + "É".repeat(1000);
    const expected = "a".repeat(1000) + " " + "e".repeat(1000);
    expect(normalizeString(longString)).toBe(expected);
  });

  test("handles invisible characters", () => {
    // Each of these is expected to behave like a word separator.
    expect(normalizeString("hello\u200Bworld")).toBe("hello world"); // zero-width space
    expect(normalizeString("test\u2028text")).toBe("test text"); // line separator
    expect(normalizeString("foo\u2029bar")).toBe("foo bar"); // paragraph separator
    expect(normalizeString("data\u0000value")).toBe("data value"); // null character
  });

  test("real-world examples", () => {
    expect(normalizeString("Señorita María González")).toBe(
      "senorita maria gonzalez"
    );
    expect(normalizeString("François Müller")).toBe("francois muller");
    // Only ł is kept as-is; Ż, ó and ć lose their marks.
    expect(normalizeString("Łukasz Żółć")).toBe("łukasz zołc");
    expect(normalizeString("Renée O'Connor")).toBe("renee o'connor");
    expect(normalizeString(" Dr. José García-López ")).toBe(
      "dr. jose garcia-lopez"
    );
  });
});