Skip to content

Commit 9353bbd

Browse files
authored
Release v0.3.0 - Advanced Unicode string utilities (#10)
* Add comprehensive tests for string manipulation utilities - Implement tests for `graphemes` to validate splitting of various character types including ASCII, emojis, and Unicode characters. - Create tests for `isASCII` to check detection of ASCII and non-ASCII characters, including control characters and mixed content. - Add tests for `normalizeWhitespace` to ensure proper normalization of whitespace characters, including Unicode spaces and various options for trimming and collapsing. - Introduce tests for `removeNonPrintable` to verify removal of control characters while preserving specified whitespace types. - Develop tests for `toASCII` to confirm conversion of non-ASCII characters to their ASCII equivalents, handling diacritics, smart quotes, and various symbols. * chore: update version to 0.3.0 and enhance CHANGELOG with new utilities
1 parent acac262 commit 9353bbd

18 files changed

+1785
-30
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,6 @@ tmp/
3737
temp/
3838
# TypeDoc generated documentation
3939
docs/
40+
41+
# Claude
42+
CLAUDE.MD

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.3.0] - 2025-09-03
11+
12+
### Added
13+
- `codePoints` - Convert strings into arrays of Unicode code points
14+
- `graphemes` - Split strings into grapheme clusters (emoji-aware)
15+
- `isASCII` - Check if string contains only ASCII characters
16+
- `toASCII` - Convert strings to ASCII-safe representation with transliteration
17+
- `normalizeWhitespace` - Normalize various Unicode whitespace characters
18+
- `removeNonPrintable` - Remove control and formatting characters
19+
20+
### Enhanced
21+
- Comprehensive Unicode support across all new utilities
22+
- Support for complex emoji sequences and combining characters
23+
- Configurable options for whitespace normalization and character removal
24+
- Greek and Cyrillic transliteration in toASCII
25+
- Smart symbol conversion (quotes, dashes, fractions, currency)
26+
1027
## [0.2.0] - 2025-09-02
1128

1229
### Added
@@ -55,5 +72,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
5572
- 100% test coverage for utility functions
5673
- Modern build tooling with tsup and Vitest
5774

75+
[0.3.0]: https://github.com/Zheruel/nano-string-utils/releases/tag/v0.3.0
5876
[0.2.0]: https://github.com/Zheruel/nano-string-utils/releases/tag/v0.2.0
5977
[0.1.0]: https://github.com/Zheruel/nano-string-utils/releases/tag/v0.1.0

README.md

Lines changed: 144 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,47 @@ wordCount("Hello world test"); // 3
286286
wordCount("One-word counts as one"); // 5
287287
```
288288

289+
#### `normalizeWhitespace(str: string, options?: NormalizeWhitespaceOptions): string`
290+
291+
Normalizes various Unicode whitespace characters to regular spaces.
292+
293+
```javascript
294+
normalizeWhitespace("hello   world"); // 'hello world'
295+
normalizeWhitespace("hello\u00A0world"); // 'hello world' (non-breaking space)
296+
normalizeWhitespace(" hello "); // 'hello'
297+
normalizeWhitespace("hello\n\nworld"); // 'hello world'
298+
299+
// With options
300+
normalizeWhitespace(" hello ", { trim: false }); // ' hello '
301+
normalizeWhitespace("a  b", { collapse: false }); // 'a  b'
302+
normalizeWhitespace("hello\n\nworld", { preserveNewlines: true }); // 'hello\n\nworld'
303+
304+
// Handles various Unicode spaces
305+
normalizeWhitespace("café\u2003test"); // 'café test' (em space)
306+
normalizeWhitespace("hello\u200Bworld"); // 'hello world' (zero-width space)
307+
normalizeWhitespace("日本\u3000語"); // '日本 語' (ideographic space)
308+
```
309+
310+
#### `removeNonPrintable(str: string, options?: RemoveNonPrintableOptions): string`
311+
312+
Removes non-printable control characters and formatting characters from strings.
313+
314+
```javascript
315+
removeNonPrintable("hello\x00world"); // 'helloworld' (removes NULL character)
316+
removeNonPrintable("hello\nworld"); // 'helloworld' (removes newline by default)
317+
removeNonPrintable("hello\u200Bworld"); // 'helloworld' (removes zero-width space)
318+
removeNonPrintable("hello\u202Dworld"); // 'helloworld' (removes directional override)
319+
320+
// With options
321+
removeNonPrintable("hello\nworld", { keepNewlines: true }); // 'hello\nworld'
322+
removeNonPrintable("hello\tworld", { keepTabs: true }); // 'hello\tworld'
323+
removeNonPrintable("hello\r\nworld", { keepCarriageReturns: true }); // 'hello\rworld'
324+
325+
// Preserves emoji with zero-width joiners
326+
removeNonPrintable("👨‍👩‍👧‍👦"); // '👨‍👩‍👧‍👦' (family emoji preserved)
327+
removeNonPrintable("text\x1B[32mgreen\x1B[0m"); // 'text[32mgreen[0m' (only the ESC control characters are removed; the rest of each ANSI sequence remains)
328+
```
329+
289330
#### `pad(str: string, length: number, chars?: string): string`
290331

291332
Pads a string to a given length by adding characters to both sides (centers the string).
@@ -316,6 +357,32 @@ padEnd("Hi", 6, "=-"); // 'Hi=-=-'
316357
padEnd("5", 3, "0"); // '500'
317358
```
318359

360+
#### `graphemes(str: string): string[]`
361+
362+
Splits a string into an array of grapheme clusters, properly handling emojis, combining characters, and complex Unicode.
363+
364+
```javascript
365+
graphemes("hello"); // ['h', 'e', 'l', 'l', 'o']
366+
graphemes("👨‍👩‍👧‍👦🎈"); // ['👨‍👩‍👧‍👦', '🎈']
367+
graphemes("café"); // ['c', 'a', 'f', 'é']
368+
graphemes("👍🏽"); // ['👍🏽'] - emoji with skin tone
369+
graphemes("🇺🇸"); // ['🇺🇸'] - flag emoji
370+
graphemes("hello👋world"); // ['h', 'e', 'l', 'l', 'o', '👋', 'w', 'o', 'r', 'l', 'd']
371+
```
372+
373+
#### `codePoints(str: string): number[]`
374+
375+
Converts a string into an array of Unicode code points, properly handling surrogate pairs and complex characters.
376+
377+
```javascript
378+
codePoints("hello"); // [104, 101, 108, 108, 111]
379+
codePoints("👍"); // [128077]
380+
codePoints("€"); // [8364]
381+
codePoints("Hello 👋"); // [72, 101, 108, 108, 111, 32, 128075]
382+
codePoints("a👍b"); // [97, 128077, 98]
383+
codePoints("👨‍👩‍👧‍👦"); // [128104, 8205, 128105, 8205, 128103, 8205, 128102]
384+
```
385+
319386
### String Generation
320387

321388
#### `randomString(length: number, charset?: string): string`
@@ -360,38 +427,87 @@ isUrl("not a url"); // false
360427
isUrl("ftp://files.com/file.zip"); // true
361428
```
362429

430+
#### `isASCII(str: string): boolean`
431+
432+
Checks if a string contains only ASCII characters (code points 0-127).
433+
434+
```javascript
435+
isASCII("Hello World!"); // true
436+
isASCII("café"); // false
437+
isASCII("👍"); // false
438+
isASCII("abc123!@#"); // true
439+
isASCII(""); // true
440+
```
441+
442+
#### `toASCII(str: string, options?: { placeholder?: string }): string`
443+
444+
Converts a string to ASCII-safe representation by removing diacritics, converting common Unicode symbols, and optionally replacing non-ASCII characters.
445+
446+
```javascript
447+
toASCII("café"); // 'cafe'
448+
toASCII("Hello “world”"); // 'Hello "world"'
449+
toASCII("em—dash"); // 'em-dash'
450+
toASCII("€100"); // 'EUR100'
451+
toASCII("½ + ¼ = ¾"); // '1/2 + 1/4 = 3/4'
452+
toASCII("→ ← ↑ ↓"); // '-> <- ^ v'
453+
toASCII("α β γ"); // 'a b g'
454+
toASCII("Привет"); // 'Privet'
455+
toASCII("你好"); // '' (removes non-convertible characters)
456+
toASCII("你好", { placeholder: "?" }); // '??'
457+
toASCII("Hello 世界", { placeholder: "?" }); // 'Hello ??'
458+
toASCII("© 2024 Müller™"); // '(c) 2024 Muller(TM)'
459+
```
460+
461+
Features:
462+
463+
- Removes diacritics/accents (café → cafe)
464+
- Converts smart quotes to regular quotes
465+
- Converts Unicode dashes to hyphens
466+
- Converts mathematical symbols (≈ → ~, ≠ → !=)
467+
- Converts currency symbols (€ → EUR, £ → GBP)
468+
- Converts fractions (½ → 1/2)
469+
- Transliterates common Greek and Cyrillic letters
470+
- Handles emojis and multi-byte Unicode correctly
471+
- Optional placeholder for non-convertible characters
472+
363473
## Bundle Size
364474

365475
Each utility is optimized to be as small as possible:
366476

367-
| Function | Size (minified) |
368-
| ------------ | --------------- |
369-
| slugify | ~200 bytes |
370-
| camelCase | ~250 bytes |
371-
| snakeCase | ~220 bytes |
372-
| kebabCase | ~200 bytes |
373-
| pascalCase | ~180 bytes |
374-
| constantCase | ~230 bytes |
375-
| dotCase | ~210 bytes |
376-
| pathCase | ~210 bytes |
377-
| sentenceCase | ~280 bytes |
378-
| titleCase | ~320 bytes |
379-
| capitalize | ~100 bytes |
380-
| truncate | ~150 bytes |
381-
| stripHtml | ~120 bytes |
382-
| escapeHtml | ~180 bytes |
383-
| randomString | ~200 bytes |
384-
| hashString | ~150 bytes |
385-
| reverse | ~80 bytes |
386-
| deburr | ~200 bytes |
387-
| isEmail | ~180 bytes |
388-
| isUrl | ~200 bytes |
389-
| wordCount | ~100 bytes |
390-
| template | ~350 bytes |
391-
| templateSafe | ~400 bytes |
392-
| pad | ~180 bytes |
393-
| padStart | ~150 bytes |
394-
| padEnd | ~150 bytes |
477+
| Function | Size (minified) |
478+
| ------------------- | --------------- |
479+
| slugify | ~200 bytes |
480+
| camelCase | ~250 bytes |
481+
| snakeCase | ~220 bytes |
482+
| kebabCase | ~200 bytes |
483+
| pascalCase | ~180 bytes |
484+
| constantCase | ~230 bytes |
485+
| dotCase | ~210 bytes |
486+
| pathCase | ~210 bytes |
487+
| sentenceCase | ~280 bytes |
488+
| titleCase | ~320 bytes |
489+
| capitalize | ~100 bytes |
490+
| truncate | ~150 bytes |
491+
| stripHtml | ~120 bytes |
492+
| escapeHtml | ~180 bytes |
493+
| randomString | ~200 bytes |
494+
| hashString | ~150 bytes |
495+
| reverse | ~80 bytes |
496+
| deburr | ~200 bytes |
497+
| isEmail | ~180 bytes |
498+
| isUrl | ~200 bytes |
499+
| isASCII | ~100 bytes |
500+
| toASCII | ~450 bytes |
501+
| wordCount | ~100 bytes |
502+
| normalizeWhitespace | ~280 bytes |
503+
| removeNonPrintable | ~200 bytes |
504+
| template | ~350 bytes |
505+
| templateSafe | ~400 bytes |
506+
| pad | ~180 bytes |
507+
| padStart | ~150 bytes |
508+
| padEnd | ~150 bytes |
509+
| graphemes | ~250 bytes |
510+
| codePoints | ~120 bytes |
395511

396512
Total package size: **< 5KB** minified + gzipped
397513

jsr.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@zheruel/nano-string-utils",
3-
"version": "0.2.0",
3+
"version": "0.3.0",
44
"exports": "./src/index.ts",
55
"publish": {
66
"include": [

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "nano-string-utils",
3-
"version": "0.2.0",
3+
"version": "0.3.0",
44
"description": "Ultra-lightweight string utilities with zero dependencies",
55
"type": "module",
66
"main": "./dist/index.cjs",

src/codePoints.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
 * Breaks a string down into the numeric Unicode code points it contains.
 * The string is walked by code point rather than by UTF-16 code unit, so a
 * surrogate pair contributes a single number to the result.
 * @param str - The string to decompose
 * @returns The code point of each character, in input order
 * @example
 * codePoints('hello') // [104, 101, 108, 108, 111]
 * codePoints('👍') // [128077]
 * codePoints('€') // [8364]
 * codePoints('a👍b') // [97, 128077, 98]
 */
export function codePoints(str: string): number[] {
  // Array.from uses the string iterator, which yields whole code points.
  return Array.from(str, (symbol) => symbol.codePointAt(0)).filter(
    (value): value is number => value !== undefined
  );
}

src/graphemes.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/**
 * Grapheme segmenter shared by every `graphemes` call. Building an
 * `Intl.Segmenter` is comparatively costly, so it is created on first use and
 * reused afterwards. It is typed structurally so this declaration does not
 * depend on the ES2022 Intl typings being present.
 */
let graphemeSegmenter:
  | { segment(input: string): Iterable<{ segment: string }> }
  | undefined;

/**
 * Split a string into an array of grapheme clusters
 * Handles emojis, combining characters, and other complex Unicode properly
 * when `Intl.Segmenter` is available; otherwise falls back to splitting by
 * code point.
 * @param str - The string to split
 * @returns Array of grapheme clusters (empty array for an empty string)
 * @example
 * graphemes('👨‍👩‍👧‍👦🎈') // ['👨‍👩‍👧‍👦', '🎈']
 * graphemes('café') // ['c', 'a', 'f', 'é']
 * graphemes('hello') // ['h', 'e', 'l', 'l', 'o']
 */
export function graphemes(str: string): string[] {
  if (!str) return [];

  // Use Intl.Segmenter for proper grapheme cluster splitting
  if (typeof Intl !== "undefined" && "Segmenter" in Intl) {
    // Create the segmenter once and reuse it on later calls.
    const segmenter =
      graphemeSegmenter ??
      (graphemeSegmenter = new Intl.Segmenter(undefined, {
        granularity: "grapheme",
      }));
    return Array.from(segmenter.segment(str), (part) => part.segment);
  }

  // Simple fallback for environments without Intl.Segmenter
  // This won't handle complex emojis properly but works for basic text
  return Array.from(str);
}

src/index.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,15 @@ export { titleCase, type TitleCaseOptions } from "./titleCase.js";
2424
export { constantCase } from "./constantCase.js";
2525
export { dotCase } from "./dotCase.js";
2626
export { pathCase } from "./pathCase.js";
27+
export { graphemes } from "./graphemes.js";
28+
export { codePoints } from "./codePoints.js";
29+
export { isASCII } from "./isASCII.js";
30+
export {
31+
normalizeWhitespace,
32+
type NormalizeWhitespaceOptions,
33+
} from "./normalizeWhitespace.js";
34+
export {
35+
removeNonPrintable,
36+
type RemoveNonPrintableOptions,
37+
} from "./removeNonPrintable.js";
38+
export { toASCII, type ToASCIIOptions } from "./toASCII.js";

src/isASCII.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/**
 * Reports whether every UTF-16 code unit of a string lies in the ASCII range
 * (0-127). An empty string counts as ASCII.
 * @param str - The string to inspect
 * @returns `true` when no code unit exceeds 127, `false` otherwise
 * @example
 * isASCII('Hello World!') // true
 * isASCII('café') // false
 * isASCII('👍') // false
 * isASCII('abc123!@#') // true
 * isASCII('') // true
 */
export function isASCII(str: string): boolean {
  // Without the `u` flag the class matches single code units, so any unit
  // from 0x80 upwards (surrogates included) marks the string as non-ASCII.
  const hasNonAscii = /[\u0080-\uFFFF]/.test(str);
  return !hasNonAscii;
}

0 commit comments

Comments
 (0)