Add extra.UnicodeWordBoundary. Fix string.WordBoundary. Fix staticcheck warnings.

shellyln · shellyln · commit 446719d719b3 · 2023-02-04T11:08:27.000+09:00
diff --git a/Changelog.md b/Changelog.md
@@ -1,9 +1,13 @@
 # Changelog
 
 # v0.0.8
-* Add parsers to the extra package.
+* Add parsers to the extra package:
   * `AsciiIdentifierStr`
   * `UnicodeIdentifierStr`
+  * `UnicodeWordBoundary`
+* Fix `string.WordBoundary`:
+  * Treat `$` as a word character when detecting word boundaries.
+* Fix staticcheck warnings.
 * Fix formula example.
 * Fix README.
 
diff --git a/extra/extra.go b/extra/extra.go
@@ -2,10 +2,12 @@ package extra
 
 import (
 	"unicode"
+	"unicode/utf8"
 
 	. "github.com/shellyln/takenoco/base"
 	clsz "github.com/shellyln/takenoco/extra/classes"
 	. "github.com/shellyln/takenoco/string"
+	strclsz "github.com/shellyln/takenoco/string/classes"
 )
 
 // Exclude parsed ASTs from the results.
@@ -235,6 +237,48 @@ func UnicodeIdentifierStr() ParserFn {
 	)
 }
 
+// UnicodeWordBoundary returns a parser for a zero-width assertion on a
+// word boundary.
+//
+// The assertion holds where exactly one of the two runes around
+// ctx.Position is a Unicode word character (see isUnicodeWord); the
+// start and the end of the input count as non-word neighbours. The
+// parser never consumes input: ctx.Length is always 0 and the outcome is
+// reported through ctx.MatchStatus.
+//
+// Bytes that are not valid UTF-8 decode to utf8.RuneError (U+FFFD),
+// which is not a word character.
+func UnicodeWordBoundary() ParserFn {
+	const ClassName = strclsz.WordBoundary
+	return LightBaseParser(ClassName, func(ctx ParserContext) (ParserContext, error) {
+		ctx.Length = 0
+		ctx.MatchStatus = MatchStatus_Unmatched
+
+		// NOTE(review): Position is assumed to lie within [0, len(Str)];
+		// the slice expressions below panic otherwise.
+		// Rune starting at Position; the width is 0 at the end of input.
+		next, nextLen := utf8.DecodeRuneInString(ctx.Str[ctx.Position:])
+
+		// Rune ending at Position; the width is 0 at the start of input.
+		// DecodeLastRuneInString handles every UTF-8 lead byte
+		// (0xC2-0xF4). The former hand-written backward scan did not
+		// recognise 0xF1, 0xF2 and 0xF4 as lead bytes, so after such a
+		// rune it walked on and returned an earlier rune instead.
+		prev, prevLen := utf8.DecodeLastRuneInString(ctx.Str[:ctx.Position])
+
+		// A missing neighbour (zero width) is a non-word neighbour.
+		nextIsWord := 0 < nextLen && isUnicodeWord(next)
+		prevIsWord := 0 < prevLen && isUnicodeWord(prev)
+
+		// Boundary: word on one side, non-word (or nothing) on the other.
+		if prevIsWord != nextIsWord {
+			ctx.MatchStatus = MatchStatus_Matched
+		}
+
+		return ctx, nil
+	})
+}
+
 // Parse the ISO 8601 date string. (yyyy-MM-dd)
 func DateStr() ParserFn {
 	return Trans(
diff --git a/extra/util.go b/extra/util.go
@@ -0,0 +1,20 @@
+package extra
+
+import "unicode"
+
+// isUnicodeWord reports whether r is a Unicode word character: a member
+// of ID_Continue, or one of '$', U+200C (ZWNJ) and U+200D (ZWJ).
+func isUnicodeWord(r rune) bool {
+	// The extras are accepted before the Pattern_* exclusion below: '$'
+	// (U+0024) is itself Pattern_Syntax, so filtering it through that
+	// exclusion, as the previous single expression did, rejected it.
+	if r == '$' || r == 0x200c || r == 0x200d {
+		return true
+	}
+
+	// ID_Continue, assembled from its constituent categories/properties.
+	idContinue := unicode.In(r, unicode.L, unicode.Nl, unicode.Other_ID_Start,
+		unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue)
+	return idContinue &&
+		!unicode.In(r, unicode.Pattern_Syntax, unicode.Pattern_White_Space)
+}
diff --git a/string/parsers.go b/string/parsers.go
@@ -513,7 +513,7 @@ func WordBoundary() ParserFn {
 		var prevChLength int
 		for i := ctx.Position - 1; 0 <= i; i-- {
 			b := ctx.Str[i]
-			if 0x00 <= b && b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
+			if b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
 				prevCh, prevChLength = utf8.DecodeRuneInString(ctx.Str[i:])
 				break
 			}
diff --git a/string/util.go b/string/util.go
@@ -35,7 +35,7 @@ func isLineBreak(r rune) bool {
 
 // ASCII word characters
 func isWord(r rune) bool {
-	if 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' {
+	if 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' || r == '$' {
 		return true
 	} else {
 		return false

Original file line number	Diff line number	Diff line change
`@@ -513,7 +513,7 @@ func WordBoundary() ParserFn {`
`513`	`513`	`var prevChLength int`
`514`	`514`	`for i := ctx.Position - 1; 0 <= i; i-- {`
`515`	`515`	`b := ctx.Str[i]`
`516`		`- if 0x00 <= b && b <= 0x7f \|\| 0xc2 <= b && b <= 0xf0 \|\| b == 0xf3 {`
	`516`	`+ if b <= 0x7f \|\| 0xc2 <= b && b <= 0xf0 \|\| b == 0xf3 {`
`517`	`517`	`prevCh, prevChLength = utf8.DecodeRuneInString(ctx.Str[i:])`
`518`	`518`	`break`
`519`	`519`	`}`