Skip to content

Commit 446719d

Browse files
committed
Add extra.UnicodeWordBoundary. Fix string.WordBoundary. Fix staticcheck warnings.
1 parent b1390f8 commit 446719d

File tree

5 files changed

+71
-3
lines changed

5 files changed

+71
-3
lines changed

Changelog.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
# Changelog
22

33
# v0.0.8
4-
* Add parsers to the extra package.
4+
* Add parsers to the extra package:
55
* `AsciiIdentifierStr`
66
* `UnicodeIdentifierStr`
7+
* `UnicodeWordBoundary`
8+
* Fix `string.WordBoundary`:
9+
* Treat `$` as a word character when detecting word boundaries.
10+
* Fix staticcheck warnings.
711
* Fix formula example.
812
* Fix README.
913

extra/extra.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package extra
22

33
import (
44
"unicode"
5+
"unicode/utf8"
56

67
. "github.com/shellyln/takenoco/base"
78
clsz "github.com/shellyln/takenoco/extra/classes"
89
. "github.com/shellyln/takenoco/string"
10+
strclsz "github.com/shellyln/takenoco/string/classes"
911
)
1012

1113
// Exclude parsed ASTs from the results.
@@ -235,6 +237,48 @@ func UnicodeIdentifierStr() ParserFn {
235237
)
236238
}
237239

240+
// Zero-width assertion on a word boundary.
241+
func UnicodeWordBoundary() ParserFn {
242+
const ClassName = strclsz.WordBoundary
243+
return LightBaseParser(ClassName, func(ctx ParserContext) (ParserContext, error) {
244+
ctx.Length = 0
245+
ctx.MatchStatus = MatchStatus_Unmatched
246+
247+
ch, length := utf8.DecodeRuneInString(ctx.Str[ctx.Position:])
248+
249+
if ctx.Position == 0 {
250+
if 0 < length && isUnicodeWord(ch) {
251+
ctx.MatchStatus = MatchStatus_Matched
252+
}
253+
return ctx, nil
254+
}
255+
256+
var prevCh rune
257+
var prevChLength int
258+
for i := ctx.Position - 1; 0 <= i; i-- {
259+
b := ctx.Str[i]
260+
if b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
261+
prevCh, prevChLength = utf8.DecodeRuneInString(ctx.Str[i:])
262+
break
263+
}
264+
}
265+
266+
if ctx.Position == len(ctx.Str) {
267+
if 0 < prevChLength && isUnicodeWord(prevCh) {
268+
ctx.MatchStatus = MatchStatus_Matched
269+
}
270+
} else {
271+
if length != 0 && prevChLength != 0 {
272+
if isUnicodeWord(prevCh) && !isUnicodeWord(ch) || !isUnicodeWord(prevCh) && isUnicodeWord(ch) {
273+
ctx.MatchStatus = MatchStatus_Matched
274+
}
275+
}
276+
}
277+
278+
return ctx, nil
279+
})
280+
}
281+
238282
// Parse the ISO 8601 date string. (yyyy-MM-dd)
239283
func DateStr() ParserFn {
240284
return Trans(

extra/util.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package extra
2+
3+
import "unicode"
4+
5+
// isUnicodeWord reports whether r is a Unicode word character.
//
// The accepted set is ID_Continue + '$' + U+200C (ZWNJ) + U+200D (ZWJ), i.e.
// letters, letter numbers, combining marks, decimal digits, connector
// punctuation such as '_', and the Other_ID_Start / Other_ID_Continue code
// points, minus anything in Pattern_Syntax or Pattern_White_Space.
func isUnicodeWord(r rune) bool {
	// Explicit additions are accepted before the exclusions are applied:
	// '$' itself belongs to Pattern_Syntax, so filtering first would
	// silently drop it again.
	switch r {
	case '$', 0x200c, 0x200d:
		return true
	}

	if unicode.Is(unicode.Pattern_Syntax, r) || unicode.Is(unicode.Pattern_White_Space, r) {
		return false
	}

	return unicode.In(r,
		unicode.L,
		unicode.Nl,
		unicode.Other_ID_Start,
		unicode.Mn,
		unicode.Mc,
		unicode.Nd,
		unicode.Pc,
		unicode.Other_ID_Continue,
	)
}

string/parsers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ func WordBoundary() ParserFn {
513513
var prevChLength int
514514
for i := ctx.Position - 1; 0 <= i; i-- {
515515
b := ctx.Str[i]
516-
if 0x00 <= b && b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
516+
if b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
517517
prevCh, prevChLength = utf8.DecodeRuneInString(ctx.Str[i:])
518518
break
519519
}

string/util.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func isLineBreak(r rune) bool {
3535

3636
// ASCII word characters
3737
func isWord(r rune) bool {
38-
if 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' {
38+
if 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' || r == '$' {
3939
return true
4040
} else {
4141
return false

0 commit comments

Comments
 (0)