@@ -2,10 +2,12 @@ package extra
22
33import (
44 "unicode"
5+ "unicode/utf8"
56
67 . "github.com/shellyln/takenoco/base"
78 clsz "github.com/shellyln/takenoco/extra/classes"
89 . "github.com/shellyln/takenoco/string"
10+ strclsz "github.com/shellyln/takenoco/string/classes"
911)
1012
1113// Exclude parsed ASTs from the results.
@@ -235,6 +237,48 @@ func UnicodeIdentifierStr() ParserFn {
235237 )
236238}
237239
240+ // Zero-width assertion on a word boundary.
241+ func UnicodeWordBoundary () ParserFn {
242+ const ClassName = strclsz .WordBoundary
243+ return LightBaseParser (ClassName , func (ctx ParserContext ) (ParserContext , error ) {
244+ ctx .Length = 0
245+ ctx .MatchStatus = MatchStatus_Unmatched
246+
247+ ch , length := utf8 .DecodeRuneInString (ctx .Str [ctx .Position :])
248+
249+ if ctx .Position == 0 {
250+ if 0 < length && isUnicodeWord (ch ) {
251+ ctx .MatchStatus = MatchStatus_Matched
252+ }
253+ return ctx , nil
254+ }
255+
256+ var prevCh rune
257+ var prevChLength int
258+ for i := ctx .Position - 1 ; 0 <= i ; i -- {
259+ b := ctx .Str [i ]
260+ if b <= 0x7f || 0xc2 <= b && b <= 0xf0 || b == 0xf3 {
261+ prevCh , prevChLength = utf8 .DecodeRuneInString (ctx .Str [i :])
262+ break
263+ }
264+ }
265+
266+ if ctx .Position == len (ctx .Str ) {
267+ if 0 < prevChLength && isUnicodeWord (prevCh ) {
268+ ctx .MatchStatus = MatchStatus_Matched
269+ }
270+ } else {
271+ if length != 0 && prevChLength != 0 {
272+ if isUnicodeWord (prevCh ) && ! isUnicodeWord (ch ) || ! isUnicodeWord (prevCh ) && isUnicodeWord (ch ) {
273+ ctx .MatchStatus = MatchStatus_Matched
274+ }
275+ }
276+ }
277+
278+ return ctx , nil
279+ })
280+ }
281+
238282// Parse the ISO 8601 date string. (yyyy-MM-dd)
239283func DateStr () ParserFn {
240284 return Trans (
0 commit comments