Skip to content

Commit 4ff2ba7

Browse files
authored
CORE: flexible UTF-8 encoding error handling (#361)
Also a bit faster.
1 parent 2e68b43 commit 4ff2ba7

File tree

1 file changed

+84
-53
lines changed

1 file changed

+84
-53
lines changed

modules/ln_core/utf8string.scm

Lines changed: 84 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -57,20 +57,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5757
(define utf8string? string?)
5858
(define utf8string=? string=?)
5959
(define utf8string-append string-append)
60-
(define utf8string-copy string-copy)
60+
(define utf8string-copy string-copy)
6161

6262
(define (utf8string-length src) (length (utf8string->unicode src)))
6363

6464
(define (utf8string-ref s idx) (integer->utf8char (list-ref (utf8string->unicode s) idx)))
6565

66-
(define (utf8string . cs)
66+
(define (utf8string . cs)
6767
(let ((utf8cs (map char->utf8char cs)))
6868
(apply string-append utf8cs)))
6969

70-
(define (make-utf8string n . c)
70+
(define (make-utf8string n . c)
7171
(let ((utf8c (char->utf8char (if (fx= (length c) 1) (car c) " "))))
72-
(let loop ((i 0)(s ""))
73-
(if (fx= i n) s
72+
(let loop ((i 0) (s ""))
73+
(if (fx= i n) s
7474
(loop (fx+ i 1) (string-append s utf8c))))))
7575

7676
(define (utf8string->list s) (map integer->utf8char (utf8string->unicode s)))
@@ -108,17 +108,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
108108
(define utf8string-explode (lambda (str seplst)
109109
(let ((utf8seplst (map char->utf8char seplst)))
110110
(let loop ((strlst (utf8string->list str))(tmp "")(res '()))
111-
(if (= (length strlst) 0) (append res
111+
(if (= (length strlst) 0) (append res
112112
(if (> (utf8string-length tmp) 0) (list tmp) '()))
113113
(let ((chop? (member (car strlst) utf8seplst)))
114114
(loop (cdr strlst) (if chop? "" (utf8string-append tmp (utf8string (car strlst))))
115-
(if chop? (append res (list tmp)
115+
(if chop? (append res (list tmp)
116116
(list (utf8string (car strlst)))) res))))))))
117117

118118
(define (utf8string-split str sep)
119119
(let ((utf8sep (char->utf8char sep)))
120120
(let loop ((cs (utf8string->list str))(subres "")(res '()))
121-
(if (= (length cs) 0) (if (> (utf8string-length subres) 0)
121+
(if (= (length cs) 0) (if (> (utf8string-length subres) 0)
122122
(append res (list subres)) res)
123123
(let* ((c (car cs))
124124
(split? (utf8char=? c utf8sep)))
@@ -138,20 +138,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
138138
(c1 (if (zero? pat-len) #f (utf8string-ref pattern 0)))
139139
(c2 (if (<= pat-len 1) #f (utf8string-ref pattern 1))))
140140
(cond
141-
((not c1) 0)
141+
((not c1) 0)
142142
((not c2) (utf8string-index str c1 cmp))
143143
(else (let outer ((pos 0))
144144
(cond
145145
((> pos search-span) #f)
146146
((not (cmp c1 (utf8string-ref str pos)))
147-
(outer (+ 1 pos)))
147+
(outer (+ 1 pos)))
148148
((not (cmp c2 (utf8string-ref str (+ 1 pos))))
149149
(outer (+ 1 pos)))
150150
(else (let inner ((i-pat 2) (i-str (+ 2 pos)))
151-
(if (>= i-pat pat-len) pos
151+
(if (>= i-pat pat-len) pos
152152
(if (cmp (utf8string-ref pattern i-pat) (utf8string-ref str i-str))
153153
(inner (+ 1 i-pat) (+ 1 i-str))
154-
(outer (+ 1 pos))))))))))))
154+
(outer (+ 1 pos))))))))))))
155155

156156
(define (utf8string-contains str pattern) (utf8string:contains str pattern utf8char=?))
157157
(define (utf8string-contains-ci str pattern) (utf8string:contains str pattern utf8char-ci=?))
@@ -164,7 +164,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
164164
(utf8newchr (char->utf8char newchr)))
165165
(let loop ((oldcs (utf8string->list str))(newcs '()))
166166
(if (= (length oldcs) 0) (list->utf8string newcs)
167-
(loop (cdr oldcs) (append newcs
167+
(loop (cdr oldcs) (append newcs
168168
(list (if (utf8char=? (car oldcs) utf8oldchr) utf8newchr (car oldcs)))))))))
169169

170170
(define utf8string-replace-substring string-replace-substring)
@@ -177,53 +177,84 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
177177
(let* ((moveindex (- (length first) 1))
178178
(newfirst (list-head first moveindex))
179179
(newsecond (append (list (list-ref first moveindex)) second))
180-
(neww (max (utf8string-length (utf8string-mapconcat newfirst " "))
180+
(neww (max (utf8string-length (utf8string-mapconcat newfirst " "))
181181
(utf8string-length (utf8string-mapconcat newsecond " ")))))
182182
(if (< neww bestw)
183183
(loop newfirst newsecond neww)
184184
(list (utf8string-mapconcat first " ") (utf8string-mapconcat second " ")))))))
185185

186186
;; -------------------------------
187187
;; unicode<->utf8 translation
188-
;; adopted from http://ccm.sherry.jp/cleite/ (public domain)
189-
190-
(define (utf8string->unicode src)
191-
(define (decode l)
192-
(if (null? l) l
193-
(cond ((fx= 0 (bitwise-and (car l) #x80))
194-
(cons (car l)
195-
(decode (list-tail l 1))))
196-
((fx= #xC0 (bitwise-and (car l) #xE0))
197-
(cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0) #x1F) 6)
198-
(bitwise-and (list-ref l 1) #x3F))
199-
(decode (list-tail l 2))))
200-
((fx= #xE0 (bitwise-and (car l) #xF0))
201-
(cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0) #x0F) 12)
202-
(arithmetic-shift (bitwise-and (list-ref l 1) #x3F) 6)
203-
(bitwise-and (list-ref l 2) #x3F))
204-
(decode (list-tail l 3))))
205-
((fx= #xF0 (bitwise-and (car l) #xF8))
206-
(cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0) #x07) 18)
207-
(arithmetic-shift (bitwise-and (list-ref l 1) #x3F) 12)
208-
(arithmetic-shift (bitwise-and (list-ref l 2) #x3F) 6)
209-
(bitwise-and (list-ref l 3) #x3F))
210-
(decode (list-tail l 4))))
211-
((fx= #xF8 (bitwise-and (car l) #xFC))
212-
(cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0) #x03) 24)
213-
(arithmetic-shift (bitwise-and (list-ref l 1) #x3F) 18)
214-
(arithmetic-shift (bitwise-and (list-ref l 2) #x3F) 12)
215-
(arithmetic-shift (bitwise-and (list-ref l 3) #x3F) 6)
216-
(bitwise-and (list-ref l 4) #x3F))
217-
(decode (list-tail l 5))))
218-
((fx= #xFC (bitwise-and (car l) #xFE))
219-
(cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0) #x01) 30)
220-
(arithmetic-shift (bitwise-and (list-ref l 1) #x3F) 24)
221-
(arithmetic-shift (bitwise-and (list-ref l 2) #x3F) 18)
222-
(arithmetic-shift (bitwise-and (list-ref l 3) #x3F) 12)
223-
(arithmetic-shift (bitwise-and (list-ref l 4) #x3F) 6)
224-
(bitwise-and (list-ref l 5) #x3F))
225-
(decode (list-tail l 6)))))))
226-
(decode (map char->integer (string->list src))))
188+
189+
(define utf8string->unicode:on-encoding-error
190+
(let ((handler #f))
191+
(define (utf8string->unicode:on-encoding-error-replace str idx) #xfffd)
192+
(define (utf8string->unicode:on-encoding-error-raise-error str idx)
193+
(error "UTF-8 char encoding error" str idx))
194+
(define (utf8string->unicode:on-encoding-error-log+replace str idx)
195+
(log-error "UTF-8 char encoding error" str idx)
196+
#xfffd)
197+
(define (select key)
198+
(set!
199+
handler
200+
(if (procedure? key)
201+
key
202+
(case key
203+
((replace #f) utf8string->unicode:on-encoding-error-replace)
204+
((log+replace) utf8string->unicode:on-encoding-error-log+replace)
205+
(else utf8string->unicode:on-encoding-error-raise-error)))))
206+
(select 'error)
207+
(lambda args
208+
(if (null? args) handler (select (car args))))))
209+
210+
(define (utf8string->unicode str #!optional (encoding-error #t))
211+
(define (on-encoding-error i)
212+
(cond
213+
((not encoding-error) #xfffd) ;; deliver replacement charater
214+
((procedure? encoding-error) (encoding-error str i)) ;; delegate to caller
215+
(else ((utf8string->unicode:on-encoding-error) str i)))) ;; delegate to registered
216+
(let next-char ((i 0))
217+
(if (fx>= i (string-length str)) '()
218+
(let* ((c (string-ref str i))
219+
(c1 (char->integer c)))
220+
(receive (size m1)
221+
(cond
222+
((fx< c1 #x80) (values 1 #x7f))
223+
((fx< c1 #xe0) (values 2 #x0f))
224+
((fx< c1 #xf0) (values 3 #x0f))
225+
((fx< c1 #xf8) (values 4 #x07))
226+
((fx< c1 #xfc) (values 5 #x03))
227+
(else (values 6 #x01)))
228+
(if (fx= size 1)
229+
(cons c1 (next-char (fx+ i 1)))
230+
(let ((limit (fx+ i size)))
231+
(let subc ((j (fx+ i 1)) (r (bitwise-and c1 m1)))
232+
(cond
233+
((fx>= j limit) (cons r (next-char limit)))
234+
((fx>= j (string-length str)) (list (on-encoding-error j)))
235+
(else
236+
(let ((cc (char->integer (string-ref str j))))
237+
(if (or (fx< cc #x80) (fx>= cc #xc0))
238+
(let ((r (on-encoding-error j))) ;; 1st force handling
239+
(cons r (next-char j)))
240+
(subc
241+
(fx+ j 1)
242+
(bitwise-ior
243+
(arithmetic-shift r 6)
244+
(bitwise-and cc #x3F)))))))))))))))
245+
246+
(define (utf8string->unicode/fallback str #!key (fallback #f))
247+
;; A version, which replaces string causing convertion erros with a
248+
;; hopefuly safe fallback.
249+
(let ((use-fallback (list 0))) ;; just a well known value
250+
(with-exception-catcher
251+
(lambda (exn)
252+
(if (eq? exn use-fallback)
253+
(case fallback
254+
((#f) (map char->integer (string->list str)))
255+
(else (fallback str)))
256+
(raise exn)))
257+
(lambda () (utf8string->unicode str (lambda (str i) (raise use-fallback)))))))
227258

228259
(define (unicode->utf8string src)
229260
(cond ((integer? src)

0 commit comments

Comments
 (0)