@@ -57,20 +57,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5757(define utf8string? string?)
5858(define utf8string=? string=?)
5959(define utf8string-append string-append)
60- (define utf8string-copy string-copy)
60+ (define utf8string-copy string-copy)
6161
6262(define (utf8string-length src ) (length (utf8string->unicode src)))
6363
6464(define (utf8string-ref s idx ) (integer->utf8char (list-ref (utf8string->unicode s) idx)))
6565
66- (define (utf8string . cs )
66+ (define (utf8string . cs )
6767 (let ((utf8cs (map char->utf8char cs)))
6868 (apply string-append utf8cs)))
6969
70- (define (make-utf8string n . c )
70+ (define (make-utf8string n . c )
7171 (let ((utf8c (char->utf8char (if (fx= (length c) 1 ) (car c) " " ))))
72- (let loop ((i 0 )(s " " ))
73- (if (fx= i n) s
72+ (let loop ((i 0 ) (s " " ))
73+ (if (fx= i n) s
7474 (loop (fx+ i 1 ) (string-append s utf8c))))))
7575
7676(define (utf8string->list s ) (map integer->utf8char (utf8string->unicode s)))
@@ -108,17 +108,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
108108(define utf8string-explode (lambda (str seplst )
109109 (let ((utf8seplst (map char->utf8char seplst)))
110110 (let loop ((strlst (utf8string->list str))(tmp " " )(res '() ))
111- (if (= (length strlst) 0 ) (append res
111+ (if (= (length strlst) 0 ) (append res
112112 (if (> (utf8string-length tmp) 0 ) (list tmp) '() ))
113113 (let ((chop? (member (car strlst) utf8seplst)))
114114 (loop (cdr strlst) (if chop? " " (utf8string-append tmp (utf8string (car strlst))))
115- (if chop? (append res (list tmp)
115+ (if chop? (append res (list tmp)
116116 (list (utf8string (car strlst)))) res))))))))
117117
118118(define (utf8string-split str sep )
119119 (let ((utf8sep (char->utf8char sep)))
120120 (let loop ((cs (utf8string->list str))(subres " " )(res '() ))
121- (if (= (length cs) 0 ) (if (> (utf8string-length subres) 0 )
121+ (if (= (length cs) 0 ) (if (> (utf8string-length subres) 0 )
122122 (append res (list subres)) res)
123123 (let* ((c (car cs))
124124 (split? (utf8char=? c utf8sep)))
@@ -138,20 +138,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
138138 (c1 (if (zero? pat-len) #f (utf8string-ref pattern 0 )))
139139 (c2 (if (<= pat-len 1 ) #f (utf8string-ref pattern 1 ))))
140140 (cond
141- ((not c1) 0 )
141+ ((not c1) 0 )
142142 ((not c2) (utf8string-index str c1 cmp))
143143 (else (let outer ((pos 0 ))
144144 (cond
145145 ((> pos search-span) #f )
146146 ((not (cmp c1 (utf8string-ref str pos)))
147- (outer (+ 1 pos)))
147+ (outer (+ 1 pos)))
148148 ((not (cmp c2 (utf8string-ref str (+ 1 pos))))
149149 (outer (+ 1 pos)))
150150 (else (let inner ((i-pat 2 ) (i-str (+ 2 pos)))
151- (if (>= i-pat pat-len) pos
151+ (if (>= i-pat pat-len) pos
152152 (if (cmp (utf8string-ref pattern i-pat) (utf8string-ref str i-str))
153153 (inner (+ 1 i-pat) (+ 1 i-str))
154- (outer (+ 1 pos))))))))))))
154+ (outer (+ 1 pos))))))))))))
155155
156156(define (utf8string-contains str pattern ) (utf8string:contains str pattern utf8char=?))
157157(define (utf8string-contains-ci str pattern ) (utf8string:contains str pattern utf8char-ci=?))
@@ -164,7 +164,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
164164 (utf8newchr (char->utf8char newchr)))
165165 (let loop ((oldcs (utf8string->list str))(newcs '() ))
166166 (if (= (length oldcs) 0 ) (list->utf8string newcs)
167- (loop (cdr oldcs) (append newcs
167+ (loop (cdr oldcs) (append newcs
168168 (list (if (utf8char=? (car oldcs) utf8oldchr) utf8newchr (car oldcs)))))))))
169169
170170(define utf8string-replace-substring string-replace-substring)
@@ -177,53 +177,84 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
177177 (let* ((moveindex (- (length first) 1 ))
178178 (newfirst (list-head first moveindex))
179179 (newsecond (append (list (list-ref first moveindex)) second))
180- (neww (max (utf8string-length (utf8string-mapconcat newfirst " " ))
180+ (neww (max (utf8string-length (utf8string-mapconcat newfirst " " ))
181181 (utf8string-length (utf8string-mapconcat newsecond " " )))))
182182 (if (< neww bestw)
183183 (loop newfirst newsecond neww)
184184 (list (utf8string-mapconcat first " " ) (utf8string-mapconcat second " " )))))))
185185
186186; ; -------------------------------
187187; ; unicode<->utf8 translation
188- ; ; adopted from http://ccm.sherry.jp/cleite/ (public domain)
189-
190- (define (utf8string->unicode src )
191- (define (decode l )
192- (if (null? l) l
193- (cond ((fx= 0 (bitwise-and (car l) #x80 ))
194- (cons (car l)
195- (decode (list-tail l 1 ))))
196- ((fx= #xC0 (bitwise-and (car l) #xE0 ))
197- (cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0 ) #x1F ) 6 )
198- (bitwise-and (list-ref l 1 ) #x3F ))
199- (decode (list-tail l 2 ))))
200- ((fx= #xE0 (bitwise-and (car l) #xF0 ))
201- (cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0 ) #x0F ) 12 )
202- (arithmetic-shift (bitwise-and (list-ref l 1 ) #x3F ) 6 )
203- (bitwise-and (list-ref l 2 ) #x3F ))
204- (decode (list-tail l 3 ))))
205- ((fx= #xF0 (bitwise-and (car l) #xF8 ))
206- (cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0 ) #x07 ) 18 )
207- (arithmetic-shift (bitwise-and (list-ref l 1 ) #x3F ) 12 )
208- (arithmetic-shift (bitwise-and (list-ref l 2 ) #x3F ) 6 )
209- (bitwise-and (list-ref l 3 ) #x3F ))
210- (decode (list-tail l 4 ))))
211- ((fx= #xF8 (bitwise-and (car l) #xFC ))
212- (cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0 ) #x03 ) 24 )
213- (arithmetic-shift (bitwise-and (list-ref l 1 ) #x3F ) 18 )
214- (arithmetic-shift (bitwise-and (list-ref l 2 ) #x3F ) 12 )
215- (arithmetic-shift (bitwise-and (list-ref l 3 ) #x3F ) 6 )
216- (bitwise-and (list-ref l 4 ) #x3F ))
217- (decode (list-tail l 5 ))))
218- ((fx= #xFC (bitwise-and (car l) #xFE ))
219- (cons (bitwise-ior (arithmetic-shift (bitwise-and (list-ref l 0 ) #x01 ) 30 )
220- (arithmetic-shift (bitwise-and (list-ref l 1 ) #x3F ) 24 )
221- (arithmetic-shift (bitwise-and (list-ref l 2 ) #x3F ) 18 )
222- (arithmetic-shift (bitwise-and (list-ref l 3 ) #x3F ) 12 )
223- (arithmetic-shift (bitwise-and (list-ref l 4 ) #x3F ) 6 )
224- (bitwise-and (list-ref l 5 ) #x3F ))
225- (decode (list-tail l 6 )))))))
226- (decode (map char->integer (string->list src))))
188+
189+ (define utf8string->unicode:on-encoding-error
190+ (let ((handler #f ))
191+ (define (utf8string->unicode:on-encoding-error-replace str idx ) #xfffd)
192+ (define (utf8string->unicode:on-encoding-error-raise-error str idx )
193+ (error " UTF-8 char encoding error" str idx))
194+ (define (utf8string->unicode:on-encoding-error-log+replace str idx )
195+ (log-error " UTF-8 char encoding error" str idx)
196+ #xfffd)
197+ (define (select key )
198+ (set!
199+ handler
200+ (if (procedure? key)
201+ key
202+ (case key
203+ ((replace #f ) utf8string->unicode:on-encoding-error-replace)
204+ ((log+replace) utf8string->unicode:on-encoding-error-log+replace)
205+ (else utf8string->unicode:on-encoding-error-raise-error)))))
206+ (select 'error )
207+ (lambda args
208+ (if (null? args) handler (select (car args))))))
209+
210+ (define (utf8string->unicode str #!optional (encoding-error #t ))
211+ (define (on-encoding-error i )
212+ (cond
213+ ((not encoding-error) #xfffd ) ; ; deliver replacement charater
214+ ((procedure? encoding-error) (encoding-error str i)) ; ; delegate to caller
215+ (else ((utf8string->unicode:on-encoding-error) str i)))) ; ; delegate to registered
216+ (let next-char ((i 0 ))
217+ (if (fx>= i (string-length str)) '()
218+ (let* ((c (string-ref str i))
219+ (c1 (char->integer c)))
220+ (receive (size m1)
221+ (cond
222+ ((fx< c1 #x80 ) (values 1 #x7f ))
223+ ((fx< c1 #xe0 ) (values 2 #x0f ))
224+ ((fx< c1 #xf0 ) (values 3 #x0f ))
225+ ((fx< c1 #xf8 ) (values 4 #x07 ))
226+ ((fx< c1 #xfc ) (values 5 #x03 ))
227+ (else (values 6 #x01 )))
228+ (if (fx= size 1 )
229+ (cons c1 (next-char (fx+ i 1 )))
230+ (let ((limit (fx+ i size)))
231+ (let subc ((j (fx+ i 1 )) (r (bitwise-and c1 m1)))
232+ (cond
233+ ((fx>= j limit) (cons r (next-char limit)))
234+ ((fx>= j (string-length str)) (list (on-encoding-error j)))
235+ (else
236+ (let ((cc (char->integer (string-ref str j))))
237+ (if (or (fx< cc #x80 ) (fx>= cc #xc0 ))
238+ (let ((r (on-encoding-error j))) ; ; 1st force handling
239+ (cons r (next-char j)))
240+ (subc
241+ (fx+ j 1 )
242+ (bitwise-ior
243+ (arithmetic-shift r 6 )
244+ (bitwise-and cc #x3F )))))))))))))))
245+
246+ (define (utf8string->unicode/fallback str #!key (fallback #f ))
247+ ; ; A version, which replaces string causing convertion erros with a
248+ ; ; hopefuly safe fallback.
249+ (let ((use-fallback (list 0 ))) ; ; just a well known value
250+ (with-exception-catcher
251+ (lambda (exn )
252+ (if (eq? exn use-fallback)
253+ (case fallback
254+ ((#f ) (map char->integer (string->list str)))
255+ (else (fallback str)))
256+ (raise exn)))
257+ (lambda () (utf8string->unicode str (lambda (str i ) (raise use-fallback)))))))
227258
228259(define (unicode->utf8string src )
229260 (cond ((integer? src)
0 commit comments