|
2 | 2 | (require pixie.streams :refer :all)) |
3 | 3 |
|
;; Protocol for sinks that accept characters one at a time and emit them
;; as UTF8.
(defprotocol IUTF8OutputStream
  (write-char [this char] "Write a single character to the UTF8 stream"))
6 | 6 |
|
;; Protocol for sources that yield one decoded character per call.
(defprotocol IUTF8InputStream
  (read-char [this] "Read a single character from the UTF8 stream"))
9 | 9 |
|
10 | 10 | (deftype UTF8OutputStream [out] |
11 | 11 | IUTF8OutputStream |
|
21 | 21 | (<= ch 0x1FFFFF) (do (write-byte out (bit-or 0xE0 (bit-shift-right ch 18))) |
22 | 22 | (write-byte out (bit-or 0x80 (bit-and (bit-shift-right ch 12) 0x3F))) |
23 | 23 | (write-byte out (bit-or 0x80 (bit-and (bit-shift-right ch 6) 0x3F))) |
24 | | - (write-byte out (bit-or 0x80 (bit-and ch 0x3F))) )))) |
| 24 | + (write-byte out (bit-or 0x80 (bit-and ch 0x3F)))) |
| 25 | + :else (assert false (str "Cannot encode a UTF8 character of code " ch))))) |
25 | 26 | IDisposable |
26 | 27 | (-dispose! [this] |
27 | 28 | (dispose! out))) |
28 | 29 |
|
29 | 30 |
|
;; Decodes UTF8 from the byte stream `in`, one character per `read-char` call.
;;
;; Fields:
;;   in       - byte source; consumed with `read-byte`, released with `dispose!`.
;;   bad-char - value returned in place of an undecodable sequence. When it is
;;              nil (or false) an undecodable sequence causes a throw instead.
(deftype UTF8InputStream [in bad-char]
  IUTF8InputStream
  (read-char [this]
    (let [ch (int (read-byte in))
          ;; Classify the lead byte: payload bits carried by the lead byte,
          ;; total sequence length in bytes, and whether the sequence must be
          ;; reported as an error once it has been consumed.
          [n bytes error?] (cond
                             ;; 0xxxxxxx: single byte
                             (>= 0x7F ch) [ch 1 false]
                             ;; 110xxxxx: two bytes
                             (= 0xC0 (bit-and ch 0xE0)) [(bit-and ch 31) 2 false]
                             ;; 1110xxxx: three bytes
                             (= 0xE0 (bit-and ch 0xF0)) [(bit-and ch 15) 3 false]
                             ;; 11110xxx: four bytes
                             (= 0xF0 (bit-and ch 0xF8)) [(bit-and ch 7) 4 false]
                             ;; 111110xx: legacy five-byte form. The mask must be
                             ;; 0xFC; with 0xF8 this clause would also capture
                             ;; 0xFC-0xFF and the next clause could never match.
                             (= 0xF8 (bit-and ch 0xFC)) [(bit-and ch 3) 5 true]
                             ;; 1111110x: legacy six-byte form
                             (= 0xFC (bit-and ch 0xFE)) [(bit-and ch 1) 6 true]
                             ;; Stray continuation byte (10xxxxxx) or 0xFE/0xFF.
                             ;; `n` is not bound yet at this point, so the raw
                             ;; byte is what gets reported.
                             :else [ch 1 true])]
      ;; Fold the remaining (bytes - 1) continuation bytes into the code point,
      ;; six payload bits at a time. Error sequences are consumed as well so
      ;; the next call starts after them.
      (loop [i (dec bytes)
             n n]
        (if (pos? i)
          (recur (dec i)
                 (bit-or (bit-shift-left n 6)
                         (bit-and (read-byte in) 0x3F)))
          (if error?
            (if bad-char
              bad-char
              (throw (str "Invalid UTF8 character decoded: " n)))
            (char n))))))
  IDisposable
  (-dispose! [this]
    (dispose! in)))
50 | 57 |
|
(defn utf8-input-stream
  "Creates a UTF8 decoder that reads characters from the given IByteInputStream.
  If a bad character is found an error will be thrown, unless an optional
  bad-char marker is provided, in which case the marker is returned instead."
  ([i]
   ;; No marker: undecodable input throws.
   (utf8-input-stream i nil))
  ([i bad-char]
   (->UTF8InputStream i bad-char)))
53 | 65 |
|
(defn utf8-output-stream
  "Creates a UTF8 encoder that writes characters to the given IByteOutputStream."
  [o]
  (->UTF8OutputStream o))
0 commit comments