|
| 1 | +(ns pixie.streams.utf8 |
| 2 | + (require pixie.streams :refer :all)) |
| 3 | + |
;; Character-oriented output side of a UTF8 stream.
(defprotocol IUTF8OutputStream
  (write-char
    [this char]
    "Write a single character to the UTF8 stream"))
| 6 | + |
;; Character-oriented input side of a UTF8 stream.
(defprotocol IUTF8InputStream
  (read-char
    [this]
    "Read a single character from the UTF8 stream"))
| 9 | + |
;; Wraps a byte output stream `out` and encodes characters written through
;; `write-char` as UTF-8 byte sequences (1 to 4 bytes per character).
;;
;; Lead-byte markers: 0xC0 (2 bytes), 0xE0 (3 bytes), 0xF0 (4 bytes).
;; Every following byte is a continuation byte: 0x80 plus six payload bits.
;; Disposing this stream disposes the wrapped `out` stream.
(deftype UTF8OutputStream [out]
  IUTF8OutputStream
  (write-char [this ch]
    (let [ch (int ch)]
      (cond
        ;; 1 byte: 0xxxxxxx
        (<= ch 0x7F) (write-byte out ch)
        ;; 2 bytes: 110xxxxx 10xxxxxx
        (<= ch 0x7FF) (do (write-byte out (bit-or 0xC0 (bit-shift-right ch 6)))
                          (write-byte out (bit-or 0x80 (bit-and ch 0x3F))))
        ;; 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        (<= ch 0xFFFF) (do (write-byte out (bit-or 0xE0 (bit-shift-right ch 12)))
                           (write-byte out (bit-or 0x80 (bit-and (bit-shift-right ch 6) 0x3F)))
                           (write-byte out (bit-or 0x80 (bit-and ch 0x3F))))
        ;; 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        ;; The lead byte marker must be 0xF0 here; 0xE0 is the 3-byte marker
        ;; and would produce a sequence no decoder can read back.
        (<= ch 0x1FFFFF) (do (write-byte out (bit-or 0xF0 (bit-shift-right ch 18)))
                             (write-byte out (bit-or 0x80 (bit-and (bit-shift-right ch 12) 0x3F)))
                             (write-byte out (bit-or 0x80 (bit-and (bit-shift-right ch 6) 0x3F)))
                             (write-byte out (bit-or 0x80 (bit-and ch 0x3F))))
        ;; Anything larger does not fit in a 4-byte sequence.
        :else (assert false (str "Cannot encode a UTF8 character of code " ch)))))
  IDisposable
  (-dispose! [this]
    (dispose! out)))
| 29 | + |
| 30 | + |
;; Wraps a byte input stream `in` and decodes UTF-8 byte sequences into
;; characters via `read-char`.
;;
;; `bad-char` selects what happens when an invalid sequence is decoded:
;; when it is non-nil it is returned in place of the character, otherwise
;; an error is thrown. Disposing this stream disposes the wrapped `in` stream.
;;
;; NOTE(review): continuation bytes are not checked for the 10xxxxxx form;
;; only the lead byte is classified.
(deftype UTF8InputStream [in bad-char]
  IUTF8InputStream
  (read-char [this]
    (let [ch (int (read-byte in))
          ;; Classify the lead byte into:
          ;;   n      - payload bits carried by the lead byte
          ;;   bytes  - total length of the sequence, lead byte included
          ;;   error? - true when the sequence cannot yield a valid character
          [n bytes error?] (cond
                             (>= 0x7F ch) [ch 1 false]
                             (= 0xC0 (bit-and ch 0xE0)) [(bit-and ch 31) 2 false]
                             (= 0xE0 (bit-and ch 0xF0)) [(bit-and ch 15) 3 false]
                             (= 0xF0 (bit-and ch 0xF8)) [(bit-and ch 7) 4 false]
                             ;; 5- and 6-byte forms are consumed in full but
                             ;; reported as errors. The 5-byte mask has to be
                             ;; 0xFC: with 0xF8 it would also swallow the
                             ;; 0xFC-0xFF lead bytes and skip the wrong number
                             ;; of continuation bytes.
                             (= 0xF8 (bit-and ch 0xFC)) [(bit-and ch 3) 5 true]
                             (= 0xFC (bit-and ch 0xFE)) [(bit-and ch 1) 6 true]
                             ;; Stray continuation byte (0x80-0xBF) or
                             ;; 0xFE/0xFF: report the raw byte itself.
                             :else [ch 1 true])]
      ;; Fold each remaining continuation byte's low six bits into n.
      (loop [i (dec bytes)
             n n]
        (if (pos? i)
          (recur (dec i)
                 (bit-or (bit-shift-left n 6)
                         (bit-and (read-byte in) 0x3F)))
          (if error?
            (if bad-char
              bad-char
              (throw (str "Invalid UTF8 character decoded: " n)))
            (char n))))))
  IDisposable
  (-dispose! [this]
    (dispose! in)))
| 57 | + |
(defn utf8-input-stream
  "Wraps the IByteInputStream `i` in a UTF8 decoder that yields characters.
  With one argument, decoding a bad character throws an error. With a second
  argument, that bad-character marker is returned in place of the bad
  character instead."
  ([i]
    ;; No marker supplied: use nil so that bad characters throw.
    (utf8-input-stream i nil))
  ([i bad-char]
    (->UTF8InputStream i bad-char)))
| 65 | + |
(defn utf8-output-stream
  "Wraps the IByteOutputStream `o` in a UTF8 encoder; characters written to
  the returned stream are emitted to `o` as bytes."
  [o]
  (->UTF8OutputStream o))
0 commit comments