@@ -65,7 +65,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
65
65
import Control.Monad.ST (unsafeIOToST , unsafeSTToIO )
66
66
#endif
67
67
68
- import Control.Exception (evaluate , try )
68
+ import Control.Exception (evaluate , try , throwIO , ErrorCall ( ErrorCall ) )
69
69
import Control.Monad.ST (runST )
70
70
import Data.Bits ((.&.) )
71
71
import Data.ByteString as B
@@ -131,6 +131,13 @@ decodeLatin1 (PS fp off len) = text a 0 len
131
131
return dest
132
132
133
133
-- | Decode a 'ByteString' containing UTF-8 encoded text.
134
+ --
135
+ -- __NOTE__: The replacement character returned by 'OnDecodeError'
136
+ -- MUST be within the BMP plane; surrogate code points will
137
+ -- automatically be remapped to the replacement char @U+FFFD@
138
+ -- (/since 0.11.3.0/), whereas code points beyond the BMP will throw an
139
+ -- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
140
+ -- those unsupported code points would result in undefined behavior.
134
141
decodeUtf8With :: OnDecodeError -> ByteString -> Text
135
142
decodeUtf8With onErr (PS fp off len) = runText $ \ done -> do
136
143
let go dest = withForeignPtr fp $ \ ptr ->
@@ -146,16 +153,52 @@ decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
146
153
x <- peek curPtr'
147
154
case onErr desc (Just x) of
148
155
Nothing -> loop $ curPtr' `plusPtr` 1
149
- Just c -> do
150
- destOff <- peek destOffPtr
151
- w <- unsafeSTToIO $
152
- unsafeWrite dest (fromIntegral destOff) (safe c)
153
- poke destOffPtr (destOff + fromIntegral w)
154
- loop $ curPtr' `plusPtr` 1
156
+ Just c
157
+ | c > ' \xFFFF ' -> throwUnsupportedReplChar
158
+ | otherwise -> do
159
+ destOff <- peek destOffPtr
160
+ w <- unsafeSTToIO $
161
+ unsafeWrite dest (fromIntegral destOff)
162
+ (safe c)
163
+ poke destOffPtr (destOff + fromIntegral w)
164
+ loop $ curPtr' `plusPtr` 1
155
165
loop (ptr `plusPtr` off)
156
166
(unsafeIOToST . go) =<< A. new len
157
167
where
158
168
desc = " Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
169
+
170
+ throwUnsupportedReplChar = throwIO $
171
+ ErrorCall " decodeUtf8With: non-BMP replacement characters not supported"
172
+ -- TODO: The code currently assumes that the transcoded UTF-16
173
+ -- stream is at most twice as long (in bytes) as the input UTF-8
174
+ -- stream. To justify this assumption one has to assume that the
175
+ -- error handler replacement character also satisfies this
176
+ -- invariant, by emitting at most one UTF16 code unit.
177
+ --
178
+ -- One easy way to support the full range of code-points for
179
+ -- replacement characters in the error handler is to simply change
180
+ -- the (over-)allocation to `A.new (2*len)` and then shrink back the
181
+ -- `ByteArray#` to the real size (recent GHCs have a cheap
182
+ -- `ByteArray#` resize-primop for that which allow the GC to reclaim
183
+ -- the overallocation). However, this would require 4 times as much
184
+ -- (temporary) storage as the original UTF-8 required.
185
+ --
186
+ -- Another strategy would be to optimistically assume that
187
+ -- replacement characters are within the BMP, and if the case of a
188
+ -- non-BMP replacement occurs reallocate the target buffer (or throw
189
+ -- an exception, and fallback to a pessimistic codepath, like e.g.
190
+ -- `decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)`)
191
+ --
192
+ -- Alternatively, `OnDecodeError` could become a datastructure which
193
+ -- statically encodes the replacement-character range,
194
+ -- e.g. something isomorphic to
195
+ --
196
+ -- Either (... -> Maybe Word16) (... -> Maybe Char)
197
+ --
198
+ -- And allow to statically switch between the BMP/non-BMP
199
+ -- replacement-character codepaths. There's multiple ways to address
200
+ -- this with different tradeoffs; but ideally we should optimise for
201
+ -- the optimistic/error-free case.
159
202
{- INLINE[0] decodeUtf8With #-}
160
203
161
204
-- $stream
0 commit comments