|
| 1 | +{-# LANGUAGE UnboxedTuples, BangPatterns #-} |
| 2 | +-- | JSON Canonicalization Scheme https://datatracker.ietf.org/doc/html/rfc8785 |
| 3 | +module Data.Aeson.RFC8785 ( |
| 4 | + encodeCanonical, |
| 5 | +) where |
| 6 | + |
| 7 | +import Data.List (sortBy) |
| 8 | +import Data.Ord (comparing) |
| 9 | +import GHC.Integer (quotRemInteger) |
| 10 | +import Math.NumberTheory.Logarithms (integerLog10) |
| 11 | + |
| 12 | +import Data.Aeson |
| 13 | +import Data.Aeson.Encoding |
| 14 | +import Data.Aeson.Encoding.Internal |
| 15 | +import Data.Aeson.Internal.Prelude |
| 16 | +import Data.Aeson.Internal.Word8 |
| 17 | + |
| 18 | +import qualified Data.Aeson.Key as Key |
| 19 | +import qualified Data.Aeson.KeyMap as KM |
| 20 | +import qualified Data.ByteString as BS |
| 21 | +import qualified Data.ByteString.Builder as B |
| 22 | +import qualified Data.ByteString.Builder.Prim as BP |
| 23 | +import qualified Data.ByteString.Lazy as LBS |
| 24 | +import qualified Data.Scientific as Sci |
| 25 | +import qualified Data.Text.Encoding as TE |
| 26 | +import qualified Data.Vector as V |
| 27 | + |
| 28 | +-- $setup |
| 29 | +-- >>> import Data.Aeson |
| 30 | + |
| 31 | +-- | Encode to JSON according to RFC 8785 canonicalization scheme. |
| 32 | +-- https://datatracker.ietf.org/doc/html/rfc8785 |
| 33 | +-- |
| 34 | +-- 'encodeCanonical' uses 'toJSON' to produce intermediate 'Value', |
| 35 | +-- as 'toEncoding' may (and most likely) produces non-canonical JSON. |
| 36 | +-- |
| 37 | +-- Note: @decode (encodeCanonical v) === Just v@ for all @v :: Value@, |
| 38 | +-- i.e. 'encodeCanonical' doesn't lose any information. |
| 39 | +-- |
| 40 | +-- However, the example in RFC8785 /loses/ information as the intermediate |
| 41 | +-- number representation is 'Double', also current @toJSON :: Double -> Value@ |
| 42 | +-- sometimes produces too precise values. For example |
| 43 | +-- |
| 44 | +-- >>> toJSON (1e23 :: Double) |
| 45 | +-- Number 9.999999999999999e22 |
| 46 | +-- |
| 47 | +-- 'show' also behaves the same: |
| 48 | +-- |
| 49 | +-- >>> 1e23 :: Double |
| 50 | +-- 9.999999999999999e22 |
| 51 | +-- |
| 52 | +-- Note: RFC8785 is __not the same scheme__ as used in |
| 53 | +-- [canonical-json](https://hackage.haskell.org/package/canonical-json) package |
| 54 | +-- (https://wiki.laptop.org/go/Canonical_JSON). |
| 55 | +-- That scheme produces /invalid/ JSON (e.g. control characters encoded as is, not escaped) |
| 56 | +-- and cannot encode non-integral numbers. |
| 57 | +-- |
| 58 | +-- @since 2.2.1.0 |
| 59 | +-- |
| 60 | +encodeCanonical :: ToJSON a => a -> LBS.ByteString |
| 61 | +encodeCanonical = encodingToLazyByteString . toCanonical . toJSON |
| 62 | + |
| 63 | +toCanonical :: Value -> Encoding |
| 64 | +toCanonical Null = null_ |
| 65 | +toCanonical (Bool b) = bool b |
| 66 | +toCanonical (Number n) = canonicalNumber n |
| 67 | +toCanonical (String s) = canonicalString s |
| 68 | +toCanonical (Array v) = list toCanonical (V.toList v) |
| 69 | +toCanonical (Object m) = dict (canonicalString . Key.toText) toCanonical ifr $ |
| 70 | + sortBy (\(k1, _) (k2, _) -> propertyCmp k1 k2) (KM.toList m) |
| 71 | + |
| 72 | +ifr :: (k -> v -> a -> a) -> a -> [(k, v)] -> a |
| 73 | +ifr f z = foldr (\(k, v) -> f k v) z |
| 74 | +{-# INLINE ifr #-} |
| 75 | + |
| 76 | +-- Property name strings to be sorted are formatted as arrays of UTF-16 code units. |
| 77 | +propertyCmp :: Key -> Key -> Ordering |
| 78 | +propertyCmp = comparing f where |
| 79 | + -- this is slow implementation, but it's obviously not wrong. |
| 80 | + f :: Key -> BS.ByteString |
| 81 | + f = TE.encodeUtf16BE . Key.toText |
| 82 | + |
| 83 | +-- strings are already serialized canonically. |
| 84 | +canonicalString :: Text -> Encoding' a |
| 85 | +canonicalString = text |
| 86 | + |
| 87 | +-- RFC 8785 is outsourcing number format to ECMA-262. |
| 88 | +-- 10th edition, 7.1.12.1 NumberToString |
| 89 | +-- https://262.ecma-international.org/10.0/#sec-tostring-applied-to-the-number-type |
| 90 | +-- |
| 91 | +-- Note: this specification is not lossy |
| 92 | +-- Given 'Scientific' we can choose n,k,s uniquely: 'nks'. |
| 93 | +-- |
| 94 | +-- RFC8785 Appendix D says "don't use bignums". |
| 95 | +canonicalNumber :: Scientific -> Encoding |
| 96 | +canonicalNumber m = case compare m 0 of |
| 97 | + EQ -> Encoding (B.word8 W8_0) |
| 98 | + LT -> Encoding (B.word8 W8_MINUS <> fromEncoding (canonicalNumber' (negate m))) |
| 99 | + GT -> canonicalNumber' m |
| 100 | + |
| 101 | +-- input: Positive number |
| 102 | +canonicalNumber' :: Scientific -> Encoding |
| 103 | +canonicalNumber' m |
| 104 | + | k <= n, n <= 21 |
| 105 | + = Encoding $ |
| 106 | + BP.primMapListFixed BP.word8 ds <> |
| 107 | + BP.primMapListFixed BP.word8 (replicate (n - k) W8_0) |
| 108 | + |
| 109 | + | 0 < n, n <= 21 |
| 110 | + , let (pfx, sfx) = splitAt n ds |
| 111 | + = Encoding $ |
| 112 | + BP.primMapListFixed BP.word8 pfx <> |
| 113 | + B.word8 W8_DOT <> |
| 114 | + BP.primMapListFixed BP.word8 sfx |
| 115 | + |
| 116 | + | -6 < n, n <= 0 |
| 117 | + = Encoding $ |
| 118 | + B.word8 W8_0 <> |
| 119 | + B.word8 W8_DOT <> |
| 120 | + BP.primMapListFixed BP.word8 (replicate (negate n) W8_0) <> |
| 121 | + BP.primMapListFixed BP.word8 ds |
| 122 | + |
| 123 | + | k == 1, [d] <- ds |
| 124 | + = Encoding $ |
| 125 | + B.word8 d <> |
| 126 | + B.word8 W8_e <> |
| 127 | + B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <> |
| 128 | + BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) |
| 129 | + |
| 130 | + | (d:ds') <- ds |
| 131 | + = Encoding $ |
| 132 | + B.word8 d <> |
| 133 | + B.word8 W8_DOT <> |
| 134 | + BP.primMapListFixed BP.word8 ds' <> |
| 135 | + B.word8 W8_e <> |
| 136 | + B.word8 (if (n - 1) >= 0 then W8_PLUS else W8_MINUS) <> |
| 137 | + BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) |
| 138 | + |
| 139 | + | otherwise |
| 140 | + = string "0" -- shouldn't happen, but we need a default case. |
| 141 | + |
| 142 | + where |
| 143 | + -- 5. Otherwise, let n, k, and s be integers such that |
| 144 | + -- k ≥ 1, 10k - 1 ≤ s < 10k, the Number value for s × 10n - k is m, |
| 145 | + -- and k is as small as possible. |
| 146 | + -- Note that k is the number of digits in the decimal representation of s, |
| 147 | + -- that s is not divisible by 10, and that the least significant digit of s |
| 148 | + -- is not necessarily uniquely determined by these criteria. |
| 149 | + (n, k, s) = nks m |
| 150 | + ds = integerToDecimalDigits s |
| 151 | + |
| 152 | +-- 5. Otherwise, let n, k, and s be integers such that k ≥ 1, 10^(k - 1) ≤ s < 10^k, |
| 153 | +-- the Number value for s × 10^(n - k) is m, and k is as small as possible. |
| 154 | +-- Note that k is the number of digits in the decimal representation of s, |
| 155 | +-- that s is not divisible by 10, and that the least significant digit of s |
| 156 | +-- is not necessarily uniquely determined by these criteria. |
| 157 | +nks :: Scientific -> (Int, Int, Integer) |
| 158 | +nks m = (e + k, k, c) |
| 159 | + where |
| 160 | + m' = Sci.normalize m |
| 161 | + c = Sci.coefficient m' |
| 162 | + e = Sci.base10Exponent m' |
| 163 | + k = integerLog10 c + 1 |
| 164 | + |
| 165 | +integerToDecimalDigits :: Integer -> [Word8] |
| 166 | +integerToDecimalDigits = go [] where |
| 167 | + go acc 0 = acc |
| 168 | + go acc i = case quotRemInteger i 10 of |
| 169 | + (# q, r #) -> go (d:acc) q where !d = fromIntegral r + W8_0 |
0 commit comments