Add size-aware 'union' functions

rockbmb · rockbmb · commit 2357b66594dc · 2017-10-08T04:28:37.000+01:00
diff --git a/Data/HashMap/Array.hs b/Data/HashMap/Array.hs
@@ -24,6 +24,7 @@ module Data.HashMap.Array
     , indexM
     , update
     , updateWith'
+    , updateWithInternal'
     , unsafeUpdateM
     , insert
     , insertM
@@ -32,6 +33,7 @@ module Data.HashMap.Array
     , unsafeFreeze
     , unsafeThaw
     , run
+    , runInternal
     , run2
     , copy
     , copyM
@@ -232,6 +234,13 @@ run :: (forall s . ST s (MArray s e)) -> Array e
 run act = runST $ act >>= unsafeFreeze
 {-# INLINE run #-}
 
+-- | Variant of 'run' for actions that also yield an 'Int': the mutable
+-- array is frozen with 'unsafeFreeze' and paired with that 'Int'.
+runInternal :: (forall s . ST s (Int, MArray s e)) -> (Int, Array e)
+runInternal act = runST $
+    act >>= \(n, marr) -> unsafeFreeze marr >>= \arr -> return (n, arr)
+{-# INLINE runInternal #-}
+
 run2 :: (forall s. ST s (MArray s e, a)) -> (Array e, a)
 run2 k = runST (do
                  (marr,b) <- k
@@ -297,6 +306,15 @@ updateWith' :: Array e -> Int -> (e -> e) -> Array e
 updateWith' ary idx f = update ary idx $! f (index ary idx)
 {-# INLINE updateWith' #-}
 
+-- | /O(n)/ Update the element at the given position in this array by
+-- applying a function that also yields an 'Int', returned next to the new
+-- array.  Both results are forced to WHNF before the element is inserted.
+updateWithInternal' :: Array e -> Int -> (e -> (Int, e)) -> (Int, Array e)
+updateWithInternal' ary idx f =
+    case f (index ary idx) of
+        (!sz, !e) -> (sz, update ary idx e)
+{-# INLINE updateWithInternal' #-}
+
 -- | /O(1)/ Update the element at the given position in this array,
 -- without copying.
 unsafeUpdateM :: Array e -> Int -> e -> ST s ()
diff --git a/Data/HashMap/Base.hs b/Data/HashMap/Base.hs
@@ -160,7 +160,7 @@ type role HashMap nominal representational
 
 -- | WIP. This will become the user-facing 'HashMap' after this PR is
 -- finalized.
-data HashMapW = HashMapW {-# UNPACK #-} !Int !HashMap
+data HashMapW k v = HashMapW {-# UNPACK #-} !Int !(HashMap k v) -- size, then the map itself
 
 instance (NFData k, NFData v) => NFData (HashMap k v) where
     rnf Empty                 = ()
@@ -1033,6 +1033,14 @@ union :: (Eq k, Hashable k) => HashMap k v -> HashMap k v -> HashMap k v
 union = unionWith const
 {-# INLINABLE union #-}
 
+-- | /O(n+m)/ Left-biased union of two maps: when a key is present in
+-- both, the value from the first map is kept.
+-- The result pairs the number of elements by which the first map
+-- grows with the merged map.
+unionInternal :: (Eq k, Hashable k) => HashMap k v -> HashMapW k v -> (Int, HashMap k v)
+unionInternal = unionWithInternal (\v1 _ -> v1)
+{-# INLINABLE unionInternal #-}
+
 -- | /O(n+m)/ The union of two maps.  If a key occurs in both maps,
 -- the provided function (first argument) will be used to compute the
 -- result.
@@ -1041,6 +1049,20 @@ unionWith :: (Eq k, Hashable k) => (v -> v -> v) -> HashMap k v -> HashMap k v
 unionWith f = unionWithKey (const f)
 {-# INLINE unionWith #-}
 
+-- | /O(n+m)/ Merge two maps, resolving keys present in both with the
+-- supplied combining function (applied to the first map's value, then
+-- the second's).
+-- The result pairs the growth in size of the first map with the merged
+-- map.
+unionWithInternal
+    :: (Eq k, Hashable k)
+    => (v -> v -> v)
+    -> HashMap k v
+    -> HashMapW k v
+    -> (Int, HashMap k v)
+unionWithInternal f = unionWithKeyInternal (\_ -> f)
+{-# INLINE unionWithInternal #-}
+
 -- | /O(n+m)/ The union of two maps.  If a key occurs in both maps,
 -- the provided function (first argument) will be used to compute the
 -- result.
@@ -1128,6 +1150,140 @@ unionWithKey f = go 0
         m2 = mask h2 s
 {-# INLINE unionWithKey #-}
 
+-- | /O(n+m)/ The union of two maps.  If a key occurs in both maps, the
+-- provided function (first argument) will be used to compute the result.
+-- Returns the merged map together with the number of elements it adds to
+-- the first map: the size stored in the 'HashMapW', less one for every
+-- key found in both maps.
+unionWithKeyInternal
+    :: forall k v . (Eq k, Hashable k)
+    => (k -> v -> v -> v)
+    -> HashMap k v
+    -> HashMapW k v
+    -> (Int, HashMap k v)
+unionWithKeyInternal f lhs (HashMapW size rhs) = go 0 size lhs rhs
+  where
+    go :: Int -> Int -> HashMap k v -> HashMap k v -> (Int, HashMap k v)
+    -- @go shift size t1 t2@; first case: empty vs. anything
+    go !_ !sz t1 Empty = (sz, t1)
+    go _ !sz Empty t2 = (sz, t2)
+    -- leaf vs. leaf
+    go s !sz t1@(Leaf h1 l1@(L k1 v1)) t2@(Leaf h2 l2@(L k2 v2))
+        | h1 == h2  = if k1 == k2
+                      then (sz - 1, Leaf h1 (L k1 (f k1 v1 v2)))
+                      else (sz, collision h1 l1 l2)
+        | otherwise = goDifferentHash sz s h1 h2 t1 t2 -- hashes differ: no shared key
+    go s !sz t1@(Leaf h1 (L k1 v1)) t2@(Collision h2 ls2)
+        | h1 == h2  =
+            let !start = A.length ls2
+                !newV = updateOrSnocWithKey f k1 v1 ls2
+                !end = A.length newV
+            in (sz + end - start - 1, Collision h1 newV)
+        | otherwise = goDifferentHash sz s h1 h2 t1 t2 -- hashes differ: no shared key
+    go s !sz t1@(Collision h1 ls1) t2@(Leaf h2 (L k2 v2))
+        | h1 == h2  =
+            let !start = A.length ls1
+                !newV = updateOrSnocWithKey (flip . f) k2 v2 ls1
+                !end = A.length newV
+            in (sz + end - start - 1, Collision h1 newV)
+        | otherwise = goDifferentHash sz s h1 h2 t1 t2 -- hashes differ: no shared key
+    go s !sz t1@(Collision h1 ls1) t2@(Collision h2 ls2)
+        | h1 == h2  =
+            let !start = A.length ls1
+                !newV = updateOrConcatWithKey f ls1 ls2
+                !end = A.length newV
+            in (sz + (end - start - A.length ls2), Collision h1 newV)
+        | otherwise = goDifferentHash sz s h1 h2 t1 t2
+    -- branch vs. branch
+    go s !sz (BitmapIndexed b1 ary1) (BitmapIndexed b2 ary2) =
+        let b'         = b1 .|. b2
+            (dsz, ary') =
+                unionArrayByInternal sz
+                                     (go (s+bitsPerSubkey))
+                                     b1
+                                     b2
+                                     ary1
+                                     ary2
+        in (dsz, bitmapIndexedOrFull b' ary')
+    go s !sz (BitmapIndexed b1 ary1) (Full ary2) =
+        let (dsz, ary') =
+                unionArrayByInternal sz
+                                     (go (s+bitsPerSubkey))
+                                     b1
+                                     fullNodeMask
+                                     ary1
+                                     ary2
+        in (dsz, Full ary')
+    go s !sz (Full ary1) (BitmapIndexed b2 ary2) =
+        let (dsz, ary') =
+                unionArrayByInternal sz
+                                     (go (s+bitsPerSubkey))
+                                     fullNodeMask
+                                     b2
+                                     ary1
+                                     ary2
+        in (dsz, Full ary')
+    go s !sz (Full ary1) (Full ary2) =
+        let (dsz, ary') =
+                unionArrayByInternal sz
+                                     (go (s+bitsPerSubkey))
+                                     fullNodeMask
+                                     fullNodeMask
+                                     ary1
+                                     ary2
+        in (dsz, Full ary')
+    -- leaf vs. branch
+    go s !sz (BitmapIndexed b1 ary1) t2
+        | b1 .&. m2 == 0 = let ary' = A.insert ary1 i t2
+                               b'   = b1 .|. m2
+                           in (sz, bitmapIndexedOrFull b' ary')
+        | otherwise      = let (dsz, ary') = A.updateWithInternal' ary1 i $ \st1 ->
+                                   go (s+bitsPerSubkey) sz st1 t2
+                           in (dsz, BitmapIndexed b1 ary')
+        where
+          h2 = leafHashCode t2
+          m2 = mask h2 s
+          i = sparseIndex b1 m2
+    go s !sz t1 (BitmapIndexed b2 ary2)
+        | b2 .&. m1 == 0 = let ary' = A.insert ary2 i $! t1
+                               b'   = b2 .|. m1
+                           in (sz, bitmapIndexedOrFull b' ary')
+        | otherwise      = let (dsz, ary') = A.updateWithInternal' ary2 i $ \st2 ->
+                                   go (s+bitsPerSubkey) sz t1 st2
+                           in (dsz, BitmapIndexed b2 ary')
+      where
+        h1 = leafHashCode t1
+        m1 = mask h1 s
+        i = sparseIndex b2 m1
+    go s !sz (Full ary1) t2 =
+        let h2   = leafHashCode t2
+            i    = index h2 s
+            (dsz, ary') =
+                update16WithInternal' ary1 i $ \st1 ->
+                    go (s+bitsPerSubkey) sz st1 t2
+        in (dsz, Full ary')
+    go s !sz t1 (Full ary2) =
+        let h1   = leafHashCode t1
+            i    = index h1 s
+            (dsz, ary') =
+                update16WithInternal' ary2 i $ \st2 ->
+                    go (s+bitsPerSubkey) sz t1 st2
+        in (dsz, Full ary')
+
+    leafHashCode (Leaf h _) = h
+    leafHashCode (Collision h _) = h
+    leafHashCode _ = error "leafHashCode"
+
+    goDifferentHash sz s h1 h2 t1 t2
+        | m1 == m2  = case go (s+bitsPerSubkey) sz t1 t2 of -- shift first, then size
+                        (!dsz, !hm) -> (dsz, BitmapIndexed m1 (A.singleton hm))
+        | m1 <  m2  = (sz, BitmapIndexed (m1 .|. m2) (A.pair t1 t2))
+        | otherwise = (sz, BitmapIndexed (m1 .|. m2) (A.pair t2 t1))
+      where
+        m1 = mask h1 s
+        m2 = mask h2 s
+{-# INLINE unionWithKeyInternal #-}
+
 -- | Strict in the result of @f@.
 unionArrayBy :: (a -> a -> a) -> Bitmap -> Bitmap -> A.Array a -> A.Array a
              -> A.Array a
@@ -1156,6 +1312,42 @@ unionArrayBy f b1 b2 ary1 ary2 = A.run $ do
     -- where we copy one array, and then update.
 {-# INLINE unionArrayBy #-}
 
+-- | Merge two bitmap-indexed arrays, threading an 'Int' through @f@.  Strict in the result of @f@.
+unionArrayByInternal
+    :: Int -- ^ initial counter, handed to the first call of @f@
+    -> (Int -> a -> a -> (Int, a)) -- ^ combines elements whose bit is set in both bitmaps; returns the updated counter
+    -> Bitmap -- ^ bits occupied by the first array
+    -> Bitmap -- ^ bits occupied by the second array
+    -> A.Array a -- ^ first array
+    -> A.Array a -- ^ second array
+    -> (Int, A.Array a) -- ^ final counter and merged array
+unionArrayByInternal size f b1 b2 ary1 ary2 = A.runInternal $ do
+    let b' = b1 .|. b2
+    mary <- A.new_ (popCount b')
+    -- iterate over nonzero bits of b1 .|. b2
+    -- it would be nice if we could shift m by more than 1 each time
+    let ba = b1 .&. b2
+--        go :: forall s . Int -> Int -> Int -> Int -> Bitmap -> ST s Int
+        go !sz !i !i1 !i2 !m
+            | m > b'        = return sz -- m is above every set bit of b': done
+            | b' .&. m == 0 = go sz i i1 i2 (m `unsafeShiftL` 1) -- bit set in neither bitmap
+            | ba .&. m /= 0 = do -- bit set in both: combine with f
+                let (!dsz, !hm) = f sz (A.index ary1 i1) (A.index ary2 i2)
+                A.write mary i hm
+                go dsz (i+1) (i1+1) (i2+1) (m `unsafeShiftL` 1)
+            | b1 .&. m /= 0 = do -- bit set only in b1: copy from ary1
+                A.write mary i =<< A.indexM ary1 i1
+                go sz (i+1) (i1+1) (i2  ) (m `unsafeShiftL` 1)
+            | otherwise     = do -- bit set only in b2: copy from ary2
+                A.write mary i =<< A.indexM ary2 i2
+                go sz (i+1) (i1  ) (i2+1) (m `unsafeShiftL` 1)
+    d <- go size 0 0 0 (b' .&. negate b') -- start at the lowest set bit. XXX: b' must be non-zero
+    return (d, mary)
+    -- TODO: For the case where b1 .&. b2 == b1, i.e. when one is a
+    -- subset of the other, we could use a slightly simpler algorithm,
+    -- where we copy one array, and then update.
+{-# INLINE unionArrayByInternal #-}
+
 -- TODO: Figure out the time complexity of 'unions'.
 
 -- | Construct a set containing all elements from a list of sets.
@@ -1679,6 +1871,13 @@ update16With' :: A.Array e -> Int -> (e -> e) -> A.Array e
 update16With' ary idx f = update16 ary idx $! f (A.index ary idx)
 {-# INLINE update16With' #-}
 
+-- | /O(n)/ Update the element at the given position in this array by applying @f@ to it; strict in both parts of @f@'s result.
+update16WithInternal' :: A.Array e -> Int -> (e -> (Int, e)) -> (Int, A.Array e)
+update16WithInternal' ary idx f =
+    case f $! A.index ary idx of
+        (!s, !x) -> (s, update16 ary idx x)
+{-# INLINE update16WithInternal' #-}
+
 -- | Unsafely clone an array of 16 elements.  The length of the input
 -- array is not checked.
 clone16 :: A.Array e -> ST s (A.MArray s e)