Improve isSubsetOf (#615)

treeowl · web-flow · commit 200d7ae71315 · 2019-04-09T07:03:18.000-04:00
* Add recursive size tests to `Data.Set.isSubsetOf`. * Add a special case for singleton subsets to avoid extra splits at all the leaves. * Do the same for `isSubmapOf`. * Add the singleton special case to `disjoint`. * Tighten advertised bounds and improve documentation. Closes #614
diff --git a/Data/Map/Internal.hs b/Data/Map/Internal.hs
@@ -2727,22 +2727,33 @@ isSubmapOf m1 m2 = isSubmapOfBy (==) m1 m2
  > isSubmapOfBy (<)  (fromList [('a',1)]) (fromList [('a',1),('b',2)])
  > isSubmapOfBy (==) (fromList [('a',1),('b',2)]) (fromList [('a',1)])
 
+ Note that @isSubmapOfBy (\_ _ -> True) m1 m2@ tests whether all the keys
+ in @m1@ are also keys in @m2@.
 
 -}
 isSubmapOfBy :: Ord k => (a->b->Bool) -> Map k a -> Map k b -> Bool
 isSubmapOfBy f t1 t2
-  = (size t1 <= size t2) && (submap' f t1 t2)
+  = size t1 <= size t2 && submap' f t1 t2
 #if __GLASGOW_HASKELL__
 {-# INLINABLE isSubmapOfBy #-}
 #endif
 
+-- Test whether a map is a submap of another without the *initial*
+-- size test. See Data.Set.Internal.isSubsetOfX for notes on
+-- implementation and analysis.
 submap' :: Ord a => (b -> c -> Bool) -> Map a b -> Map a c -> Bool
 submap' _ Tip _ = True
 submap' _ _ Tip = False
+submap' f (Bin 1 kx x _ _) t
+  = case lookup kx t of
+      Just y -> f x y
+      Nothing -> False
 submap' f (Bin _ kx x l r) t
   = case found of
       Nothing -> False
-      Just y  -> f x y && submap' f l lt && submap' f r gt
+      Just y  -> f x y
+                 && size l <= size lt && size r <= size gt
+                 && submap' f l lt && submap' f r gt
   where
     (lt,found,gt) = splitLookup kx t
 #if __GLASGOW_HASKELL__
@@ -2778,7 +2789,7 @@ isProperSubmapOf m1 m2
 -}
 isProperSubmapOfBy :: Ord k => (a -> b -> Bool) -> Map k a -> Map k b -> Bool
 isProperSubmapOfBy f t1 t2
-  = (size t1 < size t2) && (submap' f t1 t2)
+  = size t1 < size t2 && submap' f t1 t2
 #if __GLASGOW_HASKELL__
 {-# INLINABLE isProperSubmapOfBy #-}
 #endif
diff --git a/Data/Set/Internal.hs b/Data/Set/Internal.hs
@@ -596,29 +596,67 @@ delete = go
 {--------------------------------------------------------------------
   Subset
 --------------------------------------------------------------------}
--- | /O(n+m)/. Is this a proper subset? (ie. a subset but not equal).
+-- | /O(m*log(n\/m + 1)), m <= n/.
+-- @(s1 \`isProperSubsetOf\` s2)@ indicates whether @s1@ is a
+-- proper subset of @s2@.
+--
+-- @
+-- s1 \`isProperSubsetOf\` s2 = s1 ``isSubsetOf`` s2 && s1 /= s2
+-- @
 isProperSubsetOf :: Ord a => Set a -> Set a -> Bool
 isProperSubsetOf s1 s2
-    = (size s1 < size s2) && (isSubsetOf s1 s2)
+    = size s1 < size s2 && isSubsetOfX s1 s2
 #if __GLASGOW_HASKELL__
 {-# INLINABLE isProperSubsetOf #-}
 #endif
 
 
--- | /O(n+m)/. Is this a subset?
--- @(s1 \`isSubsetOf\` s2)@ tells whether @s1@ is a subset of @s2@.
+-- | /O(m*log(n\/m + 1)), m <= n/.
+-- @(s1 \`isSubsetOf\` s2)@ indicates whether @s1@ is a subset of @s2@.
+--
+-- @
+-- s1 \`isSubsetOf\` s2 = all (``member`` s2) s1
+-- s1 \`isSubsetOf\` s2 = null (s1 ``difference`` s2)
+-- s1 \`isSubsetOf\` s2 = s1 ``union`` s2 == s2
+-- s1 \`isSubsetOf\` s2 = s1 ``intersection`` s2 == s1
+-- @
 isSubsetOf :: Ord a => Set a -> Set a -> Bool
 isSubsetOf t1 t2
-  = (size t1 <= size t2) && (isSubsetOfX t1 t2)
+  = size t1 <= size t2 && isSubsetOfX t1 t2
 #if __GLASGOW_HASKELL__
 {-# INLINABLE isSubsetOf #-}
 #endif
 
+-- Test whether a set is a subset of another without the *initial*
+-- size test.
+--
+-- This function is structured very much like `difference`, `union`,
+-- and `intersection`. Whereas the bounds proofs for those in Blelloch
+-- et al needed to accound for both "split work" and "merge work", we
+-- only have to worry about split work here, which is the same as in
+-- those functions.
 isSubsetOfX :: Ord a => Set a -> Set a -> Bool
 isSubsetOfX Tip _ = True
 isSubsetOfX _ Tip = False
+-- Skip the final split when we hit a singleton.
+isSubsetOfX (Bin 1 x _ _) t = member x t
 isSubsetOfX (Bin _ x l r) t
-  = found && isSubsetOfX l lt && isSubsetOfX r gt
+  = found &&
+    -- Cheap size checks can sometimes save expensive recursive calls when the
+    -- result will be False. Suppose we check whether [1..10] (with root 4) is
+    -- a subset of [0..9]. After the first split, we have to check if [1..3] is
+    -- a subset of [0..3] and if [5..10] is a subset of [5..9]. But we can bail
+    -- immediately because size [5..10] > size [5..9].
+    --
+    -- Why not just call `isSubsetOf` on each side to do the size checks?
+    -- Because that could make a recursive call on the left even though the
+    -- size check would fail on the right. In principle, we could take this to
+    -- extremes by maintaining a queue of pairs of sets to be checked, working
+    -- through the tree level-wise. But that would impose higher administrative
+    -- costs without obvious benefits. It might be worth considering if we find
+    -- a way to use it to tighten the bounds in some useful/comprehensible way.
+    size l <= size lt && size r <= size gt &&
+    isSubsetOfX l lt && isSubsetOfX r gt
   where
     (lt,found,gt) = splitMember x t
 #if __GLASGOW_HASKELL__
@@ -628,19 +666,25 @@ isSubsetOfX (Bin _ x l r) t
 {--------------------------------------------------------------------
   Disjoint
 --------------------------------------------------------------------}
--- | /O(n+m)/. Check whether two sets are disjoint (i.e. their intersection
---   is empty).
+-- | /O(m*log(n\/m + 1)), m <= n/. Check whether two sets are disjoint
+-- (i.e., their intersection is empty).
 --
 -- > disjoint (fromList [2,4,6])   (fromList [1,3])     == True
 -- > disjoint (fromList [2,4,6,8]) (fromList [2,3,5,7]) == False
 -- > disjoint (fromList [1,2])     (fromList [1,2,3,4]) == False
 -- > disjoint (fromList [])        (fromList [])        == True
 --
+-- @
+-- xs ``disjoint`` ys = null (xs ``intersection`` ys)
+-- @
+--
 -- @since 0.5.11
 
 disjoint :: Ord a => Set a -> Set a -> Bool
 disjoint Tip _ = True
 disjoint _ Tip = True
+-- Avoid a split for the singleton case.
+disjoint (Bin 1 x _ _) t = x `notMember` t
 disjoint (Bin _ x l r) t
   -- Analogous implementation to `subsetOfX`
   = not found && disjoint l lt && disjoint r gt