1
1
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
2
+ {-# LANGUAGE MagicHash #-}
3
+ {-# LANGUAGE UnliftedFFITypes #-}
2
4
3
5
-- |
4
6
-- Module : Data.Text.Lazy.Search
@@ -27,9 +29,14 @@ import qualified Data.Text.Array as A
27
29
import Data.Int (Int64 )
28
30
import Data.Word (Word8 , Word64 )
29
31
import qualified Data.Text.Internal as T
32
+ import qualified Data.Text as T (concat )
30
33
import Data.Text.Internal.Fusion.Types (PairS (.. ))
31
- import Data.Text.Internal.Lazy (Text (.. ), foldlChunks )
34
+ import Data.Text.Internal.Lazy (Text (.. ), foldrChunks , equal )
35
+ import Data.Text.Unsafe (unsafeDupablePerformIO )
32
36
import Data.Bits ((.|.) , (.&.) )
37
+ import Foreign.C.Types
38
+ import GHC.Exts (ByteArray #)
39
+ import System.Posix.Types (CSsize (.. ))
33
40
34
41
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
35
42
-- @needle@ within @haystack@.
@@ -42,110 +49,90 @@ import Data.Bits ((.|.), (.&.))
42
49
indices :: Text -- ^ Substring to search for (@needle@)
43
50
-> Text -- ^ Text to search in (@haystack@)
44
51
-> [Int64 ]
45
- indices needle@ ( Chunk n ns) _haystack @ ( Chunk k ks)
46
- | nlen <= 0 = []
47
- | nlen == 1 = indicesOne (nindex 0 ) 0 k ks
48
- | otherwise = advance k ks 0 0
52
+ indices needle
53
+ | nlen <= 0 = const []
54
+ | nlen == 1 = indicesOne (A. unsafeIndex narr noff ) 0
55
+ | otherwise = advance 0 0
49
56
where
50
- advance x@ (T. Text _ _ l) xs = scan
51
- where
52
- scan ! g ! i
53
- | i >= m = case xs of
54
- Empty -> []
55
- Chunk y ys -> advance y ys g (i- m)
57
+ T. Text narr noff nlen = T. concat (foldrChunks (:) [] needle)
58
+
59
+ advance ! _ ! _ Empty = []
60
+ advance ! (g :: Int64 ) ! (i :: Int ) xxs@ (Chunk x@ (T. Text xarr@ (A. ByteArray xarr# ) xoff l) xs)
61
+ | i >= l = advance g (i - l) xs
56
62
| lackingHay (i + nlen) x xs = []
57
- | c == z && candidateMatch 0 = g : scan (g+ nlen) (i+ nlen)
58
- | otherwise = scan (g+ delta) (i+ delta)
63
+ | c == z && candidateMatch = g : advance (g + intToInt64 nlen) (i + nlen) xxs
64
+ | otherwise = advance (g + intToInt64 delta) (i + delta) xxs
59
65
where
60
- m = intToInt64 l
61
- c = hindex (i + nlast)
66
+ c = index xxs (i + nlast)
62
67
delta | nextInPattern = nlen + 1
63
68
| c == z = skip + 1
64
- | otherwise = 1
65
- nextInPattern = mask .&. swizzle (hindex (i+ nlen)) == 0
66
- candidateMatch ! j
67
- | j >= nlast = True
68
- | hindex (i+ j) /= nindex j = False
69
- | otherwise = candidateMatch (j+ 1 )
70
- hindex = index x xs
71
- nlen = wordLength needle
69
+ | l >= i + nlen = case unsafeDupablePerformIO $
70
+ memchr xarr# (intToCSize (xoff + i + nlen)) (intToCSize (l - i - nlen)) z of
71
+ - 1 -> max 1 (l - i - nlen)
72
+ s -> cSsizeToInt s + 1
73
+ | otherwise = 1
74
+ nextInPattern = mask .&. swizzle (index xxs (i + nlen)) == 0
75
+
76
+ candidateMatch
77
+ | i + nlen <= l = A. equal narr noff xarr (xoff + i) nlen
78
+ | otherwise = A. equal narr noff xarr (xoff + i) (l - i) &&
79
+ Chunk (T. Text narr (noff + l - i) (nlen - l + i)) Empty `equal` xs
80
+
72
81
nlast = nlen - 1
73
- nindex = index n ns
74
- z = foldlChunks fin 0 needle
75
- where fin _ (T. Text farr foff flen) = A. unsafeIndex farr (foff+ flen- 1 )
76
- (mask :: Word64 ) :*: skip = buildTable n ns 0 0 0 (nlen- 2 )
82
+ z = A. unsafeIndex narr (noff + nlen - 1 )
83
+ (mask :: Word64 ) :*: skip = buildTable 0 0 0 (nlen- 2 )
77
84
78
85
swizzle :: Word8 -> Word64
79
86
swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f )
80
87
81
- buildTable (T. Text xarr xoff xlen) xs = go
82
- where
83
- go ! (g:: Int64 ) ! i ! msk ! skp
84
- | i >= xlast = case xs of
85
- Empty -> (msk .|. swizzle z) :*: skp
86
- Chunk y ys -> buildTable y ys g 0 msk' skp'
87
- | otherwise = go (g+ 1 ) (i+ 1 ) msk' skp'
88
- where c = A. unsafeIndex xarr (xoff+ i)
88
+ buildTable ! g ! i ! msk ! skp
89
+ | i >= nlast = (msk .|. swizzle z) :*: skp
90
+ | otherwise = buildTable (g+ 1 ) (i+ 1 ) msk' skp'
91
+ where c = A. unsafeIndex narr (noff+ i)
89
92
msk' = msk .|. swizzle c
90
93
skp' | c == z = nlen - g - 2
91
94
| otherwise = skp
92
- xlast = xlen - 1
93
95
94
96
-- | Check whether an attempt to index into the haystack at the
95
97
-- given offset would fail.
96
- lackingHay :: Int64 -> T. Text -> Text -> Bool
97
- lackingHay q = go 0
98
- where
99
- go p (T. Text _ _ l) ps = p' < q && case ps of
100
- Empty -> True
101
- Chunk r rs -> go p' r rs
102
- where p' = p + intToInt64 l
103
- indices _ _ = []
98
+ lackingHay :: Int -> T. Text -> Text -> Bool
99
+ lackingHay q (T. Text _ _ l) ps = l < q && case ps of
100
+ Empty -> True
101
+ Chunk r rs -> lackingHay (q - l) r rs
104
102
105
103
-- | Fast index into a partly unpacked 'Text'. We take into account
106
104
-- the possibility that the caller might try to access one element
107
105
-- past the end.
108
- index :: T. Text -> Text -> Int64 -> Word8
109
- index (T. Text arr off len) xs ! i
110
- | j < len = A. unsafeIndex arr (off+ j)
111
- | otherwise = case xs of
112
- Empty
113
- -- out of bounds, but legal
114
- | j == len -> 0
115
- -- should never happen, due to lackingHay above
116
- | otherwise -> emptyError " index"
117
- Chunk c cs -> index c cs (i- intToInt64 len)
118
- where j = int64ToInt i
106
+ index :: Text -> Int -> Word8
107
+ index Empty ! _ = 0
108
+ index (Chunk (T. Text arr off len) xs) ! i
109
+ | i < len = A. unsafeIndex arr (off + i)
110
+ | otherwise = index xs (i - len)
119
111
120
112
-- | A variant of 'indices' that scans linearly for a single 'Word8'.
121
- indicesOne :: Word8 -> Int64 -> T. Text -> Text -> [Int64 ]
113
+ indicesOne :: Word8 -> Int64 -> Text -> [Int64 ]
122
114
indicesOne c = chunk
123
115
where
124
- chunk :: Int64 -> T. Text -> Text -> [Int64 ]
125
- chunk ! i (T. Text oarr ooff olen) os = go 0
116
+ chunk :: Int64 -> Text -> [Int64 ]
117
+ chunk ! _ Empty = []
118
+ chunk ! i (Chunk (T. Text oarr ooff olen) os) = go 0
126
119
where
127
- go h | h >= olen = case os of
128
- Empty -> []
129
- Chunk y ys -> chunk (i+ intToInt64 olen) y ys
120
+ go h | h >= olen = chunk (i+ intToInt64 olen) os
130
121
| on == c = i + intToInt64 h : go (h+ 1 )
131
122
| otherwise = go (h+ 1 )
132
123
where on = A. unsafeIndex oarr (ooff+ h)
133
124
134
- -- | The number of 'Word8' values in a 'Text'.
135
- wordLength :: Text -> Int64
136
- wordLength = foldlChunks sumLength 0
137
- where
138
- sumLength :: Int64 -> T. Text -> Int64
139
- sumLength i (T. Text _ _ l) = i + intToInt64 l
140
-
141
- emptyError :: String -> a
142
- emptyError fun = error (" Data.Text.Lazy.Search." ++ fun ++ " : empty input" )
143
-
144
125
intToInt64 :: Int -> Int64
145
126
intToInt64 = fromIntegral
146
127
147
- int64ToInt :: Int64 -> Int
148
- int64ToInt = fromIntegral
149
-
150
128
word8ToInt :: Word8 -> Int
151
129
word8ToInt = fromIntegral
130
+
131
+ intToCSize :: Int -> CSize
132
+ intToCSize = fromIntegral
133
+
134
+ cSsizeToInt :: CSsize -> Int
135
+ cSsizeToInt = fromIntegral
136
+
137
+ foreign import ccall unsafe " _hs_text_memchr" memchr
138
+ :: ByteArray # -> CSize -> CSize -> Word8 -> IO CSsize
0 commit comments