Skip to content

Commit 470b13d

Browse files
committed
Parsing example
1 parent b58955e commit 470b13d

File tree

3 files changed

+85
-29
lines changed

3 files changed

+85
-29
lines changed

README.md

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,8 @@
55
succinct data-structures to allow traversal of large XML
66
strings with minimal memory overhead.
77

8-
For an example, see app/Main.hs
8+
For an example, see [app/Main.hs](../master/app/Main.hs)
99

10-
```
11-
benchmarking XmlBig/Run blankXml
12-
time 2.212 s (1.971 s .. 2.652 s)
13-
0.996 R² (0.989 R² .. 1.000 R²)
14-
mean 2.138 s (2.073 s .. 2.186 s)
15-
std dev 73.37 ms (0.0 s .. 83.51 ms)
16-
variance introduced by outliers: 19% (moderately inflated)
17-
18-
benchmarking XmlBig/Run xmlToInterestBits3
19-
time 2.497 s (2.449 s .. 2.531 s)
20-
1.000 R² (1.000 R² .. 1.000 R²)
21-
mean 2.531 s (2.515 s .. 2.540 s)
22-
std dev 13.90 ms (0.0 s .. 14.76 ms)
23-
variance introduced by outliers: 19% (moderately inflated)
24-
25-
benchmarking XmlBig/loadXml
26-
time 2.768 s (2.698 s .. 2.857 s)
27-
1.000 R² (1.000 R² .. 1.000 R²)
28-
mean 2.780 s (2.767 s .. 2.790 s)
29-
std dev 15.40 ms (0.0 s .. 17.48 ms)
30-
variance introduced by outliers: 19% (moderately inflated)
31-
```
10+
# Notes
11+
* [Semi-Indexing Semi-Structured Data in Tiny Space](http://www.di.unipi.it/~ottavian/files/semi_index_cikm.pdf)
12+
* [Space-Efficient, High-Performance Rank & Select Structures on Uncompressed Bit Sequences](https://www.cs.cmu.edu/~dga/papers/zhou-sea2013.pdf)

app/Main.hs

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1-
{-# LANGUAGE BangPatterns #-}
2-
{-# LANGUAGE ScopedTypeVariables #-}
1+
{-# LANGUAGE BangPatterns #-}
2+
{-# LANGUAGE FlexibleInstances #-}
3+
{-# LANGUAGE ScopedTypeVariables #-}
4+
{-# LANGUAGE TypeSynonymInstances #-}
35

46
module Main where
57

8+
import Data.Foldable
9+
import Data.Maybe
610
import Data.Semigroup ((<>))
711
import Data.Word
812
import HaskellWorks.Data.BalancedParens.RangeMinMax2
@@ -11,37 +15,98 @@ import HaskellWorks.Data.Bits.BitShown
1115
import HaskellWorks.Data.FromByteString
1216
import HaskellWorks.Data.RankSelect.CsPoppy
1317
import HaskellWorks.Data.TreeCursor
18+
import HaskellWorks.Data.Xml.Decode
19+
import HaskellWorks.Data.Xml.DecodeResult
20+
import HaskellWorks.Data.Xml.RawDecode
1421
import HaskellWorks.Data.Xml.RawValue
1522
import HaskellWorks.Data.Xml.Succinct.Cursor
1623
import HaskellWorks.Data.Xml.Succinct.Index
24+
import HaskellWorks.Data.Xml.Value
1725

1826
import qualified Data.ByteString as BS
1927
import qualified Data.Vector.Storable as DVS
2028

2129
-- | Cursor over a strict 'BS.ByteString' whose two index components are
-- plain 'Word64' storable vectors, wrapped in 'BitShown' and
-- 'SimpleBalancedParens' respectively. This is the cursor type that
-- 'readRawCursor' builds directly from the file contents.
type RawCursor = XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))
22-
type CsCursor = XmlCursor BS.ByteString CsPoppy (RangeMinMax2 CsPoppy)
30+
-- | Cursor over a strict 'BS.ByteString' whose index components are a
-- 'CsPoppy' and a 'RangeMinMax2' built over a 'CsPoppy'. This is the
-- query-optimised cursor type produced by 'readFastCursor'.
type FastCursor = XmlCursor BS.ByteString CsPoppy (RangeMinMax2 CsPoppy)
2331

32+
-- | Load the XML file at @path@ into memory and build a 'RawCursor' over it.
--
-- The file is read strictly, and the cursor is forced to weak head normal
-- form before it is returned, so the conversion from bytes to cursor has
-- started by the time the action completes.
readRawCursor :: String -> IO RawCursor
readRawCursor path = do
  !contents <- BS.readFile path
  -- `$!` forces the freshly built cursor before handing it back.
  return $! (fromByteString contents :: RawCursor)
2939

30-
readCsCursor :: String -> IO CsCursor
31-
readCsCursor filename = do
40+
-- | Load the XML file @filename@ into memory and build a query-optimised
-- 'FastCursor' positioned at the start of the document.
--
-- The file is first loaded through 'readRawCursor'; the two index vectors of
-- the raw cursor are then re-wrapped in the 'CsPoppy' / 'RangeMinMax2'
-- structures that 'FastCursor' expects.
readFastCursor :: String -> IO FastCursor
readFastCursor filename = do
  -- Take the raw cursor apart: the XML bytes plus its two index vectors.
  -- The index vectors could be persisted to an index file so that a later
  -- run does not have to re-parse the XML.
  -- The fourth field of the raw cursor is discarded and replaced below.
  XmlCursor !xmlText (BitShown !interestBits) (SimpleBalancedParens !parens) _ <- readRawCursor filename
  let !parensPoppy   = makeCsPoppy parens
      !parensIndex   = mkRangeMinMax2 parensPoppy
      !interestPoppy = makeCsPoppy interestBits
  -- NOTE(review): the literal 1 presumably places the cursor at the start of
  -- the document -- confirm against the XmlCursor definition.
  return $ XmlCursor xmlText interestPoppy parensIndex 1
3753

54+
-- | Types that can be decoded from the text of an XML node.
class ParseText a where
  -- | Decode the given XML 'Value' as text, yielding a 'DecodeResult'.
  parseText :: Value -> DecodeResult a
57+
58+
-- | Extract the text of a node as a 'String'.
--
-- Text and CDATA nodes yield their content. An element yields the
-- concatenation of the text of its children, in order, where a child whose
-- decode result holds no value contributes nothing. Every other kind of
-- value yields the empty string. The result is always 'DecodeOk'.
instance ParseText String where
  parseText value = case value of
    XmlText text            -> DecodeOk text
    XmlCData text           -> DecodeOk text
    XmlElement _ _ children -> DecodeOk $ concatMap (concat . toList . parseText) children
    _                       -> DecodeOk ""
63+
64+
-- | Turn a 'DecodeResult' into a 'Maybe': a successful decode becomes 'Just'
-- the decoded value, and any other result becomes 'Nothing'.
decodeResultToMaybe :: DecodeResult a -> Maybe a
decodeResultToMaybe result = case result of
  DecodeOk value -> Just value
  _              -> Nothing
68+
69+
-- | Document model for a single plant. The model does not have to represent
-- everything in the XML document; keeping it small may improve query
-- performance.
data Plant = Plant
  { common :: String -- ^ Text of the @common@ child element (see 'decodePlant').
  , price  :: String -- ^ Text of the @price@ child element, kept as a string.
  } deriving (Eq, Show)
76+
77+
-- | Document model for the catalog: a wrapper around the list of plants
-- it contains.
newtype Catalog = Catalog
  { plants :: [Plant] -- ^ The plants decoded from the catalog (see 'decodeCatalog').
  } deriving (Eq, Show)
80+
81+
-- | Decode a plant element into a 'Plant'.
--
-- The @common@ and @price@ children are looked up with '/>' and their text
-- is extracted with 'parseText'; if either step fails, the whole decode fails.
decodePlant :: Value -> DecodeResult Plant
decodePlant xml = Plant
  <$> (xml /> "common" >>= parseText)
  <*> (xml /> "price"  >>= parseText)
87+
88+
-- | Decode a catalog element into a 'Catalog'.
--
-- The @plant@ values are collected with '/>>' and each one is decoded with
-- 'decodePlant'. Plants that fail to decode are skipped rather than failing
-- the whole catalog; only a failure of the '/>>' lookup itself is propagated.
decodeCatalog :: Value -> DecodeResult Catalog
decodeCatalog xml =
  Catalog . mapMaybe (decodeResultToMaybe . decodePlant) <$> (xml />> "plant")
94+
3895
-- | Example entry point: load @data/catalog.xml@, print a preview of the raw
-- root value, then decode it into a 'Catalog' and print the result.
main :: IO ()
main = do
  -- Read the XML into memory as a query-optimised cursor.
  !cursor <- readFastCursor "data/catalog.xml"
  -- Step past the XML declaration to reach the root element.
  case nextSibling cursor of
    Nothing -> putStrLn "Could not read XML"
    Just rootCursor -> do
      -- Raw XML value at the root element cursor.
      let rootValue = rawValueAt (xmlIndexAt rootCursor)
      -- Show only the first 100 characters of the rendered raw value.
      putStrLn $ "Raw value: " <> take 100 (show rootValue)
      -- Decode the raw value and report either the catalog or the error.
      putStrLn $ case decodeCatalog (rawDecode rootValue) of
        DecodeOk catalog -> "Catalog: " <> show catalog
        DecodeFailed msg -> "Error: " <> show msg

stack.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ packages:
44
- '.'
55

66
extra-deps:
7+
- hw-balancedparens-0.2.0.1
8+
- hw-bits-0.7.0.1
9+
- hw-conduit-0.2.0.2
10+
- hw-excess-0.2.0.0
11+
- hw-int-0.0.0.3
12+
- hw-parser-0.0.0.3
13+
- hw-prim-0.5.0.0
14+
- hw-rankselect-0.10.0.3
15+
- hw-rankselect-base-0.2.0.2
16+
- hw-string-parse-0.0.0.4
717
- hedgehog-0.5
818

919
flags: {}

0 commit comments

Comments
 (0)