1- {-# LANGUAGE BangPatterns #-}
2- {-# LANGUAGE ScopedTypeVariables #-}
1+ {-# LANGUAGE BangPatterns #-}
2+ {-# LANGUAGE FlexibleInstances #-}
3+ {-# LANGUAGE ScopedTypeVariables #-}
4+ {-# LANGUAGE TypeSynonymInstances #-}
35
46module Main where
57
8+ import Data.Foldable
9+ import Data.Maybe
610import Data.Semigroup ((<>) )
711import Data.Word
812import HaskellWorks.Data.BalancedParens.RangeMinMax2
@@ -11,37 +15,98 @@ import HaskellWorks.Data.Bits.BitShown
1115import HaskellWorks.Data.FromByteString
1216import HaskellWorks.Data.RankSelect.CsPoppy
1317import HaskellWorks.Data.TreeCursor
18+ import HaskellWorks.Data.Xml.Decode
19+ import HaskellWorks.Data.Xml.DecodeResult
20+ import HaskellWorks.Data.Xml.RawDecode
1421import HaskellWorks.Data.Xml.RawValue
1522import HaskellWorks.Data.Xml.Succinct.Cursor
1623import HaskellWorks.Data.Xml.Succinct.Index
24+ import HaskellWorks.Data.Xml.Value
1725
1826import qualified Data.ByteString as BS
1927import qualified Data.Vector.Storable as DVS
2028
2129type RawCursor = XmlCursor BS. ByteString (BitShown (DVS. Vector Word64 )) (SimpleBalancedParens (DVS. Vector Word64 ))
22- type CsCursor = XmlCursor BS. ByteString CsPoppy (RangeMinMax2 CsPoppy )
30+ type FastCursor = XmlCursor BS. ByteString CsPoppy (RangeMinMax2 CsPoppy )
2331
32+ -- | Read an XML file into memory and return a raw cursor initialised to the
33+ -- start of the XML document.
2434readRawCursor :: String -> IO RawCursor
2535readRawCursor path = do
2636 ! bs <- BS. readFile path
2737 let ! cursor = fromByteString bs :: RawCursor
2838 return cursor
2939
30- readCsCursor :: String -> IO CsCursor
31- readCsCursor filename = do
40+ -- | Read an XML file into memory and return a query-optimised cursor initialised
41+ -- to the start of the XML document.
42+ readFastCursor :: String -> IO FastCursor
43+ readFastCursor filename = do
44+ -- Load the XML file into memory as a raw cursor.
45+ -- The raw XML data is `text`, and `ib` and `bp` are the indexes.
46+ -- `ib` and `bp` can be persisted to an index file for later use to avoid
47+ -- re-parsing the file.
3248 XmlCursor ! text (BitShown ! ib) (SimpleBalancedParens ! bp) _ <- readRawCursor filename
3349 let ! bpCsPoppy = makeCsPoppy bp
3450 let ! rangeMinMax = mkRangeMinMax2 bpCsPoppy
3551 let ! ibCsPoppy = makeCsPoppy ib
3652 return $ XmlCursor text ibCsPoppy rangeMinMax 1
3753
54+ -- | Parse the text of an XML node.
55+ class ParseText a where
56+ parseText :: Value -> DecodeResult a
57+
58+ instance ParseText String where
59+ parseText (XmlText text) = DecodeOk text
60+ parseText (XmlCData text) = DecodeOk text
61+ parseText (XmlElement _ _ cs) = DecodeOk $ concat $ concat $ toList . parseText <$> cs
62+ parseText _ = DecodeOk " "
63+
64+ -- | Convert a decode result to a maybe
65+ decodeResultToMaybe :: DecodeResult a -> Maybe a
66+ decodeResultToMaybe (DecodeOk a) = Just a
67+ decodeResultToMaybe _ = Nothing
68+
69+ -- | Document model. This does not need to be able to completely represent all
70+ -- the data in the XML document. In fact, having a smaller model may improve
71+ -- query performance.
72+ data Plant = Plant
73+ { common :: String
74+ , price :: String
75+ } deriving (Eq , Show )
76+
77+ newtype Catalog = Catalog
78+ { plants :: [Plant ]
79+ } deriving (Eq , Show )
80+
81+ -- | Decode plant element
82+ decodePlant :: Value -> DecodeResult Plant
83+ decodePlant xml = do
84+ aCommon <- xml /> " common" >>= parseText
85+ aPrice <- xml /> " price" >>= parseText
86+ return $ Plant aCommon aPrice
87+
88+ -- | Decode catalog element
89+ decodeCatalog :: Value -> DecodeResult Catalog
90+ decodeCatalog xml = do
91+ aPlantXmls <- xml />> " plant"
92+ let aPlants = catMaybes (decodeResultToMaybe . decodePlant <$> aPlantXmls)
93+ return $ Catalog aPlants
94+
3895main :: IO ()
3996main = do
40- ! cursor <- readCsCursor " data/catalog.xml"
97+ -- Read XML into memory as a query-optimised cursor
98+ ! cursor <- readFastCursor " data/catalog.xml"
99+ -- Skip the XML declaration to get to the root element cursor
41100 case nextSibling cursor of
42101 Just rootCursor -> do
102+ -- Get the root raw XML value at the root element cursor
43103 let rootValue = rawValueAt (xmlIndexAt rootCursor)
104+ -- Show what we have at this cursor
44105 putStrLn $ " Raw value: " <> take 100 (show rootValue)
106+ -- Decode the raw XML value
107+ case decodeCatalog (rawDecode rootValue) of
108+ DecodeOk catalog -> putStrLn $ " Catalog: " <> show catalog
109+ DecodeFailed msg -> putStrLn $ " Error: " <> show msg
45110 Nothing -> do
46111 putStrLn " Could not read XML"
47112 return ()
0 commit comments