Skip to content

Commit fae5f9a

Browse files
committed
wip
1 parent 5254a77 commit fae5f9a

File tree

6 files changed

+98
-117
lines changed

6 files changed

+98
-117
lines changed

pub/xlsx/src/Codec/Xlsx/Parser/Stream.hs

Lines changed: 72 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
{-# LANGUAGE RecordWildCards #-}
1313
{-# LANGUAGE ScopedTypeVariables #-}
1414
{-# LANGUAGE StrictData #-}
15-
{-# LANGUAGE TemplateHaskell #-}
1615
{-# LANGUAGE TypeApplications #-}
1716
{-# LANGUAGE UndecidableInstances #-}
1817

@@ -39,7 +38,6 @@ module Codec.Xlsx.Parser.Stream
3938
runXlsxM,
4039
WorkbookInfo (..),
4140
SheetInfo (..),
42-
wiSheets,
4341
getOrParseSharedStringss,
4442
getWorkbookInfo,
4543
CellRow,
@@ -54,13 +52,9 @@ module Codec.Xlsx.Parser.Stream
5452

5553
-- ** SheetItem
5654
SheetItem (..),
57-
si_sheet_index,
58-
si_row,
5955

6056
-- ** Row
6157
Row (..),
62-
ri_row_index,
63-
ri_cell_row,
6458

6559
-- * Errors
6660
SheetErrors (..),
@@ -136,24 +130,21 @@ type CellRow = IntMap Cell
136130
-- The current sheet at a time, every sheet is constructed of these items.
137131
data SheetItem = MkSheetItem
138132
{ -- | The sheet number
139-
_si_sheet_index :: Int,
140-
_si_row :: ~Row
133+
si_sheet_index :: Int,
134+
si_row :: ~Row
141135
}
142136
deriving stock (Generic, Show)
143137
deriving anyclass (NFData)
144138

145139
data Row = MkRow
146140
{ -- | Row number
147-
_ri_row_index :: RowIndex,
141+
ri_row_index :: RowIndex,
148142
-- | Row itself
149-
_ri_cell_row :: ~CellRow
143+
ri_cell_row :: ~CellRow
150144
}
151145
deriving stock (Generic, Show)
152146
deriving anyclass (NFData)
153147

154-
makeLenses 'MkSheetItem
155-
makeLenses 'MkRow
156-
157148
type SharedStringsMap = V.Vector Text
158149

159150
-- | Type of the excel value
@@ -179,43 +170,39 @@ data ExcelValueType
179170
-- | State for parsing sheets
180171
data SheetState = MkSheetState
181172
{ -- | Current row
182-
_ps_row :: ~CellRow,
173+
ps_row :: ~CellRow,
183174
-- | Current sheet ID (AKA 'sheetInfoSheetId')
184-
_ps_sheet_index :: Int,
175+
ps_sheet_index :: Int,
185176
-- | Current row number
186-
_ps_cell_row_index :: RowIndex,
177+
ps_cell_row_index :: RowIndex,
187178
-- | Current column number
188-
_ps_cell_col_index :: ColumnIndex,
189-
_ps_cell_style :: Maybe Int,
179+
ps_cell_col_index :: ColumnIndex,
180+
ps_cell_style :: Maybe Int,
190181
-- | Flag for indexing wheter the parser is in value or not
191-
_ps_is_in_val :: Bool,
182+
ps_is_in_val :: Bool,
192183
-- | Shared string map
193-
_ps_shared_strings :: SharedStringsMap,
184+
ps_shared_strings :: SharedStringsMap,
194185
-- | The last detected value type
195-
_ps_type :: ExcelValueType,
186+
ps_type :: ExcelValueType,
196187
-- | for hexpat only, which can break up char data into multiple events
197-
_ps_text_buf :: Text,
188+
ps_text_buf :: Text,
198189
-- | For hexpat only, which can throw errors right at the end of the sheet
199190
-- rather than ending gracefully.
200-
_ps_worksheet_ended :: Bool
191+
ps_worksheet_ended :: Bool
201192
}
202193
deriving stock (Generic, Show)
203194

204-
makeLenses 'MkSheetState
205-
206195
-- | State for parsing shared strings
207196
data SharedStringsState = MkSharedStringsState
208197
{ -- | String we are parsing
209198
-- TODO: At the moment SharedStrings can be used only to create CellText values.
210199
-- We should add support for CellRich values.
211-
_ss_string :: TB.Builder,
200+
ss_string :: TB.Builder,
212201
-- | list of shared strings
213-
_ss_list :: DL.DList Text
202+
ss_list :: DL.DList Text
214203
}
215204
deriving stock (Generic, Show)
216205

217-
makeLenses 'MkSharedStringsState
218-
219206
type HasSheetState = MonadState SheetState
220207

221208
type HasSharedStringsState = MonadState SharedStringsState
@@ -234,17 +221,16 @@ data SheetInfo = SheetInfo
234221
-- | Information about the workbook contained in xl/workbook.xml
235222
-- (currently a subset)
236223
data WorkbookInfo = WorkbookInfo
237-
{ _wiSheets :: [SheetInfo]
224+
{ wiSheets :: [SheetInfo]
238225
}
239-
deriving (Show)
240-
241-
makeLenses 'WorkbookInfo
226+
deriving (Show, Generic)
242227

243228
data XlsxMState = MkXlsxMState
244-
{ _xs_shared_strings :: Memoized (V.Vector Text),
245-
_xs_workbook_info :: Memoized WorkbookInfo,
246-
_xs_relationships :: Memoized Relationships
229+
{ xs_shared_strings :: Memoized (V.Vector Text),
230+
xs_workbook_info :: Memoized WorkbookInfo,
231+
xs_relationships :: Memoized Relationships
247232
}
233+
deriving (Show, Generic)
248234

249235
newtype XlsxM a = XlsxM {_unXlsxM :: ReaderT XlsxMState Zip.ZipArchive a}
250236
deriving newtype
@@ -264,24 +250,24 @@ newtype XlsxM a = XlsxM {_unXlsxM :: ReaderT XlsxMState Zip.ZipArchive a}
264250
initialSheetState :: SheetState
265251
initialSheetState =
266252
MkSheetState
267-
{ _ps_row = mempty,
268-
_ps_sheet_index = 0,
269-
_ps_cell_row_index = 0,
270-
_ps_cell_col_index = 0,
271-
_ps_is_in_val = False,
272-
_ps_shared_strings = mempty,
273-
_ps_type = Untyped,
274-
_ps_text_buf = mempty,
275-
_ps_worksheet_ended = False,
276-
_ps_cell_style = Nothing
253+
{ ps_row = mempty,
254+
ps_sheet_index = 0,
255+
ps_cell_row_index = 0,
256+
ps_cell_col_index = 0,
257+
ps_is_in_val = False,
258+
ps_shared_strings = mempty,
259+
ps_type = Untyped,
260+
ps_text_buf = mempty,
261+
ps_worksheet_ended = False,
262+
ps_cell_style = Nothing
277263
}
278264

279265
-- | Initial parsing state
280266
initialSharedStrings :: SharedStringsState
281267
initialSharedStrings =
282268
MkSharedStringsState
283-
{ _ss_string = mempty,
284-
_ss_list = mempty
269+
{ ss_string = mempty,
270+
ss_list = mempty
285271
}
286272

287273
-- | Parse shared string entry from xml event and return it once
@@ -295,19 +281,19 @@ parseSharedStrings ::
295281
m (Maybe Text)
296282
parseSharedStrings = \case
297283
-- TODO: Add parsing of text styles to further create CellRich values.
298-
StartElement "si" _ -> Nothing <$ (ss_string .= mempty)
299-
EndElement "si" -> Just . LT.toStrict . TB.toLazyText <$> gets _ss_string
300-
CharacterData txt -> Nothing <$ (ss_string <>= TB.fromText txt)
284+
StartElement "si" _ -> Nothing <$ (#ss_string .= mempty)
285+
EndElement "si" -> Just . LT.toStrict . TB.toLazyText <$> gets ss_string
286+
CharacterData txt -> Nothing <$ (#ss_string <>= TB.fromText txt)
301287
_ -> pure Nothing
302288

303289
-- | Run a series of actions on an Xlsx file
304290
runXlsxM :: (MonadIO m) => FilePath -> XlsxM a -> m a
305291
runXlsxM xlsxFile (XlsxM act) = liftIO $ do
306292
-- TODO: don't run the withArchive multiple times but use liftWith or runInIO instead
307-
_xs_workbook_info <- memoizeRef (Zip.withArchive xlsxFile readWorkbookInfo)
308-
_xs_relationships <-
293+
xs_workbook_info <- memoizeRef (Zip.withArchive xlsxFile readWorkbookInfo)
294+
xs_relationships <-
309295
memoizeRef (Zip.withArchive xlsxFile readWorkbookRelationships)
310-
_xs_shared_strings <- memoizeRef (Zip.withArchive xlsxFile parseSharedStringss)
296+
xs_shared_strings <- memoizeRef (Zip.withArchive xlsxFile parseSharedStringss)
311297
Zip.withArchive xlsxFile $ runReaderT act $ MkXlsxMState {..}
312298

313299
liftZip :: Zip.ZipArchive a -> XlsxM a
@@ -325,12 +311,12 @@ parseSharedStringss = do
325311
st <- liftIO $ runExpat state0 byteSrc $ \evs -> forM_ evs $ \ev -> do
326312
mTxt <- parseSharedStrings ev
327313
for_ mTxt $ \txt ->
328-
ss_list %= (`DL.snoc` txt)
329-
pure $ V.fromList $ DL.toList $ _ss_list st
314+
#ss_list %= (`DL.snoc` txt)
315+
pure $ V.fromList $ DL.toList $ ss_list st
330316

331317
{-# SCC getOrParseSharedStringss #-}
332318
getOrParseSharedStringss :: XlsxM (V.Vector Text)
333-
getOrParseSharedStringss = runMemoized =<< asks _xs_shared_strings
319+
getOrParseSharedStringss = runMemoized =<< asks xs_shared_strings
334320

335321
readWorkbookInfo :: Zip.ZipArchive WorkbookInfo
336322
readWorkbookInfo = do
@@ -354,7 +340,7 @@ lookupBy fields attrs = maybe (throwM $ LookupError attrs fields) pure $ lookup
354340
-- xl/workbook.xml. The result is cached so the XML will only be
355341
-- decompressed and parsed once inside a larger XlsxM action.
356342
getWorkbookInfo :: XlsxM WorkbookInfo
357-
getWorkbookInfo = runMemoized =<< asks _xs_workbook_info
343+
getWorkbookInfo = runMemoized =<< asks xs_workbook_info
358344

359345
readWorkbookRelationships :: Zip.ZipArchive Relationships
360346
readWorkbookRelationships = do
@@ -380,7 +366,7 @@ readWorkbookRelationships = do
380366
-- The relationships xml file will only be parsed once when called
381367
-- multiple times within a larger XlsxM action.
382368
getWorkbookRelationships :: XlsxM Relationships
383-
getWorkbookRelationships = runMemoized =<< asks _xs_relationships
369+
getWorkbookRelationships = runMemoized =<< asks xs_relationships
384370

385371
type HexpatEvent = SAXEvent ByteString Text
386372

@@ -463,14 +449,14 @@ runExpatForSheet ::
463449
runExpatForSheet initState byteSource inner =
464450
void $ liftIO $ runExpat initState byteSource handler
465451
where
466-
sheetName = _ps_sheet_index initState
452+
sheetName = ps_sheet_index initState
467453
handler evs = forM_ evs $ \ev -> do
468454
parseRes <- runExceptT $ matchHexpatEvent ev
469455
case parseRes of
470456
Left err -> throwM err
471457
Right (Just cellRow)
472458
| not (IntMap.null cellRow) -> do
473-
rowNum <- use ps_cell_row_index
459+
rowNum <- use #ps_cell_row_index
474460
liftIO $ inner $ MkSheetItem sheetName $ MkRow rowNum cellRow
475461
_ -> pure ()
476462

@@ -505,7 +491,7 @@ makeIndexFromName sheetName = do
505491
-- names differ only in alphabetic case (at least for ascii...)
506492
let sheetNameCI = T.toLower sheetName
507493
findRes :: Maybe SheetInfo
508-
findRes = find ((== sheetNameCI) . T.toLower . sheetInfoName) $ _wiSheets wi
494+
findRes = find ((== sheetNameCI) . T.toLower . sheetInfoName) $ wiSheets wi
509495
pure $ makeIndex . sheetInfoSheetId <$> findRes
510496

511497
readSheet ::
@@ -524,9 +510,9 @@ readSheet (MkSheetIndex sheetId) inner = do
524510
sharedStrs <- getOrParseSharedStringss
525511
let sheetState0 =
526512
initialSheetState
527-
& ps_shared_strings
513+
& #ps_shared_strings
528514
.~ sharedStrs
529-
& ps_sheet_index
515+
& #ps_sheet_index
530516
.~ sheetId
531517
runExpatForSheet sheetState0 sourceSheetXml inner
532518
pure True
@@ -549,8 +535,8 @@ countRowsInSheet (MkSheetIndex sheetId) = do
549535
-- | Return row from the state and empty it
550536
popRow :: (HasSheetState m) => m CellRow
551537
popRow = do
552-
row <- use ps_row
553-
ps_row .= mempty
538+
row <- use #ps_row
539+
#ps_row .= mempty
554540
pure row
555541

556542
data AddCellErrors
@@ -606,25 +592,25 @@ addCellToRow ::
606592
m ()
607593
addCellToRow txt = do
608594
st <- get
609-
style <- use ps_cell_style
610-
when (_ps_is_in_val st) $ do
595+
style <- use #ps_cell_style
596+
when (ps_is_in_val st) $ do
611597
val <-
612598
liftEither $
613599
first ParseCellError $
614-
parseValue (_ps_shared_strings st) txt (_ps_type st)
600+
parseValue (ps_shared_strings st) txt (ps_type st)
615601
put $
616602
st
617-
{ _ps_row =
603+
{ ps_row =
618604
IntMap.insert
619-
(unColumnIndex $ _ps_cell_col_index st)
605+
(unColumnIndex $ ps_cell_col_index st)
620606
( Cell
621607
{ _cellStyle = style,
622608
_cellValue = Just val,
623609
_cellComment = Nothing,
624610
_cellFormula = Nothing
625611
}
626612
)
627-
$ _ps_row st
613+
$ ps_row st
628614
}
629615

630616
data SheetErrors
@@ -680,14 +666,14 @@ matchHexpatEvent ev = case ev of
680666
CharacterData txt ->
681667
{-# SCC "handle_CharData" #-}
682668
do
683-
inVal <- use ps_is_in_val
669+
inVal <- use #ps_is_in_val
684670
when inVal $
685-
{-# SCC "append_text_buf" #-} (ps_text_buf <>= txt)
671+
{-# SCC "append_text_buf" #-} (#ps_text_buf <>= txt)
686672
pure Nothing
687673
StartElement "c" attrs -> Nothing <$ (setCoord attrs *> setType attrs *> setStyle attrs)
688-
StartElement "is" _ -> Nothing <$ (ps_is_in_val .= True)
674+
StartElement "is" _ -> Nothing <$ (#ps_is_in_val .= True)
689675
EndElement "is" -> Nothing <$ finaliseCellValue
690-
StartElement "v" _ -> Nothing <$ (ps_is_in_val .= True)
676+
StartElement "v" _ -> Nothing <$ (#ps_is_in_val .= True)
691677
EndElement "v" -> Nothing <$ finaliseCellValue
692678
-- If beginning of row, empty the state and return nothing.
693679
-- We don't know if there is anything in the state, the user may have
@@ -697,15 +683,15 @@ matchHexpatEvent ev = case ev of
697683
-- If at the end of the row, we have collected the whole row into
698684
-- the current state. Empty the state and return the row.
699685
EndElement "row" -> Just <$> popRow
700-
StartElement "worksheet" _ -> ps_worksheet_ended .= False >> pure Nothing
701-
EndElement "worksheet" -> ps_worksheet_ended .= True >> pure Nothing
686+
StartElement "worksheet" _ -> #ps_worksheet_ended .= False >> pure Nothing
687+
EndElement "worksheet" -> #ps_worksheet_ended .= True >> pure Nothing
702688
-- Skip everything else, e.g. the formula elements <f>
703689
FailDocument err -> do
704690
-- this event is emitted at the end the xml stream (possibly
705691
-- because the xml files in xlsx archives don't end in a
706692
-- newline, but that's a guess), so we use state to determine if
707693
-- it's expected.
708-
finished <- use ps_worksheet_ended
694+
finished <- use #ps_worksheet_ended
709695
unless finished $
710696
throwError $
711697
HexpatParseError err
@@ -716,12 +702,12 @@ matchHexpatEvent ev = case ev of
716702
finaliseCellValue ::
717703
(MonadError SheetErrors m, HasSheetState m) => m ()
718704
finaliseCellValue = do
719-
txt <- gets _ps_text_buf
705+
txt <- gets ps_text_buf
720706
addCellToRow txt
721707
modify' $ \st ->
722708
st
723-
{ _ps_is_in_val = False,
724-
_ps_text_buf = mempty
709+
{ ps_is_in_val = False,
710+
ps_text_buf = mempty
725711
}
726712

727713
-- | Update state coordinates accordingly to @parseCoordinates@
@@ -734,8 +720,8 @@ setCoord ::
734720
m ()
735721
setCoord list = do
736722
coordinates <- liftEither $ first ParseCoordinateError $ parseCoordinates list
737-
ps_cell_col_index .= (coordinates ^. _2)
738-
ps_cell_row_index .= (coordinates ^. _1)
723+
#ps_cell_col_index .= (coordinates ^. _2)
724+
#ps_cell_row_index .= (coordinates ^. _1)
739725

740726
-- | Parse type from values and update state accordingly
741727
setType ::
@@ -746,7 +732,7 @@ setType ::
746732
m ()
747733
setType list = do
748734
type' <- liftEither $ first ParseTypeError $ parseType list
749-
ps_type .= type'
735+
#ps_type .= type'
750736

751737
-- | Find sheet value by its name
752738
findName :: ByteString -> SheetValues -> Maybe SheetValue
@@ -756,7 +742,7 @@ findName name = find ((name ==) . fst)
756742
setStyle :: (MonadError SheetErrors m, HasSheetState m) => SheetValues -> m ()
757743
setStyle list = do
758744
style <- liftEither $ first ParseStyleErrors $ parseStyle list
759-
ps_cell_style .= style
745+
#ps_cell_style .= style
760746

761747
data StyleError = InvalidStyleRef {seInput :: Text, seErrorMsg :: String}
762748
deriving (Show)

0 commit comments

Comments
 (0)