From 389c20606300747a0391c57f60b8882685c23b0e Mon Sep 17 00:00:00 2001 From: Jonathan Lamothe Date: Wed, 20 Apr 2022 21:21:51 -0400 Subject: [PATCH] implemented decodeRawRows --- src/Data/CSV/Slurp.hs | 133 +++++++++++++++++++++++++++++++++++-- test/Data/CSV/SlurpSpec.hs | 25 +++---- 2 files changed, 141 insertions(+), 17 deletions(-) diff --git a/src/Data/CSV/Slurp.hs b/src/Data/CSV/Slurp.hs index 6d1a168..ae05dbc 100644 --- a/src/Data/CSV/Slurp.hs +++ b/src/Data/CSV/Slurp.hs @@ -33,7 +33,9 @@ module Data.CSV.Slurp ( ) where import Conduit (ConduitT, await, mapC, yield, (.|)) -import Control.Monad.Trans.State (StateT, evalStateT) +import Control.Monad (unless) +import Control.Monad.Trans.Class (lift) +import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify) import qualified Data.ByteString as BS import Data.Maybe (fromMaybe) import qualified Data.Text as T @@ -63,20 +65,141 @@ toBytes = await >>= \case toBytes Nothing -> return () +-- Internal + data DecodeState = DecodeState { isQuoted :: Bool + , fields :: [BS.ByteString] , collected :: BS.ByteString } deriving (Eq, Show) +type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () +type Modifier = DecodeState -> DecodeState + newDecodeState :: DecodeState newDecodeState = DecodeState { isQuoted = False + , fields = [] , collected = "" } -decodeLoop - :: Monad m - => StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () -decodeLoop = return () +-- Decoders + +decodeLoop :: Monad m => Decoder m +decodeLoop = lift await >>= \case + Just byte -> case byte of + 0x22 -> processQuote + 0x2c -> processComma + 0x0d -> processCR + 0x0a -> processLF + _ -> performAction $ addByte byte + Nothing -> cleanup + +processQuote :: Monad m => Decoder m +processQuote = gets isQuoted >>= \case + True -> processQuotedQuote + False -> processUnquotedQuote + +processComma :: Monad m => Decoder m +processComma = gets isQuoted >>= performAction . \case + True -> addByte 0x2c + False -> commitField + +processCR :: Monad m => Decoder m +processCR = gets isQuoted >>= \case + True -> performAction $ addByte 0xd + False -> endRow + +processLF :: Monad m => Decoder m +processLF = gets isQuoted >>= \case + True -> performAction $ addByte 0xa + False -> endRow + +processQuotedQuote :: Monad m => Decoder m +processQuotedQuote = lift await >>= \case + Just byte -> case byte of + 0x22 -> performAction $ addByte 0x22 -- quote + 0x2c -> performAction commitField -- comma + 0x0d -> commitRow -- carriage return + 0x0a -> commitRow -- line feed + _ -> corruptedField + Nothing -> cleanup + +processUnquotedQuote :: Monad m => Decoder m +processUnquotedQuote = gets (BS.null . collected) >>= \case + True -> performAction setQuoted + False -> corruptedField + +endRow :: Monad m => Decoder m +endRow = do + s <- get + if null (fields s) && BS.null (collected s) + then decodeLoop + else commitRow + +commitRow :: Monad m => Decoder m +commitRow = do + modify commitField + gets fields >>= lift . yield + performAction dropFields + +corruptedField :: Monad m => Decoder m +corruptedField = do + modify dropField + ignoreField + +ignoreField :: Monad m => Decoder m +ignoreField = lift await >>= \case + Just byte -> case byte of + 0x2c -> performAction commitField -- comma + 0x0d -> commitRow + _ -> ignoreField + Nothing -> cleanup + +cleanup :: Monad m => Decoder m +cleanup = do + gets isQuoted >>= \case + True -> modify $ commitField . dropField + False -> gets (BS.null . collected) >>= \case + True -> return () + False -> modify commitField + fs <- gets fields + unless (null fs) $ + lift $ yield fs + +performAction :: Monad m => Modifier -> Decoder m +performAction f = do + modify f + decodeLoop + +-- Modifiers + +addByte :: Word8 -> Modifier +addByte b s = let + collected' = BS.snoc (collected s) b + in s { collected = collected' } + +commitField :: Modifier +commitField s = let + isQuoted' = False + fields' = fields s ++ [collected s] + collected' = "" + in s + { isQuoted = isQuoted' + , fields = fields' + , collected = collected' + } + +dropFields :: Modifier +dropFields s = s { fields = [] } + +dropField :: Modifier +dropField s = s + { isQuoted = False + , collected = "" + } + +setQuoted :: Modifier +setQuoted s = s { isQuoted = True } --jl diff --git a/test/Data/CSV/SlurpSpec.hs b/test/Data/CSV/SlurpSpec.hs index 53089e8..cd29130 100644 --- a/test/Data/CSV/SlurpSpec.hs +++ b/test/Data/CSV/SlurpSpec.hs @@ -25,7 +25,7 @@ module Data.CSV.SlurpSpec (spec) where import Conduit (runConduit, (.|)) import Data.Char (ord) import Data.Conduit.List (consume, sourceList) -import Test.Hspec (Spec, context, describe, it, shouldBe, xit) +import Test.Hspec (Spec, context, describe, it, shouldBe) import Data.CSV.Slurp @@ -43,25 +43,26 @@ decodeRowsSpec = describe "decodeRows" $ mapM_ let expLen = length expected resLen = length result - xit ("should have " ++ show expLen ++ " rows") $ + it ("should have " ++ show expLen ++ " rows") $ resLen `shouldBe` expLen mapM_ ( \(n, expected', result') -> context ("row " ++ show n) $ - xit ("should be " ++ show expected') $ + it ("should be " ++ show expected') $ result' `shouldBe` expected' ) $ zip3 [(0::Int)..] expected result ) -- label, input, expected - [ ( "valid", validIn, validRes ) - , ( "invalid", invalidIn, [] ) - , ( "empty", [], [] ) + [ ( "valid", validIn, validRes ) + , ( "invalid", invalidIn, invalidRes ) + , ( "empty", [], [] ) ] where - validIn = ["foo,bar\r\n", "baz,quuux\r\n"] - invalidIn = ["\"a"] - validRes = [["foo", "bar"], ["baz", "quux"]] + validIn = ["foo,bar\r\n", "baz,quux\r\n"] + invalidIn = ["\"a"] + validRes = [["foo", "bar"], ["baz", "quux"]] + invalidRes = [[""]] decodeRawRowsSpec :: Spec decodeRawRowsSpec = describe "decodeRawRows" $ mapM_ @@ -70,12 +71,12 @@ decodeRawRowsSpec = describe "decodeRawRows" $ mapM_ let expLen = length expected resLen = length result - xit ("should have " ++ show expLen ++ " rows") $ + it ("should have " ++ show expLen ++ " rows") $ resLen `shouldBe` expLen mapM_ ( \(n, expected', result') -> context ("row " ++ show n) $ - xit ("should be " ++ show result') $ - expected' `shouldBe` result' + it ("should be " ++ show result') $ + result' `shouldBe` expected' ) $ zip3 [(0::Int)..] expected result )