implemented decodeRawRows

This commit is contained in:
Jonathan Lamothe 2022-04-20 21:21:51 -04:00
parent 63b97649a6
commit 389c206063
2 changed files with 141 additions and 17 deletions

View File

@ -33,7 +33,9 @@ module Data.CSV.Slurp (
) where ) where
import Conduit (ConduitT, await, mapC, yield, (.|)) import Conduit (ConduitT, await, mapC, yield, (.|))
import Control.Monad.Trans.State (StateT, evalStateT) import Control.Monad (unless)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import qualified Data.Text as T import qualified Data.Text as T
@ -63,20 +65,141 @@ toBytes = await >>= \case
toBytes toBytes
Nothing -> return () Nothing -> return ()
-- Internal
data DecodeState = DecodeState data DecodeState = DecodeState
{ isQuoted :: Bool { isQuoted :: Bool
, fields :: [BS.ByteString]
, collected :: BS.ByteString , collected :: BS.ByteString
} deriving (Eq, Show) } deriving (Eq, Show)
type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
type Modifier = DecodeState -> DecodeState
newDecodeState :: DecodeState newDecodeState :: DecodeState
newDecodeState = DecodeState newDecodeState = DecodeState
{ isQuoted = False { isQuoted = False
, fields = []
, collected = "" , collected = ""
} }
-- Decoders
decodeLoop :: Monad m => Decoder m
decodeLoop = lift await >>= \case
Just byte -> case byte of
0x22 -> processQuote
0x2c -> processComma
0x0d -> processCR
0x0a -> processLF
_ -> performAction $ addByte byte
Nothing -> cleanup
processQuote :: Monad m => Decoder m
processQuote = gets isQuoted >>= \case
True -> processQuotedQuote
False -> processUnquotedQuote
processComma :: Monad m => Decoder m
processComma = gets isQuoted >>= performAction . \case
True -> addByte 0x2c
False -> commitField
processCR :: Monad m => Decoder m
processCR = gets isQuoted >>= \case
True -> performAction $ addByte 0xd
False -> endRow
processLF :: Monad m => Decoder m
processLF = gets isQuoted >>= \case
True -> performAction $ addByte 0xa
False -> endRow
processQuotedQuote :: Monad m => Decoder m
processQuotedQuote = lift await >>= \case
Just byte -> case byte of
0x22 -> performAction $ addByte 0x22 -- quote
0x2c -> performAction commitField -- comma
0x0d -> commitRow -- carriage return
0x0a -> commitRow -- line feed
_ -> corruptedField
Nothing -> cleanup
processUnquotedQuote :: Monad m => Decoder m
processUnquotedQuote = gets (BS.null . collected) >>= \case
True -> performAction setQuoted
False -> corruptedField
endRow :: Monad m => Decoder m
endRow = do
s <- get
if null (fields s) && BS.null (collected s)
then decodeLoop
else commitRow
commitRow :: Monad m => Decoder m
commitRow = do
modify commitField
gets fields >>= lift . yield
performAction dropFields
corruptedField :: Monad m => Decoder m
corruptedField = do
modify dropField
ignoreField
ignoreField :: Monad m => Decoder m
ignoreField = lift await >>= \case
Just byte -> case byte of
0x2c -> performAction commitField -- comma
0x0d -> commitRow
_ -> ignoreField
Nothing -> cleanup
cleanup :: Monad m => Decoder m
cleanup = do
gets isQuoted >>= \case
True -> modify $ commitField . dropField
False -> gets (BS.null . collected) >>= \case
True -> return ()
False -> modify commitField
fs <- gets fields
unless (null fs) $
lift $ yield fs
performAction :: Monad m => Modifier -> Decoder m
performAction f = do
modify f
decodeLoop decodeLoop
:: Monad m
=> StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () -- Modifiers
decodeLoop = return ()
addByte :: Word8 -> Modifier
addByte b s = let
collected' = BS.snoc (collected s) b
in s { collected = collected' }
commitField :: Modifier
commitField s = let
isQuoted' = False
fields' = fields s ++ [collected s]
collected' = ""
in s
{ isQuoted = isQuoted'
, fields = fields'
, collected = collected'
}
dropFields :: Modifier
dropFields s = s { fields = [] }
dropField :: Modifier
dropField s = s
{ isQuoted = False
, collected = ""
}
setQuoted :: Modifier
setQuoted s = s { isQuoted = True }
--jl --jl

View File

@ -25,7 +25,7 @@ module Data.CSV.SlurpSpec (spec) where
import Conduit (runConduit, (.|)) import Conduit (runConduit, (.|))
import Data.Char (ord) import Data.Char (ord)
import Data.Conduit.List (consume, sourceList) import Data.Conduit.List (consume, sourceList)
import Test.Hspec (Spec, context, describe, it, shouldBe, xit) import Test.Hspec (Spec, context, describe, it, shouldBe)
import Data.CSV.Slurp import Data.CSV.Slurp
@ -43,25 +43,26 @@ decodeRowsSpec = describe "decodeRows" $ mapM_
let let
expLen = length expected expLen = length expected
resLen = length result resLen = length result
xit ("should have " ++ show expLen ++ " rows") $ it ("should have " ++ show expLen ++ " rows") $
resLen `shouldBe` expLen resLen `shouldBe` expLen
mapM_ mapM_
( \(n, expected', result') -> context ("row " ++ show n) $ ( \(n, expected', result') -> context ("row " ++ show n) $
xit ("should be " ++ show expected') $ it ("should be " ++ show expected') $
result' `shouldBe` expected' result' `shouldBe` expected'
) $ zip3 [(0::Int)..] expected result ) $ zip3 [(0::Int)..] expected result
) )
-- label, input, expected -- label, input, expected
[ ( "valid", validIn, validRes ) [ ( "valid", validIn, validRes )
, ( "invalid", invalidIn, [] ) , ( "invalid", invalidIn, invalidRes )
, ( "empty", [], [] ) , ( "empty", [], [] )
] ]
where where
validIn = ["foo,bar\r\n", "baz,quuux\r\n"] validIn = ["foo,bar\r\n", "baz,quux\r\n"]
invalidIn = ["\"a"] invalidIn = ["\"a"]
validRes = [["foo", "bar"], ["baz", "quux"]] validRes = [["foo", "bar"], ["baz", "quux"]]
invalidRes = [[""]]
decodeRawRowsSpec :: Spec decodeRawRowsSpec :: Spec
decodeRawRowsSpec = describe "decodeRawRows" $ mapM_ decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
@ -70,12 +71,12 @@ decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
let let
expLen = length expected expLen = length expected
resLen = length result resLen = length result
xit ("should have " ++ show expLen ++ " rows") $ it ("should have " ++ show expLen ++ " rows") $
resLen `shouldBe` expLen resLen `shouldBe` expLen
mapM_ mapM_
( \(n, expected', result') -> context ("row " ++ show n) $ ( \(n, expected', result') -> context ("row " ++ show n) $
xit ("should be " ++ show result') $ it ("should be " ++ show result') $
expected' `shouldBe` result' result' `shouldBe` expected'
) $ zip3 [(0::Int)..] expected result ) $ zip3 [(0::Int)..] expected result
) )