basic structure for decodeRawRows

This commit is contained in:
Jonathan Lamothe 2022-04-19 19:33:35 -04:00
parent eea4710b80
commit 67e85f0a78
4 changed files with 152 additions and 1 deletions

View File

@ -32,6 +32,7 @@ library
, bytestring
, conduit >=1.3.4.2 && <1.4
, text
, transformers
default-language: Haskell2010
autogen-modules: Paths_csv_slurp
@ -51,5 +52,6 @@ test-suite csv-slurp-test
, csv-slurp
, hspec >=2.8.5 && <2.9
, text
, transformers
default-language: Haskell2010
autogen-modules: Paths_csv_slurp

View File

@ -26,6 +26,7 @@ dependencies:
- bytestring
- conduit >= 1.3.4.2 && < 1.4
- text
- transformers
library:
source-dirs: src

View File

@ -29,13 +29,16 @@ module Data.CSV.Slurp (
decodeRows,
decodeRawRows,
decodeUTF8,
toBytes,
) where
import Conduit (ConduitT, mapC, (.|))
import Control.Monad.Trans.State (StateT, evalStateT)
import qualified Data.ByteString as BS
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8')
import Data.Word (Word8)
-- | decode the rows from a stream of ByteStrings
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
@ -43,7 +46,7 @@ decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8)
-- | decode the rows returning raw ByteStrings instead of text
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
decodeRawRows = return ()
decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState
-- | decode a raw ByteString into Text (if possible)
decodeUTF8 :: BS.ByteString -> Maybe T.Text
@ -51,4 +54,24 @@ decodeUTF8 bs = case decodeUtf8' bs of
Left _ -> Nothing
Right txt -> Just txt
-- | convert a stream to ByteStrings to a string of bytes
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
toBytes = return ()
data DecodeState = DecodeState
{ isQuoted :: Bool
, collected :: BS.ByteString
} deriving (Eq, Show)
newDecodeState :: DecodeState
newDecodeState = DecodeState
{ isQuoted = False
, collected = ""
}
decodeLoop
:: Monad m
=> StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
decodeLoop = return ()
--jl

View File

@ -31,6 +31,7 @@ import Data.CSV.Slurp
spec :: Spec
spec = describe "Data.CSV.Slurp" $ do
decodeRowsSpec
decodeRawRowsSpec
decodeUTF8Spec
decodeRowsSpec :: Spec
@ -60,6 +61,130 @@ decodeRowsSpec = describe "decodeRows" $ mapM_
invalidIn = ["\"a"]
validRes = [["foo", "bar"], ["baz", "quux"]]
decodeRawRowsSpec :: Spec
decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
( \(label, input, expected) -> context label $ do
result <- runConduit $ sourceList input .| decodeRawRows .| consume
let
expLen = length expected
resLen = length result
xit ("should have " ++ show expLen ++ " rows") $
resLen `shouldBe` expLen
mapM_
( \(n, expected', result') -> context ("row " ++ show n) $
xit ("should be " ++ show result') $
expected' `shouldBe` result'
) $ zip3 [(0::Int)..] expected result
)
-- label, input, expected
[ ( "unquoted", unquotedIn, normalRes )
, ( "quoted", quotedIn, normalRes )
, ( "mixed", mixedIn, normalRes )
, ( "CR only", crOnlyIn, normalRes )
, ( "LF only", lfOnlyIn, normalRes )
, ( "has quote", quoteIn, quoteRes )
, ( "has CR", crIn, crRes )
, ( "has LF", lfIn, lfRes )
, ( "has CRLF", crlfIn, crlfRes )
, ( "odd chunk", oddChunkIn, normalRes )
, ( "no newline", noNewlineIn, normalRes )
, ( "malformed", malformedIn, malformedRes )
]
where
unquotedIn =
[ "foo,bar\r\n"
, "baz,quux\r\n"
]
quotedIn =
[ "\"foo\",\"bar\"\r\n"
, "\"baz\",\"quux\"\r\n"
]
mixedIn =
[ "\"foo\",bar\r\n"
, "baz,\"quux\"\r\n"
]
crOnlyIn =
[ "foo,bar\r"
, "baz,quux\r"
]
lfOnlyIn =
[ "foo,bar\n"
, "baz,quux\n"
]
quoteIn =
[ "\"a\"\"b\",bar\r\n"
, "baz,quux\r\n"
]
crIn =
[ "\"a\rb\",bar\r\n"
, "baz,quux\r\n"
]
lfIn =
[ "\"a\nb\",bar\r\n"
, "baz,quux\r\n"
]
crlfIn =
[ "\"a\r\nb\",bar\r\n"
, "baz,quux\r\n"
]
oddChunkIn =
[ "foo,"
, "bar\r\nbaz,"
, "quux\r\n"
]
noNewlineIn =
[ "foo,bar\r\n"
, "baz,quux"
]
malformedIn =
[ "a\"b,bar\r\n"
, "baz,quux\r\n"
]
normalRes =
[ ["foo", "bar"]
, ["baz", "quux"]
]
quoteRes =
[ ["a\"b", "bar"]
, ["baz", "quux"]
]
crRes =
[ ["a\rb", "bar"]
, ["baz", "quux"]
]
lfRes =
[ ["a\nb", "bar"]
, ["baz", "quux"]
]
crlfRes =
[ ["a\r\nb", "bar"]
, ["baz", "quux"]
]
malformedRes =
[ ["", "bar"]
, ["baz", "quux"]
]
decodeUTF8Spec :: Spec
decodeUTF8Spec = describe "decodeUTF8" $ mapM_
( \(label, input, expected) -> context label $