From 67e85f0a789f099b3b2752f11d93f41247da851d Mon Sep 17 00:00:00 2001 From: Jonathan Lamothe Date: Tue, 19 Apr 2022 19:33:35 -0400 Subject: [PATCH] basic structure for decodeRawRows --- csv-slurp.cabal | 2 + package.yaml | 1 + src/Data/CSV/Slurp.hs | 25 +++++++- test/Data/CSV/SlurpSpec.hs | 125 +++++++++++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 1 deletion(-) diff --git a/csv-slurp.cabal b/csv-slurp.cabal index 4a556f3..15d108e 100644 --- a/csv-slurp.cabal +++ b/csv-slurp.cabal @@ -32,6 +32,7 @@ library , bytestring , conduit >=1.3.4.2 && <1.4 , text + , transformers default-language: Haskell2010 autogen-modules: Paths_csv_slurp @@ -51,5 +52,6 @@ test-suite csv-slurp-test , csv-slurp , hspec >=2.8.5 && <2.9 , text + , transformers default-language: Haskell2010 autogen-modules: Paths_csv_slurp diff --git a/package.yaml b/package.yaml index 3f71495..57573fb 100644 --- a/package.yaml +++ b/package.yaml @@ -26,6 +26,7 @@ dependencies: - bytestring - conduit >= 1.3.4.2 && < 1.4 - text +- transformers library: source-dirs: src diff --git a/src/Data/CSV/Slurp.hs b/src/Data/CSV/Slurp.hs index da7e3fb..cd272f6 100644 --- a/src/Data/CSV/Slurp.hs +++ b/src/Data/CSV/Slurp.hs @@ -29,13 +29,16 @@ module Data.CSV.Slurp ( decodeRows, decodeRawRows, decodeUTF8, + toBytes, ) where import Conduit (ConduitT, mapC, (.|)) +import Control.Monad.Trans.State (StateT, evalStateT) import qualified Data.ByteString as BS import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8') +import Data.Word (Word8) -- | decode the rows from a stream of ByteStrings decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m () @@ -43,7 +46,7 @@ decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8) -- | decode the rows returning raw ByteStrings instead of text decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m () -decodeRawRows = return () +decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState -- | decode a raw ByteString into Text (if possible) decodeUTF8 :: BS.ByteString -> Maybe T.Text @@ -51,4 +54,24 @@ decodeUTF8 bs = case decodeUtf8' bs of Left _ -> Nothing Right txt -> Just txt +-- | convert a stream to ByteStrings to a string of bytes +toBytes :: Monad m => ConduitT BS.ByteString Word8 m () +toBytes = return () + +data DecodeState = DecodeState + { isQuoted :: Bool + , collected :: BS.ByteString + } deriving (Eq, Show) + +newDecodeState :: DecodeState +newDecodeState = DecodeState + { isQuoted = False + , collected = "" + } + +decodeLoop + :: Monad m + => StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () +decodeLoop = return () + --jl diff --git a/test/Data/CSV/SlurpSpec.hs b/test/Data/CSV/SlurpSpec.hs index 9b59ba5..d36d7da 100644 --- a/test/Data/CSV/SlurpSpec.hs +++ b/test/Data/CSV/SlurpSpec.hs @@ -31,6 +31,7 @@ import Data.CSV.Slurp spec :: Spec spec = describe "Data.CSV.Slurp" $ do decodeRowsSpec + decodeRawRowsSpec decodeUTF8Spec decodeRowsSpec :: Spec @@ -60,6 +61,130 @@ decodeRowsSpec = describe "decodeRows" $ mapM_ invalidIn = ["\"a"] validRes = [["foo", "bar"], ["baz", "quux"]] +decodeRawRowsSpec :: Spec +decodeRawRowsSpec = describe "decodeRawRows" $ mapM_ + ( \(label, input, expected) -> context label $ do + result <- runConduit $ sourceList input .| decodeRawRows .| consume + let + expLen = length expected + resLen = length result + xit ("should have " ++ show expLen ++ " rows") $ + resLen `shouldBe` expLen + mapM_ + ( \(n, expected', result') -> context ("row " ++ show n) $ + xit ("should be " ++ show result') $ + expected' `shouldBe` result' + ) $ zip3 [(0::Int)..] expected result + ) + + -- label, input, expected + [ ( "unquoted", unquotedIn, normalRes ) + , ( "quoted", quotedIn, normalRes ) + , ( "mixed", mixedIn, normalRes ) + , ( "CR only", crOnlyIn, normalRes ) + , ( "LF only", lfOnlyIn, normalRes ) + , ( "has quote", quoteIn, quoteRes ) + , ( "has CR", crIn, crRes ) + , ( "has LF", lfIn, lfRes ) + , ( "has CRLF", crlfIn, crlfRes ) + , ( "odd chunk", oddChunkIn, normalRes ) + , ( "no newline", noNewlineIn, normalRes ) + , ( "malformed", malformedIn, malformedRes ) + ] + + where + + unquotedIn = + [ "foo,bar\r\n" + , "baz,quux\r\n" + ] + + quotedIn = + [ "\"foo\",\"bar\"\r\n" + , "\"baz\",\"quux\"\r\n" + ] + + mixedIn = + [ "\"foo\",bar\r\n" + , "baz,\"quux\"\r\n" + ] + + crOnlyIn = + [ "foo,bar\r" + , "baz,quux\r" + ] + + lfOnlyIn = + [ "foo,bar\n" + , "baz,quux\n" + ] + + quoteIn = + [ "\"a\"\"b\",bar\r\n" + , "baz,quux\r\n" + ] + + crIn = + [ "\"a\rb\",bar\r\n" + , "baz,quux\r\n" + ] + + lfIn = + [ "\"a\nb\",bar\r\n" + , "baz,quux\r\n" + ] + + crlfIn = + [ "\"a\r\nb\",bar\r\n" + , "baz,quux\r\n" + ] + + oddChunkIn = + [ "foo," + , "bar\r\nbaz," + , "quux\r\n" + ] + + noNewlineIn = + [ "foo,bar\r\n" + , "baz,quux" + ] + + malformedIn = + [ "a\"b,bar\r\n" + , "baz,quux\r\n" + ] + + normalRes = + [ ["foo", "bar"] + , ["baz", "quux"] + ] + + quoteRes = + [ ["a\"b", "bar"] + , ["baz", "quux"] + ] + + crRes = + [ ["a\rb", "bar"] + , ["baz", "quux"] + ] + + lfRes = + [ ["a\nb", "bar"] + , ["baz", "quux"] + ] + + crlfRes = + [ ["a\r\nb", "bar"] + , ["baz", "quux"] + ] + + malformedRes = + [ ["", "bar"] + , ["baz", "quux"] + ] + decodeUTF8Spec :: Spec decodeUTF8Spec = describe "decodeUTF8" $ mapM_ ( \(label, input, expected) -> context label $