extra test in decodeRawRows

I needed to make sure that a trailing comma on a row would leave a blank cell at the end of the row.
implemented decodeRawRows
2022-04-21 00:02:38 -04:00 · 2022-04-20 21:21:51 -04:00 · 2022-04-19 20:12:12 -04:00 · 2022-04-19 19:33:35 -04:00
4 changed files with 313 additions and 12 deletions
--- a/csv-slurp.cabal
+++ b/csv-slurp.cabal
@ -32,6 +32,7 @@ library
    , bytestring
    , conduit >=1.3.4.2 && <1.4
    , text
+    , transformers
  default-language: Haskell2010
  autogen-modules: Paths_csv_slurp

@ -51,5 +52,6 @@ test-suite csv-slurp-test
    , csv-slurp
    , hspec >=2.8.5 && <2.9
    , text
+    , transformers
  default-language: Haskell2010
  autogen-modules: Paths_csv_slurp
--- a/package.yaml
+++ b/package.yaml
@ -26,6 +26,7 @@ dependencies:
 - bytestring
 - conduit >= 1.3.4.2 && < 1.4
 - text
+- transformers

 library:
  source-dirs: src
--- a/src/Data/CSV/Slurp.hs
+++ b/src/Data/CSV/Slurp.hs
@ -23,19 +23,24 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.

 -}

-{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE LambdaCase, OverloadedStrings #-}

 module Data.CSV.Slurp (
  decodeRows,
  decodeRawRows,
  decodeUTF8,
+  toBytes,
 ) where

-import Conduit (ConduitT, mapC, (.|))
+import Conduit (ConduitT, await, mapC, yield, (.|))
+import Control.Monad (unless)
+import Control.Monad.Trans.Class (lift)
+import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
 import qualified Data.ByteString as BS
 import Data.Maybe (fromMaybe)
 import qualified Data.Text as T
 import Data.Text.Encoding (decodeUtf8')
+import Data.Word (Word8)

 -- | decode the rows from a stream of ByteStrings
 decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
@ -43,7 +48,7 @@ decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8)

 -- | decode the rows returning raw ByteStrings instead of text
 decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
-decodeRawRows = return ()
+decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState

 -- | decode a raw ByteString into Text (if possible)
 decodeUTF8 :: BS.ByteString -> Maybe T.Text
@ -51,4 +56,150 @@ decodeUTF8 bs = case decodeUtf8' bs of
  Left _    -> Nothing
  Right txt -> Just txt

+-- | Flatten a stream of 'BS.ByteString' chunks into a stream of the
+-- individual bytes they contain, in order.  Finishes when upstream is
+-- exhausted.
+toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
+toBytes = await >>= maybe (return ()) emitChunk
+  where
+    -- yield every byte of one chunk, then go back for the next chunk
+    emitChunk chunk = do
+      mapM_ yield $ BS.unpack chunk
+      toBytes
+
+-- Internal
+
+-- | State carried through the decoding loop while rows are assembled.
+data DecodeState = DecodeState
+  { isQuoted  :: Bool
+  -- ^ 'True' while the field being read was opened with a quote
+  , fields    :: [BS.ByteString]
+  -- ^ fields already committed to the current row, in order
+  , collected :: BS.ByteString
+  -- ^ bytes gathered so far for the field currently being read
+  } deriving (Eq, Show)
+
+-- | A decoding step: a 'DecodeState'-carrying action layered over a
+-- conduit that consumes single bytes and yields rows of raw fields.
+type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
+-- | A pure transformation of the 'DecodeState'.
+type Modifier = DecodeState -> DecodeState
+
+-- | The state decoding starts from: not inside a quoted field, no
+-- committed fields, and nothing collected.
+newDecodeState :: DecodeState
+newDecodeState = DecodeState False [] BS.empty
+
+-- Decoders
+
+-- | Main loop: read one byte and hand it to the matching handler.  When
+-- the input is exhausted, 'cleanup' flushes whatever is pending.
+decodeLoop :: Monad m => Decoder m
+decodeLoop = lift await >>= maybe cleanup dispatch
+  where
+    dispatch 0x22 = processQuote                -- double quote
+    dispatch 0x2c = processComma                -- comma
+    dispatch 0x0d = processCR                   -- carriage return
+    dispatch 0x0a = processLF                   -- line feed
+    dispatch byte = performAction $ addByte byte
+
+-- | Handle a double quote; what it means depends on whether a quoted
+-- field is currently open.
+processQuote :: Monad m => Decoder m
+processQuote = do
+  quoted <- gets isQuoted
+  if quoted
+    then processQuotedQuote
+    else processUnquotedQuote
+
+-- | Handle a comma: literal data inside a quoted field, otherwise the
+-- end of the current field.
+processComma :: Monad m => Decoder m
+processComma = do
+  quoted <- gets isQuoted
+  performAction $ if quoted then addByte 0x2c else commitField
+
+-- | Handle a carriage return: literal data inside a quoted field,
+-- otherwise a row terminator.
+processCR :: Monad m => Decoder m
+processCR = do
+  quoted <- gets isQuoted
+  if quoted
+    then performAction (addByte 0x0d)
+    else endRow
+
+-- | Handle a line feed: literal data inside a quoted field, otherwise a
+-- row terminator.
+processLF :: Monad m => Decoder m
+processLF = do
+  quoted <- gets isQuoted
+  if quoted
+    then performAction (addByte 0x0a)
+    else endRow
+
+-- | Handle a quote seen while inside a quoted field.  The following
+-- byte decides what it meant:
+--
+-- * another quote: an escaped quote, kept as a single literal quote
+-- * comma: the quote closed the field
+-- * CR or LF: the quote closed the field and the row
+-- * anything else: the field is malformed and is discarded
+-- * end of input: the quote closed the final field of the final row
+processQuotedQuote :: Monad m => Decoder m
+processQuotedQuote = lift await >>= \case
+  Just 0x22 -> performAction $ addByte 0x22 -- quote
+  Just 0x2c -> performAction commitField    -- comma
+  Just 0x0d -> commitRow                    -- carriage return
+  Just 0x0a -> commitRow                    -- line feed
+  Just _    -> corruptedField
+  Nothing   -> do
+    -- The field was properly closed, so keep it.  Going through
+    -- 'cleanup' here would see 'isQuoted' still set and throw the
+    -- completed field away as if it were unterminated.
+    modify commitField
+    gets fields >>= lift . yield
+
+-- | Handle a quote seen outside a quoted field.  It may only open a
+-- field, so it is accepted when nothing has been collected yet;
+-- otherwise the field is treated as corrupted.
+processUnquotedQuote :: Monad m => Decoder m
+processUnquotedQuote = do
+  atFieldStart <- gets (BS.null . collected)
+  if atFieldStart
+    then performAction setQuoted
+    else corruptedField
+
+-- | Handle an unquoted row terminator.  If nothing at all has been read
+-- for the row (for example the LF of a CRLF pair, or a blank line) the
+-- terminator is skipped; otherwise the row is committed.
+endRow :: Monad m => Decoder m
+endRow = get >>= \st ->
+  let nothingRead = null (fields st) && BS.null (collected st)
+  in if nothingRead then decodeLoop else commitRow
+
+-- | Commit the field in progress, emit the finished row downstream,
+-- then clear the row and carry on decoding.
+commitRow :: Monad m => Decoder m
+commitRow = do
+  modify commitField
+  row <- gets fields
+  lift $ yield row
+  performAction dropFields
+
+-- | Discard what has been collected for a malformed field, then skip
+-- the rest of that field.
+corruptedField :: Monad m => Decoder m
+corruptedField = modify dropField >> ignoreField
+
+-- | Skip the remainder of a malformed field.  Bytes are thrown away
+-- until a comma (the blanked field is committed and decoding resumes),
+-- a CR or LF (the row is committed), or the end of input.
+ignoreField :: Monad m => Decoder m
+ignoreField = lift await >>= \case
+  Just 0x2c -> performAction commitField -- comma
+  Just 0x0d -> commitRow                 -- carriage return
+  -- LF must end the row as well; otherwise a malformed last field in
+  -- LF-only input swallows the line break and the next row is merged
+  -- into this one.
+  Just 0x0a -> commitRow                 -- line feed
+  Just _    -> ignoreField
+  Nothing   -> cleanup
+
+-- | Flush pending state once the input is exhausted.
+--
+-- * Inside an unterminated quoted field: the partial contents are
+--   discarded and a blank field is committed in its place.
+-- * Otherwise, if a row has been started (fields were committed or
+--   bytes were collected), the field in progress is committed even when
+--   it is empty, so a trailing comma without a final newline still
+--   produces a blank last cell.
+--
+-- The row is then emitted if it contains any fields.
+cleanup :: Monad m => Decoder m
+cleanup = do
+  quoted <- gets isQuoted
+  if quoted
+    then modify $ commitField . dropField
+    else do
+      nothingRead <- gets $ \s -> null (fields s) && BS.null (collected s)
+      unless nothingRead $ modify commitField
+  row <- gets fields
+  unless (null row) $
+    lift $ yield row
+
+-- | Apply a state modifier, then continue with the main decoding loop.
+performAction :: Monad m => Modifier -> Decoder m
+performAction f = modify f >> decodeLoop
+
+-- Modifiers
+
+-- | Append one byte to the field currently being collected.
+addByte :: Word8 -> Modifier
+addByte b s = s { collected = collected s `BS.snoc` b }
+
+-- | Finish the current field: append what was collected to the end of
+-- the row, start a fresh empty field, and leave quoted mode.
+commitField :: Modifier
+commitField s = s
+  { isQuoted  = False
+  , fields    = fields s ++ [collected s]
+  , collected = BS.empty
+  }
+
+-- | Forget every field committed to the current row.
+dropFields :: Modifier
+dropFields st = st { fields = mempty }
+
+-- | Throw away the field in progress and leave quoted mode; fields that
+-- were already committed are untouched.
+dropField :: Modifier
+dropField st = st { isQuoted = False, collected = BS.empty }
+
+-- | Enter quoted mode for the field being read.
+setQuoted :: Modifier
+setQuoted st = st { isQuoted = True }
+
 --jl
--- a/test/Data/CSV/SlurpSpec.hs
+++ b/test/Data/CSV/SlurpSpec.hs
@ -23,15 +23,18 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.
 module Data.CSV.SlurpSpec (spec) where

 import Conduit (runConduit, (.|))
+import Data.Char (ord)
 import Data.Conduit.List (consume, sourceList)
-import Test.Hspec (Spec, context, describe, it, shouldBe, xit)
+import Test.Hspec (Spec, context, describe, it, shouldBe)

 import Data.CSV.Slurp

 spec :: Spec
 spec = describe "Data.CSV.Slurp" $ do
  decodeRowsSpec
+  decodeRawRowsSpec
  decodeUTF8Spec
+  toBytesSpec

 decodeRowsSpec :: Spec
 decodeRowsSpec = describe "decodeRows" $ mapM_
@ -40,25 +43,161 @@ decodeRowsSpec = describe "decodeRows" $ mapM_
    let
      expLen = length expected
      resLen = length result
-    xit ("should have " ++ show expLen ++ " rows") $
+    it ("should have " ++ show expLen ++ " rows") $
      resLen `shouldBe` expLen
    mapM_
      ( \(n, expected', result') -> context ("row " ++ show n) $
-        xit ("should be " ++ show expected') $
+        it ("should be " ++ show expected') $
          result' `shouldBe` expected'
      ) $ zip3 [(0::Int)..] expected result
  )

  --  label,     input,     expected
-  [ ( "valid",   validIn,   validRes )
-  , ( "invalid", invalidIn, []       )
-  , ( "empty",   [],        []       )
+  [ ( "valid",   validIn,   validRes   )
+  , ( "invalid", invalidIn, invalidRes )
+  , ( "empty",   [],        []         )
  ]

  where
-    validIn   = ["foo,bar\r\n", "baz,quuux\r\n"]
-    invalidIn = ["\"a"]
-    validRes  = [["foo", "bar"], ["baz", "quux"]]
+    validIn    = ["foo,bar\r\n", "baz,quux\r\n"]
+    invalidIn  = ["\"a"]
+    validRes   = [["foo", "bar"], ["baz", "quux"]]
+    invalidRes = [[""]]
+
+-- Table-driven spec for decodeRawRows: each case feeds its chunks
+-- through the conduit, then checks the row count and every row.
+decodeRawRowsSpec :: Spec
+decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
+  ( \(label, input, expected) -> context label $ do
+    result <- runConduit $ sourceList input .| decodeRawRows .| consume
+    let
+      expLen = length expected
+      resLen = length result
+    it ("should have " ++ show expLen ++ " rows") $
+      resLen `shouldBe` expLen
+    mapM_
+      ( \(n, expected', result') -> context ("row " ++ show n) $
+        -- describe the row by what is expected, as decodeRowsSpec does,
+        -- so the label stays meaningful when the result is wrong
+        it ("should be " ++ show expected') $
+          result' `shouldBe` expected'
+      ) $ zip3 [(0::Int)..] expected result
+  )
+
+  --  label,         input,      expected
+  [ ( "unquoted",    unquotedIn,  normalRes    )
+  , ( "quoted",      quotedIn,    normalRes    )
+  , ( "mixed",       mixedIn,     normalRes    )
+  , ( "CR only",     crOnlyIn,    normalRes    )
+  , ( "LF only",     lfOnlyIn,    normalRes    )
+  , ( "has quote",   quoteIn,     quoteRes     )
+  , ( "has CR",      crIn,        crRes        )
+  , ( "has LF",      lfIn,        lfRes        )
+  , ( "has CRLF",    crlfIn,      crlfRes      )
+  , ( "odd chunk",   oddChunkIn,  normalRes    )
+  , ( "no newline",  noNewlineIn, normalRes    )
+  , ( "malformed",   malformedIn, malformedRes )
+  , ( "blank end",   blankEndIn,  blankEndRes  )
+  ]
+
+  where
+
+    unquotedIn =
+      [ "foo,bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    quotedIn =
+      [ "\"foo\",\"bar\"\r\n"
+      ,  "\"baz\",\"quux\"\r\n"
+      ]
+
+    mixedIn =
+      [ "\"foo\",bar\r\n"
+      , "baz,\"quux\"\r\n"
+      ]
+
+    crOnlyIn =
+      [ "foo,bar\r"
+      , "baz,quux\r"
+      ]
+
+    lfOnlyIn =
+      [ "foo,bar\n"
+      , "baz,quux\n"
+      ]
+
+    quoteIn =
+      [ "\"a\"\"b\",bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    crIn =
+      [ "\"a\rb\",bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    lfIn =
+      [ "\"a\nb\",bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    crlfIn =
+      [ "\"a\r\nb\",bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    oddChunkIn =
+      [ "foo,"
+      , "bar\r\nbaz,"
+      , "quux\r\n"
+      ]
+
+    noNewlineIn =
+      [ "foo,bar\r\n"
+      , "baz,quux"
+      ]
+
+    malformedIn =
+      [ "a\"b,bar\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    blankEndIn =
+      [ "foo,bar,\r\n"
+      , "baz,quux\r\n"
+      ]
+
+    normalRes =
+      [ ["foo", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    quoteRes =
+      [ ["a\"b", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    crRes =
+      [ ["a\rb", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    lfRes =
+      [ ["a\nb", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    crlfRes =
+      [ ["a\r\nb", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    malformedRes =
+      [ ["", "bar"]
+      , ["baz", "quux"]
+      ]
+
+    blankEndRes =
+      [ ["foo", "bar", ""]
+      , ["baz", "quux"]
+      ]

 decodeUTF8Spec :: Spec
 decodeUTF8Spec = describe "decodeUTF8" $ mapM_
@ -74,4 +213,12 @@ decodeUTF8Spec = describe "decodeUTF8" $ mapM_
  , ( "blank",        "",         Just ""      )
  ]

+-- Checks that toBytes flattens chunked input into its bytes, in order.
+toBytesSpec :: Spec
+toBytesSpec = describe "toBytes" $
+  it ("should be " ++ show expected) $ do
+    result <- runConduit $ sourceList input .| toBytes .| consume
+    result `shouldBe` expected
+  where
+    input    = ["ab", "cd"]
+    expected = map (fromIntegral . ord) "abcd"
+
 --jl
Author	SHA1	Message	Date
Jonathan Lamothe	9ec4d7e9c3	extra test in decodeRawRows I needed to make sure that a trailing comma on a row would leave a blank cell at the end of the row.	2022-04-21 00:02:38 -04:00
Jonathan Lamothe	389c206063	implemented decodeRawRows	2022-04-20 21:21:51 -04:00
Jonathan Lamothe	63b97649a6	implemented toBytes	2022-04-19 20:12:12 -04:00
Jonathan Lamothe	67e85f0a78	basic structure for decodeRawRows	2022-04-19 19:33:35 -04:00