Compare commits

..

No commits in common. "9ec4d7e9c34a23ef813c89abf401ccdf236454c8" and "365dc3995b813df40a7e121745d702690fb18eb7" have entirely different histories.

5 changed files with 4 additions and 389 deletions

View File

@ -29,10 +29,6 @@ library
ghc-options: -Wall ghc-options: -Wall
build-depends: build-depends:
base >=4.7 && <5 base >=4.7 && <5
, bytestring
, conduit >=1.3.4.2 && <1.4
, text
, transformers
default-language: Haskell2010 default-language: Haskell2010
autogen-modules: Paths_csv_slurp autogen-modules: Paths_csv_slurp
@ -47,11 +43,7 @@ test-suite csv-slurp-test
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
base >=4.7 && <5 base >=4.7 && <5
, bytestring
, conduit >=1.3.4.2 && <1.4
, csv-slurp , csv-slurp
, hspec >=2.8.5 && <2.9 , hspec >=2.8.5 && <2.9
, text
, transformers
default-language: Haskell2010 default-language: Haskell2010
autogen-modules: Paths_csv_slurp autogen-modules: Paths_csv_slurp

View File

@ -23,10 +23,6 @@ ghc-options:
dependencies: dependencies:
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- bytestring
- conduit >= 1.3.4.2 && < 1.4
- text
- transformers
library: library:
source-dirs: src source-dirs: src

View File

@ -23,183 +23,6 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
-} -}
{-# LANGUAGE LambdaCase, OverloadedStrings #-} module Data.CSV.Slurp where
module Data.CSV.Slurp (
decodeRows,
decodeRawRows,
decodeUTF8,
toBytes,
) where
import Conduit (ConduitT, await, mapC, yield, (.|))
import Control.Monad (unless)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
import qualified Data.ByteString as BS
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8')
import Data.Word (Word8)
-- | Decode a stream of 'BS.ByteString' chunks into rows of 'T.Text' fields.
--
-- Rows are produced by 'decodeRawRows'; each field is then run through
-- 'decodeUTF8', and a field that is not valid UTF-8 becomes the empty text.
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
decodeRows = decodeRawRows .| mapC (fmap fieldToText)
  where
    -- Substitute an empty field when UTF-8 decoding fails.
    fieldToText = fromMaybe "" . decodeUTF8
-- | Decode a stream of 'BS.ByteString' chunks into rows whose fields are
-- left as raw 'BS.ByteString's (no text decoding is attempted).
--
-- The chunks are flattened into single bytes by 'toBytes' and then fed to
-- the stateful row decoder, starting from 'newDecodeState'.
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
decodeRawRows = toBytes .| rowDecoder
  where
    rowDecoder = evalStateT decodeLoop newDecodeState
-- | Decode a raw 'BS.ByteString' as UTF-8.
--
-- Returns 'Just' the decoded text, or 'Nothing' when the bytes are not
-- valid UTF-8 (the decoding error itself is discarded).
decodeUTF8 :: BS.ByteString -> Maybe T.Text
decodeUTF8 = either (const Nothing) Just . decodeUtf8'
-- | Convert a stream of 'BS.ByteString' chunks into a stream of single
-- bytes, preserving order.
--
-- Each incoming chunk is unpacked and its bytes are yielded one by one;
-- the conduit finishes when upstream has no more chunks.
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
toBytes = await >>= maybe (return ()) emitChunk
  where
    -- Yield every byte of one chunk, then go back for the next chunk.
    emitChunk chunk = do
      mapM_ yield (BS.unpack chunk)
      toBytes
-- Internal
-- | Mutable state threaded through the row decoder.
--
-- The fields are strict: the state is rewritten once per input byte via
-- 'modify', and lazy fields would otherwise pile up unevaluated updates
-- until a row is finally consumed downstream.
data DecodeState = DecodeState
  { isQuoted :: !Bool
    -- ^ 'True' while the decoder is inside a quoted field
  , fields :: ![BS.ByteString]
    -- ^ fields already completed for the current row, in order
  , collected :: !BS.ByteString
    -- ^ bytes gathered so far for the field currently being read
  } deriving (Eq, Show)
-- | A decoding step: a 'StateT' action over 'DecodeState' layered on a
-- conduit that consumes single bytes and yields rows of raw fields.
type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
-- | A pure transformation of the decoder state.
type Modifier = DecodeState -> DecodeState
-- | The initial decoder state: not inside a quoted field, no completed
-- fields, and nothing collected for the current field.
newDecodeState :: DecodeState
newDecodeState = DecodeState False [] ""
-- Decoders
-- | Main loop of the decoder: read one byte and dispatch on it.
--
-- Quote, comma, carriage return and line feed get dedicated handlers; any
-- other byte is appended to the current field. End of input triggers
-- 'cleanup'.
decodeLoop :: Monad m => Decoder m
decodeLoop = lift await >>= maybe cleanup dispatch
  where
    dispatch byte
      | byte == 0x22 = processQuote   -- double quote
      | byte == 0x2c = processComma   -- comma
      | byte == 0x0d = processCR      -- carriage return
      | byte == 0x0a = processLF      -- line feed
      | otherwise = performAction (addByte byte)
-- | Handle a double quote, choosing the handler according to whether the
-- decoder is currently inside a quoted field.
processQuote :: Monad m => Decoder m
processQuote = do
  quoted <- gets isQuoted
  if quoted
    then processQuotedQuote
    else processUnquotedQuote
-- | Handle a comma: inside a quoted field it is ordinary data, otherwise
-- it terminates the current field.
processComma :: Monad m => Decoder m
processComma = do
  quoted <- gets isQuoted
  performAction $ if quoted then addByte 0x2c else commitField
-- | Handle a carriage return: inside a quoted field it is ordinary data,
-- otherwise it ends the current row.
processCR :: Monad m => Decoder m
processCR = do
  quoted <- gets isQuoted
  if quoted
    then performAction (addByte 0xd)
    else endRow
-- | Handle a line feed: inside a quoted field it is ordinary data,
-- otherwise it ends the current row.
processLF :: Monad m => Decoder m
processLF = do
  quoted <- gets isQuoted
  if quoted
    then performAction (addByte 0xa)
    else endRow
-- | Handle a double quote seen while inside a quoted field.
--
-- The meaning of the quote depends on the byte that follows it:
--
-- * another quote: an escaped quote, appended to the field;
-- * a comma: the quote closed the field, which is committed;
-- * CR or LF: the quote closed the last field of the row;
-- * end of input: the quote closed the last field of the last row;
-- * anything else: the field is malformed and is discarded.
processQuotedQuote :: Monad m => Decoder m
processQuotedQuote = lift await >>= \case
  Just 0x22 -> performAction $ addByte 0x22 -- escaped quote
  Just 0x2c -> performAction commitField    -- comma
  Just 0x0d -> commitRow                    -- carriage return
  Just 0x0a -> commitRow                    -- line feed
  Just _ -> corruptedField
  -- The field was properly terminated, so emit it with its row instead of
  -- treating it as an unterminated quoted field. 'commitRow' re-enters
  -- 'decodeLoop', which sees end of input again and finds nothing pending.
  Nothing -> commitRow
-- | Handle a double quote seen outside a quoted field.
--
-- A quote is only allowed to open a field, i.e. when nothing has been
-- collected for it yet; a quote in the middle of an unquoted field marks
-- the field as corrupted.
processUnquotedQuote :: Monad m => Decoder m
processUnquotedQuote = do
  atFieldStart <- gets (BS.null . collected)
  if atFieldStart
    then performAction setQuoted
    else corruptedField
-- | End the current row on an unquoted line terminator.
--
-- When no field has been completed and nothing has been collected, there
-- is no row to emit (for example the LF of a CRLF pair, or a blank line),
-- so decoding simply continues.
endRow :: Monad m => Decoder m
endRow = do
  rowIsBlank <- gets $ \s -> null (fields s) && BS.null (collected s)
  if rowIsBlank
    then decodeLoop
    else commitRow
-- | Finish the current field, yield the completed row downstream, reset
-- the list of fields and continue decoding.
commitRow :: Monad m => Decoder m
commitRow = do
  modify commitField
  row <- gets fields
  lift (yield row)
  modify dropFields
  decodeLoop
-- | React to a malformed field: throw away what was collected for it and
-- skip the rest of the field.
corruptedField :: Monad m => Decoder m
corruptedField = modify dropField >> ignoreField
-- | Skip the remaining bytes of a corrupted field.
--
-- Bytes are discarded until the field ends: a comma commits the (already
-- emptied) field and resumes normal decoding, a carriage return or line
-- feed commits the row, and end of input triggers 'cleanup'.
ignoreField :: Monad m => Decoder m
ignoreField = lift await >>= \case
  Just 0x2c -> performAction commitField -- comma
  Just 0x0d -> commitRow                 -- carriage return
  -- LF must end the row too, otherwise input that uses LF-only line
  -- endings would have the following rows swallowed by the skip.
  Just 0x0a -> commitRow                 -- line feed
  Just _ -> ignoreField
  Nothing -> cleanup
-- | Flush whatever is pending when the input ends.
--
-- * Inside an unterminated quoted field, the partial content is dropped
--   and an empty field is committed in its place.
-- * Otherwise the current field is committed when a row is in progress,
--   i.e. when bytes were collected or earlier fields of the row exist.
--   The latter covers input ending right after a comma, whose final empty
--   field would otherwise be lost.
--
-- The resulting row is yielded only if it contains at least one field.
cleanup :: Monad m => Decoder m
cleanup = do
  s <- get
  if isQuoted s
    then modify (commitField . dropField)
    else unless (null (fields s) && BS.null (collected s)) $
      modify commitField
  row <- gets fields
  unless (null row) $
    lift $ yield row
-- | Apply a state modifier and then carry on with the main decode loop.
performAction :: Monad m => Modifier -> Decoder m
performAction f = modify f >> decodeLoop
-- Modifiers
-- | Append one byte to the field currently being collected.
addByte :: Word8 -> Modifier
addByte b s = s { collected = collected s `BS.snoc` b }
-- | Close the current field: append the collected bytes to the row's
-- fields, clear the collection buffer and leave quoted mode.
commitField :: Modifier
commitField s = s
  { isQuoted = False
  , fields = fields s ++ [collected s]
  , collected = ""
  }
-- | Forget all fields completed so far, ready for a new row.
dropFields :: Modifier
dropFields state = state { fields = [] }
-- | Discard the bytes collected for the current field and leave quoted
-- mode; fields already completed are kept.
dropField :: Modifier
dropField state = state { collected = "", isQuoted = False }
-- | Enter quoted mode for the current field.
setQuoted :: Modifier
setQuoted state = state { isQuoted = True }
--jl --jl

View File

@ -18,207 +18,11 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
-} -}
{-# LANGUAGE OverloadedStrings #-}
module Data.CSV.SlurpSpec (spec) where module Data.CSV.SlurpSpec (spec) where
import Conduit (runConduit, (.|)) import Test.Hspec (Spec, describe)
import Data.Char (ord)
import Data.Conduit.List (consume, sourceList)
import Test.Hspec (Spec, context, describe, it, shouldBe)
import Data.CSV.Slurp
spec :: Spec spec :: Spec
spec = describe "Data.CSV.Slurp" $ do spec = describe "Data.CSV.Slurp" $ return ()
decodeRowsSpec
decodeRawRowsSpec
decodeUTF8Spec
toBytesSpec
-- | Table-driven checks for 'decodeRows': each case feeds a list of chunks
-- through the conduit and compares the row count and every row with the
-- expected output.
decodeRowsSpec :: Spec
decodeRowsSpec = describe "decodeRows" $ mapM_ checkCase
  -- label, input, expected
  [ ( "valid", validIn, validRes )
  , ( "invalid", invalidIn, invalidRes )
  , ( "empty", [], [] )
  ]
  where
    -- One test case: a row-count check followed by one check per row.
    checkCase (label, input, expected) = context label $ do
      result <- runConduit $ sourceList input .| decodeRows .| consume
      let
        expLen = length expected
        resLen = length result
      it ("should have " ++ show expLen ++ " rows") $
        resLen `shouldBe` expLen
      mapM_ checkRow $ zip3 [(0::Int)..] expected result

    checkRow (n, expected', result') = context ("row " ++ show n) $
      it ("should be " ++ show expected') $
        result' `shouldBe` expected'

    validIn = ["foo,bar\r\n", "baz,quux\r\n"]
    invalidIn = ["\"a"]
    validRes = [["foo", "bar"], ["baz", "quux"]]
    invalidRes = [[""]]
-- | Table-driven checks for 'decodeRawRows': each case feeds a list of
-- chunks through the conduit and compares the row count and every row
-- with the expected output.
decodeRawRowsSpec :: Spec
decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
  ( \(label, input, expected) -> context label $ do
      result <- runConduit $ sourceList input .| decodeRawRows .| consume
      let
        expLen = length expected
        resLen = length result
      it ("should have " ++ show expLen ++ " rows") $
        resLen `shouldBe` expLen
      mapM_
        ( \(n, expected', result') -> context ("row " ++ show n) $
            -- The label names the expected value, so a failing test does
            -- not read "should be <the wrong value>".
            it ("should be " ++ show expected') $
              result' `shouldBe` expected'
        ) $ zip3 [(0::Int)..] expected result
  )
  -- label, input, expected
  [ ( "unquoted", unquotedIn, normalRes )
  , ( "quoted", quotedIn, normalRes )
  , ( "mixed", mixedIn, normalRes )
  , ( "CR only", crOnlyIn, normalRes )
  , ( "LF only", lfOnlyIn, normalRes )
  , ( "has quote", quoteIn, quoteRes )
  , ( "has CR", crIn, crRes )
  , ( "has LF", lfIn, lfRes )
  , ( "has CRLF", crlfIn, crlfRes )
  , ( "odd chunk", oddChunkIn, normalRes )
  , ( "no newline", noNewlineIn, normalRes )
  , ( "malformed", malformedIn, malformedRes )
  , ( "blank end", blankEndIn, blankEndRes )
  ]
  where
    unquotedIn =
      [ "foo,bar\r\n"
      , "baz,quux\r\n"
      ]
    quotedIn =
      [ "\"foo\",\"bar\"\r\n"
      , "\"baz\",\"quux\"\r\n"
      ]
    mixedIn =
      [ "\"foo\",bar\r\n"
      , "baz,\"quux\"\r\n"
      ]
    crOnlyIn =
      [ "foo,bar\r"
      , "baz,quux\r"
      ]
    lfOnlyIn =
      [ "foo,bar\n"
      , "baz,quux\n"
      ]
    quoteIn =
      [ "\"a\"\"b\",bar\r\n"
      , "baz,quux\r\n"
      ]
    crIn =
      [ "\"a\rb\",bar\r\n"
      , "baz,quux\r\n"
      ]
    lfIn =
      [ "\"a\nb\",bar\r\n"
      , "baz,quux\r\n"
      ]
    crlfIn =
      [ "\"a\r\nb\",bar\r\n"
      , "baz,quux\r\n"
      ]
    oddChunkIn =
      [ "foo,"
      , "bar\r\nbaz,"
      , "quux\r\n"
      ]
    noNewlineIn =
      [ "foo,bar\r\n"
      , "baz,quux"
      ]
    malformedIn =
      [ "a\"b,bar\r\n"
      , "baz,quux\r\n"
      ]
    blankEndIn =
      [ "foo,bar,\r\n"
      , "baz,quux\r\n"
      ]
    normalRes =
      [ ["foo", "bar"]
      , ["baz", "quux"]
      ]
    quoteRes =
      [ ["a\"b", "bar"]
      , ["baz", "quux"]
      ]
    crRes =
      [ ["a\rb", "bar"]
      , ["baz", "quux"]
      ]
    lfRes =
      [ ["a\nb", "bar"]
      , ["baz", "quux"]
      ]
    crlfRes =
      [ ["a\r\nb", "bar"]
      , ["baz", "quux"]
      ]
    malformedRes =
      [ ["", "bar"]
      , ["baz", "quux"]
      ]
    blankEndRes =
      [ ["foo", "bar", ""]
      , ["baz", "quux"]
      ]
-- | Table-driven checks for 'decodeUTF8' covering ASCII, multi-byte,
-- invalid and empty input.
decodeUTF8Spec :: Spec
decodeUTF8Spec = describe "decodeUTF8" $ mapM_ checkCase
  -- label, input, expected
  [ ( "plain ASCII", "hello", Just "hello" )
  , ( "valid UTF8", "\xc3\xa9", Just "é" )
  , ( "invalid UTF8", "\xff", Nothing )
  , ( "blank", "", Just "" )
  ]
  where
    checkCase (label, input, expected) = context label $
      it ("should be " ++ show expected) $
        decodeUTF8 input `shouldBe` expected
-- | Checks that 'toBytes' flattens a list of chunks into their bytes, in
-- order.
toBytesSpec :: Spec
toBytesSpec = describe "toBytes" $
  it ("should be " ++ show expected) $ do
    result <- runConduit $ sourceList input .| toBytes .| consume
    result `shouldBe` expected
  where
    input = ["ab", "cd"]
    expected = map (fromIntegral . ord) "abcd"
--jl --jl

View File

@ -25,6 +25,6 @@ import Test.Hspec (hspec)
import qualified Data.CSV.SlurpSpec as Slurp import qualified Data.CSV.SlurpSpec as Slurp
main :: IO () main :: IO ()
main = hspec Slurp.spec main = hspec $ Slurp.spec
--jl --jl