Compare commits
No commits in common. "9ec4d7e9c34a23ef813c89abf401ccdf236454c8" and "365dc3995b813df40a7e121745d702690fb18eb7" have entirely different histories.
9ec4d7e9c3
...
365dc3995b
|
@ -29,10 +29,6 @@ library
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.7 && <5
|
base >=4.7 && <5
|
||||||
, bytestring
|
|
||||||
, conduit >=1.3.4.2 && <1.4
|
|
||||||
, text
|
|
||||||
, transformers
|
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
autogen-modules: Paths_csv_slurp
|
autogen-modules: Paths_csv_slurp
|
||||||
|
|
||||||
|
@ -47,11 +43,7 @@ test-suite csv-slurp-test
|
||||||
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N
|
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.7 && <5
|
base >=4.7 && <5
|
||||||
, bytestring
|
|
||||||
, conduit >=1.3.4.2 && <1.4
|
|
||||||
, csv-slurp
|
, csv-slurp
|
||||||
, hspec >=2.8.5 && <2.9
|
, hspec >=2.8.5 && <2.9
|
||||||
, text
|
|
||||||
, transformers
|
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
autogen-modules: Paths_csv_slurp
|
autogen-modules: Paths_csv_slurp
|
||||||
|
|
|
@ -23,10 +23,6 @@ ghc-options:
|
||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- base >= 4.7 && < 5
|
- base >= 4.7 && < 5
|
||||||
- bytestring
|
|
||||||
- conduit >= 1.3.4.2 && < 1.4
|
|
||||||
- text
|
|
||||||
- transformers
|
|
||||||
|
|
||||||
library:
|
library:
|
||||||
source-dirs: src
|
source-dirs: src
|
||||||
|
|
|
@ -23,183 +23,6 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
-}
|
-}
|
||||||
|
|
||||||
{-# LANGUAGE LambdaCase, OverloadedStrings #-}
|
module Data.CSV.Slurp where
|
||||||
|
|
||||||
module Data.CSV.Slurp (
|
|
||||||
decodeRows,
|
|
||||||
decodeRawRows,
|
|
||||||
decodeUTF8,
|
|
||||||
toBytes,
|
|
||||||
) where
|
|
||||||
|
|
||||||
import Conduit (ConduitT, await, mapC, yield, (.|))
|
|
||||||
import Control.Monad (unless)
|
|
||||||
import Control.Monad.Trans.Class (lift)
|
|
||||||
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
|
|
||||||
import qualified Data.ByteString as BS
|
|
||||||
import Data.Maybe (fromMaybe)
|
|
||||||
import qualified Data.Text as T
|
|
||||||
import Data.Text.Encoding (decodeUtf8')
|
|
||||||
import Data.Word (Word8)
|
|
||||||
|
|
||||||
-- | Decode a stream of 'BS.ByteString' chunks into rows of 'T.Text' fields.
--
-- Rows are produced by 'decodeRawRows'; each field is then passed through
-- 'decodeUTF8', and a field that fails to decode becomes the empty string.
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
decodeRows = decodeRawRows .| mapC (map toText)
  where
    toText raw = fromMaybe "" (decodeUTF8 raw)
|
|
||||||
|
|
||||||
-- | Decode a stream of 'BS.ByteString' chunks into rows of raw, undecoded
-- fields.
--
-- The chunks are flattened into single bytes by 'toBytes' and fed to the
-- stateful decoder, which starts from 'newDecodeState'.
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
decodeRawRows = toBytes .| (decodeLoop `evalStateT` newDecodeState)
|
|
||||||
|
|
||||||
-- | Interpret a raw 'BS.ByteString' as UTF-8.
--
-- Returns 'Just' the decoded text, or 'Nothing' when 'decodeUtf8'' reports
-- a decoding error.
decodeUTF8 :: BS.ByteString -> Maybe T.Text
decodeUTF8 = either (const Nothing) Just . decodeUtf8'
|
|
||||||
|
|
||||||
-- | Flatten a stream of 'BS.ByteString' chunks into a stream of individual
-- bytes, preserving order. Finishes when upstream is exhausted.
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
toBytes = await >>= maybe (return ()) emitChunk
  where
    -- yield every byte of one chunk, then carry on with the next chunk
    emitChunk chunk = mapM_ yield (BS.unpack chunk) >> toBytes
|
|
||||||
|
|
||||||
-- Internal
|
|
||||||
|
|
||||||
-- | Working state threaded through the byte-level decoder.
data DecodeState = DecodeState
  { isQuoted  :: Bool
    -- ^ 'True' while the decoder is inside a quoted field
  , fields    :: [BS.ByteString]
    -- ^ fields already committed for the current row, in order
  , collected :: BS.ByteString
    -- ^ bytes gathered so far for the field in progress
  } deriving (Eq, Show)
|
|
||||||
|
|
||||||
-- | A decoding step: consumes bytes, may yield finished rows, and carries
-- the 'DecodeState' along.
type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()

-- | A pure transformation of the 'DecodeState'.
type Modifier = DecodeState -> DecodeState
|
|
||||||
|
|
||||||
-- | The state the decoder starts from: not inside quotes, no committed
-- fields and nothing collected.
newDecodeState :: DecodeState
newDecodeState =
  DecodeState { isQuoted = False, fields = [], collected = "" }
|
|
||||||
|
|
||||||
-- Decoders
|
|
||||||
|
|
||||||
-- | Main loop: take the next byte and hand it to the matching handler.
-- When the input is exhausted, 'cleanup' flushes whatever is pending.
decodeLoop :: Monad m => Decoder m
decodeLoop = lift await >>= maybe cleanup dispatch
  where
    dispatch 0x22  = processQuote                     -- double quote
    dispatch 0x2c  = processComma                     -- comma
    dispatch 0x0d  = processCR                        -- carriage return
    dispatch 0x0a  = processLF                        -- line feed
    dispatch other = performAction (addByte other)    -- ordinary data byte
|
|
||||||
|
|
||||||
-- | Handle a double quote, depending on whether a quoted field is open.
processQuote :: Monad m => Decoder m
processQuote = do
  insideQuotes <- gets isQuoted
  if insideQuotes
    then processQuotedQuote
    else processUnquotedQuote
|
|
||||||
|
|
||||||
-- | Handle a comma: literal data inside quotes, a field separator otherwise.
processComma :: Monad m => Decoder m
processComma = do
  insideQuotes <- gets isQuoted
  performAction $ if insideQuotes then addByte 0x2c else commitField
|
|
||||||
|
|
||||||
-- | Handle a carriage return: literal data inside quotes, otherwise the end
-- of a row.
processCR :: Monad m => Decoder m
processCR = do
  insideQuotes <- gets isQuoted
  if insideQuotes
    then performAction (addByte 0x0d)
    else endRow
|
|
||||||
|
|
||||||
-- | Handle a line feed: literal data inside quotes, otherwise the end of a
-- row.
processLF :: Monad m => Decoder m
processLF = do
  insideQuotes <- gets isQuoted
  if insideQuotes
    then performAction (addByte 0x0a)
    else endRow
|
|
||||||
|
|
||||||
-- | Handle a double quote seen inside a quoted field by inspecting the byte
-- that follows it:
--
-- * another quote is an escaped quote and is added to the field;
-- * a comma closes the field;
-- * a CR or LF closes the field and the row;
-- * end of input closes the field and the final row;
-- * anything else marks the field as corrupted.
processQuotedQuote :: Monad m => Decoder m
processQuotedQuote = lift await >>= \case
  Just 0x22 -> performAction $ addByte 0x22 -- quote
  Just 0x2c -> performAction commitField    -- comma
  Just 0x0d -> commitRow                    -- carriage return
  Just 0x0a -> commitRow                    -- line feed
  Just _    -> corruptedField
  -- The quote just read was a closing quote, so the field is complete.
  -- Going straight to 'cleanup' here would still see 'isQuoted' set and
  -- discard the collected bytes as if the quote had never been closed.
  Nothing   -> commitRow
|
|
||||||
|
|
||||||
-- | Handle a double quote seen outside a quoted field. It opens a quoted
-- field only when nothing has been collected for the field yet; otherwise
-- the field is treated as corrupted.
processUnquotedQuote :: Monad m => Decoder m
processUnquotedQuote = do
  atFieldStart <- gets (BS.null . collected)
  if atFieldStart
    then performAction setQuoted
    else corruptedField
|
|
||||||
|
|
||||||
-- | Handle an unquoted line terminator. When no field has been committed and
-- nothing has been collected the terminator is skipped (this is what absorbs
-- the LF of a CRLF pair); otherwise the row is committed.
endRow :: Monad m => Decoder m
endRow = gets rowIsBlank >>= \case
  True  -> decodeLoop
  False -> commitRow
  where
    rowIsBlank st = null (fields st) && BS.null (collected st)
|
|
||||||
|
|
||||||
-- | Close the field in progress, yield the finished row downstream, clear
-- the committed fields and continue decoding.
commitRow :: Monad m => Decoder m
commitRow = do
  modify commitField
  row <- gets fields
  lift (yield row)
  modify dropFields
  decodeLoop
|
|
||||||
|
|
||||||
-- | Give up on the current field: throw away what was collected for it and
-- skip the rest of the field.
corruptedField :: Monad m => Decoder m
corruptedField = modify dropField >> ignoreField
|
|
||||||
|
|
||||||
-- | Skip the remaining bytes of a corrupted field.
--
-- Bytes are discarded until the field or the row ends:
--
-- * a comma commits the field as it stands and resumes normal decoding;
-- * a CR or LF commits the row;
-- * end of input hands over to 'cleanup'.
ignoreField :: Monad m => Decoder m
ignoreField = lift await >>= \case
  Just 0x2c -> performAction commitField -- comma
  Just 0x0d -> commitRow                 -- carriage return
  -- LF must end the row as well: without this case a corrupted field in an
  -- LF-terminated row would run on into the following row. After a CR the
  -- LF of a CRLF pair is still absorbed by 'endRow' as a blank row.
  Just 0x0a -> commitRow                 -- line feed
  Just _    -> ignoreField
  Nothing   -> cleanup
|
|
||||||
|
|
||||||
-- | Flush the decoder at end of input.
--
-- * If a quoted field is still open it was never terminated: its contents
--   are dropped and an empty field is committed in its place.
-- * Otherwise the field in progress is committed whenever the current row
--   has any content, i.e. something was collected or at least one field was
--   already committed. The latter covers a final row ending in a comma with
--   no line terminator, which then yields its trailing empty field just as
--   it would with a terminator.
--
-- The resulting row is yielded unless it is empty.
cleanup :: Monad m => Decoder m
cleanup = do
  st <- get
  if isQuoted st
    then modify $ commitField . dropField
    else unless (null (fields st) && BS.null (collected st)) $
      modify commitField
  row <- gets fields
  unless (null row) $
    lift $ yield row
|
|
||||||
|
|
||||||
-- | Apply a 'Modifier' to the decoder state, then go back to the main loop.
performAction :: Monad m => Modifier -> Decoder m
performAction f = modify f >> decodeLoop
|
|
||||||
|
|
||||||
-- Modifiers
|
|
||||||
|
|
||||||
-- | Append one byte to the field in progress.
addByte :: Word8 -> Modifier
addByte b s = s { collected = collected s `BS.snoc` b }
|
|
||||||
|
|
||||||
-- | Close the field in progress: append the collected bytes to the end of
-- the committed fields, start a new empty field and leave quoted mode.
commitField :: Modifier
commitField s = s
  { isQuoted  = False
  , fields    = fields s ++ [collected s]
  , collected = ""
  }
|
|
||||||
|
|
||||||
-- | Forget every committed field, ready for the next row.
dropFields :: Modifier
dropFields st = st { fields = mempty }
|
|
||||||
|
|
||||||
-- | Discard the bytes collected for the field in progress and leave quoted
-- mode. Fields that were already committed are kept.
dropField :: Modifier
dropField st = st { isQuoted = False, collected = "" }
|
|
||||||
|
|
||||||
-- | Enter quoted mode.
setQuoted :: Modifier
setQuoted st = st { isQuoted = True }
|
|
||||||
|
|
||||||
--jl
|
--jl
|
||||||
|
|
|
@ -18,207 +18,11 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
-}
|
-}
|
||||||
|
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
|
||||||
|
|
||||||
module Data.CSV.SlurpSpec (spec) where
|
module Data.CSV.SlurpSpec (spec) where
|
||||||
|
|
||||||
import Conduit (runConduit, (.|))
|
import Test.Hspec (Spec, describe)
|
||||||
import Data.Char (ord)
|
|
||||||
import Data.Conduit.List (consume, sourceList)
|
|
||||||
import Test.Hspec (Spec, context, describe, it, shouldBe)
|
|
||||||
|
|
||||||
import Data.CSV.Slurp
|
|
||||||
|
|
||||||
spec :: Spec
|
spec :: Spec
|
||||||
spec = describe "Data.CSV.Slurp" $ do
|
spec = describe "Data.CSV.Slurp" $ return ()
|
||||||
decodeRowsSpec
|
|
||||||
decodeRawRowsSpec
|
|
||||||
decodeUTF8Spec
|
|
||||||
toBytesSpec
|
|
||||||
|
|
||||||
-- | Table-driven checks for 'decodeRows': each case compares the number of
-- decoded rows and then every row against the expectation.
decodeRowsSpec :: Spec
decodeRowsSpec = describe "decodeRows" $ mapM_ check cases
  where
    -- label, input, expected
    cases =
      [ ( "valid",   validIn,   validRes   )
      , ( "invalid", invalidIn, invalidRes )
      , ( "empty",   [],        []         )
      ]

    check (label, input, expected) = context label $ do
      result <- runConduit $ sourceList input .| decodeRows .| consume
      it ("should have " ++ show (length expected) ++ " rows") $
        length result `shouldBe` length expected
      mapM_ checkRow $ zip3 [(0::Int)..] expected result

    checkRow (n, expected', result') = context ("row " ++ show n) $
      it ("should be " ++ show expected') $
        result' `shouldBe` expected'

    validIn    = ["foo,bar\r\n", "baz,quux\r\n"]
    invalidIn  = ["\"a"]
    validRes   = [["foo", "bar"], ["baz", "quux"]]
    invalidRes = [[""]]
|
|
||||||
|
|
||||||
-- | Table-driven checks for 'decodeRawRows'.
--
-- Every case feeds a list of input chunks through the decoder, checks the
-- number of rows produced and then compares each row with the expectation.
decodeRawRowsSpec :: Spec
decodeRawRowsSpec = describe "decodeRawRows" $ mapM_
  ( \(label, input, expected) -> context label $ do
    result <- runConduit $ sourceList input .| decodeRawRows .| consume
    let
      expLen = length expected
      resLen = length result
    it ("should have " ++ show expLen ++ " rows") $
      resLen `shouldBe` expLen
    mapM_
      ( \(n, expected', result') -> context ("row " ++ show n) $
        -- the description states the expected row, not the actual one, so
        -- that it reads the same whether the check passes or fails
        it ("should be " ++ show expected') $
          result' `shouldBe` expected'
      ) $ zip3 [(0::Int)..] expected result
  )

  -- label,        input,       expected
  [ ( "unquoted",   unquotedIn,  normalRes    )
  , ( "quoted",     quotedIn,    normalRes    )
  , ( "mixed",      mixedIn,     normalRes    )
  , ( "CR only",    crOnlyIn,    normalRes    )
  , ( "LF only",    lfOnlyIn,    normalRes    )
  , ( "has quote",  quoteIn,     quoteRes     )
  , ( "has CR",     crIn,        crRes        )
  , ( "has LF",     lfIn,        lfRes        )
  , ( "has CRLF",   crlfIn,      crlfRes      )
  , ( "odd chunk",  oddChunkIn,  normalRes    )
  , ( "no newline", noNewlineIn, normalRes    )
  , ( "malformed",  malformedIn, malformedRes )
  , ( "blank end",  blankEndIn,  blankEndRes  )
  ]

  where

    unquotedIn =
      [ "foo,bar\r\n"
      , "baz,quux\r\n"
      ]

    quotedIn =
      [ "\"foo\",\"bar\"\r\n"
      , "\"baz\",\"quux\"\r\n"
      ]

    mixedIn =
      [ "\"foo\",bar\r\n"
      , "baz,\"quux\"\r\n"
      ]

    crOnlyIn =
      [ "foo,bar\r"
      , "baz,quux\r"
      ]

    lfOnlyIn =
      [ "foo,bar\n"
      , "baz,quux\n"
      ]

    quoteIn =
      [ "\"a\"\"b\",bar\r\n"
      , "baz,quux\r\n"
      ]

    crIn =
      [ "\"a\rb\",bar\r\n"
      , "baz,quux\r\n"
      ]

    lfIn =
      [ "\"a\nb\",bar\r\n"
      , "baz,quux\r\n"
      ]

    crlfIn =
      [ "\"a\r\nb\",bar\r\n"
      , "baz,quux\r\n"
      ]

    -- chunk boundaries that do not line up with row boundaries
    oddChunkIn =
      [ "foo,"
      , "bar\r\nbaz,"
      , "quux\r\n"
      ]

    -- final row without a line terminator
    noNewlineIn =
      [ "foo,bar\r\n"
      , "baz,quux"
      ]

    -- a quote in the middle of an unquoted field
    malformedIn =
      [ "a\"b,bar\r\n"
      , "baz,quux\r\n"
      ]

    -- a row ending in a comma, i.e. with an empty last field
    blankEndIn =
      [ "foo,bar,\r\n"
      , "baz,quux\r\n"
      ]

    normalRes =
      [ ["foo", "bar"]
      , ["baz", "quux"]
      ]

    quoteRes =
      [ ["a\"b", "bar"]
      , ["baz", "quux"]
      ]

    crRes =
      [ ["a\rb", "bar"]
      , ["baz", "quux"]
      ]

    lfRes =
      [ ["a\nb", "bar"]
      , ["baz", "quux"]
      ]

    crlfRes =
      [ ["a\r\nb", "bar"]
      , ["baz", "quux"]
      ]

    malformedRes =
      [ ["", "bar"]
      , ["baz", "quux"]
      ]

    blankEndRes =
      [ ["foo", "bar", ""]
      , ["baz", "quux"]
      ]
|
|
||||||
|
|
||||||
-- | Table-driven checks for 'decodeUTF8'.
decodeUTF8Spec :: Spec
decodeUTF8Spec = describe "decodeUTF8" $ mapM_ check
  -- label,          input,      expected
  [ ( "plain ASCII",  "hello",    Just "hello" )
  , ( "valid UTF8",   "\xc3\xa9", Just "é"     )
  , ( "invalid UTF8", "\xff",     Nothing      )
  , ( "blank",        "",         Just ""      )
  ]
  where
    check (label, input, expected) = context label $
      it ("should be " ++ show expected) $
        decodeUTF8 input `shouldBe` expected
|
|
||||||
|
|
||||||
-- | Checks that 'toBytes' flattens its input chunks into their bytes, in
-- order.
toBytesSpec :: Spec
toBytesSpec = describe "toBytes" $
  it ("should be " ++ show expected) $ do
    result <- runConduit $ sourceList input .| toBytes .| consume
    result `shouldBe` expected
  where
    input    = ["ab", "cd"]
    expected = map (fromIntegral . ord) "abcd"
|
|
||||||
|
|
||||||
--jl
|
--jl
|
||||||
|
|
|
@ -25,6 +25,6 @@ import Test.Hspec (hspec)
|
||||||
import qualified Data.CSV.SlurpSpec as Slurp
|
import qualified Data.CSV.SlurpSpec as Slurp
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = hspec Slurp.spec
|
main = hspec $ Slurp.spec
|
||||||
|
|
||||||
--jl
|
--jl
|
||||||
|
|
Loading…
Reference in New Issue
Block a user