csv-sip/src/Data/CSV/Sip.hs

337 lines
8.6 KiB
Haskell

{-|
Module : Data.CSV.Sip
Description : works with CSV files
Copyright : (C) Jonathan Lamothe
License : GPL-3.0-or-later
Maintainer : jonathan@jlamothe.net
Stability : experimental
Portability : POSIX
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
-}
{-# LANGUAGE LambdaCase, OverloadedStrings #-}
module Data.CSV.Sip (
-- * Read an entire CSV file
slurpCSV,
slurpRawCSV,
slurpLabelledCSV,
slurpRawLabelledCSV,
-- * Conduits
-- ** Encoding
encodeCSV,
encodeRawCSV,
encodeRows,
encodeRawRows,
-- ** Decoding
labelFields,
decodeRows,
decodeRawRows,
decodeUTF8,
toBytes,
) where
import Conduit
( ConduitT
, MonadResource
, await
, mapC
, runConduit
, sourceFile
, yield
, (.|)
)
import Control.Monad (unless)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
import qualified Data.ByteString as BS
import Data.Conduit.List (consume, sourceList)
import qualified Data.Map as M
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Data.Word (Word8)
-- | read an entire CSV file
slurpCSV
:: MonadResource m
=> FilePath
-- ^ the path to the file to read
-> m [[T.Text]]
slurpCSV file = runConduit $ sourceFile file .| decodeRows .| consume
-- | read an entire CSV file in raw mode
slurpRawCSV
:: MonadResource m
=> FilePath
-- ^ the path to the file to read
-> m [[BS.ByteString]]
slurpRawCSV file = runConduit $ sourceFile file .| decodeRawRows .| consume
-- | read an entire CSV file with a header
slurpLabelledCSV
:: MonadResource m
=> FilePath
-- ^ the path to the file to read
-> m [M.Map T.Text T.Text]
slurpLabelledCSV file = runConduit $
sourceFile file .| decodeRows .| labelFields .|consume
-- | read an entire CSV file with a header
slurpRawLabelledCSV
:: MonadResource m
=> FilePath
-- ^ the path to the file to read
-> m [M.Map BS.ByteString BS.ByteString]
slurpRawLabelledCSV file = runConduit $
sourceFile file .| decodeRawRows .| labelFields .|consume
-- | encode an entire CSV file
encodeCSV
:: Monad m
=> [[T.Text]]
-- ^ the data being encoded, organized into rows and fields
-> ConduitT () BS.ByteString m ()
encodeCSV csv = sourceList csv .| encodeRows
-- | encode an entire CSV file
encodeRawCSV
:: Monad m
=> [[BS.ByteString]]
-- ^ the data being encoded, organized into rows and fields
-> ConduitT () BS.ByteString m ()
encodeRawCSV csv = sourceList csv .| encodeRawRows
-- | encode a CSV stream row by row, each element in the list read
-- represents a field, with the entire list representing a row
encodeRows :: Monad m => ConduitT [T.Text] BS.ByteString m ()
encodeRows = mapC (map encodeUtf8) .| encodeRawRows
-- | encode raw CSV stream row by row, each element in the list read
-- represents a field, with the entire list representing a row
encodeRawRows :: Monad m => ConduitT [BS.ByteString] BS.ByteString m ()
encodeRawRows = await >>= \case
Just fs-> do
encodeFields fs
encodeRawRows
Nothing -> return ()
-- | read a CSV stream, using the first row as a header containing
-- field labels
labelFields :: (Monad m, Ord a) => ConduitT [a] (M.Map a a) m ()
labelFields = await >>= \case
Just headers -> labelLoop headers
Nothing -> return ()
-- | decode the rows from a stream of ByteStrings
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8)
-- | decode the rows returning raw ByteStrings instead of text
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState
-- | decode a raw ByteString into Text (if possible)
decodeUTF8 :: BS.ByteString -> Maybe T.Text
decodeUTF8 bs = case decodeUtf8' bs of
Left _ -> Nothing
Right txt -> Just txt
-- | convert a stream to ByteStrings to a string of bytes
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
toBytes = await >>= \case
Just bs -> do
let bytes = BS.unpack bs
mapM_ yield bytes
toBytes
Nothing -> return ()
-- Internal
data DecodeState = DecodeState
{ isQuoted :: Bool
, fields :: [BS.ByteString]
, collected :: BS.ByteString
} deriving (Eq, Show)
type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
type Modifier = DecodeState -> DecodeState
newDecodeState :: DecodeState
newDecodeState = DecodeState
{ isQuoted = False
, fields = []
, collected = ""
}
-- Conduits
encodeFields
:: Monad m
=> [BS.ByteString]
-> ConduitT [BS.ByteString] BS.ByteString m ()
encodeFields [] = yield "\r\n"
encodeFields [f] = yield $ escapeField f `BS.append` "\r\n"
encodeFields (f:fs) = do
yield $ escapeField f `BS.append` ","
encodeFields fs
labelLoop :: (Monad m, Ord a) => [a] -> ConduitT [a] (M.Map a a) m ()
labelLoop headers = await >>= \case
Just values -> do
yield $ M.fromList $ zip headers values
labelLoop headers
Nothing -> return ()
-- Decoders
decodeLoop :: Monad m => Decoder m
decodeLoop = lift await >>= \case
Just byte -> case byte of
0x22 -> processQuote
0x2c -> processComma
0x0d -> processCR
0x0a -> processLF
_ -> performAction $ addByte byte
Nothing -> cleanup
processQuote :: Monad m => Decoder m
processQuote = gets isQuoted >>= \case
True -> processQuotedQuote
False -> processUnquotedQuote
processComma :: Monad m => Decoder m
processComma = gets isQuoted >>= performAction . \case
True -> addByte 0x2c
False -> commitField
processCR :: Monad m => Decoder m
processCR = gets isQuoted >>= \case
True -> performAction $ addByte 0xd
False -> endRow
processLF :: Monad m => Decoder m
processLF = gets isQuoted >>= \case
True -> performAction $ addByte 0xa
False -> endRow
processQuotedQuote :: Monad m => Decoder m
processQuotedQuote = lift await >>= \case
Just byte -> case byte of
0x22 -> performAction $ addByte 0x22 -- quote
0x2c -> performAction commitField -- comma
0x0d -> commitRow -- carriage return
0x0a -> commitRow -- line feed
_ -> corruptedField
Nothing -> cleanup
processUnquotedQuote :: Monad m => Decoder m
processUnquotedQuote = gets (BS.null . collected) >>= \case
True -> performAction setQuoted
False -> corruptedField
endRow :: Monad m => Decoder m
endRow = do
s <- get
if null (fields s) && BS.null (collected s)
then decodeLoop
else commitRow
commitRow :: Monad m => Decoder m
commitRow = do
modify commitField
gets fields >>= lift . yield
performAction dropFields
corruptedField :: Monad m => Decoder m
corruptedField = do
modify dropField
ignoreField
ignoreField :: Monad m => Decoder m
ignoreField = lift await >>= \case
Just byte -> case byte of
0x2c -> performAction commitField -- comma
0x0d -> commitRow
_ -> ignoreField
Nothing -> cleanup
cleanup :: Monad m => Decoder m
cleanup = do
gets isQuoted >>= \case
True -> modify $ commitField . dropField
False -> gets (BS.null . collected) >>= \case
True -> return ()
False -> modify commitField
fs <- gets fields
unless (null fs) $
lift $ yield fs
performAction :: Monad m => Modifier -> Decoder m
performAction f = do
modify f
decodeLoop
-- Modifiers
addByte :: Word8 -> Modifier
addByte b s = let
collected' = BS.snoc (collected s) b
in s { collected = collected' }
commitField :: Modifier
commitField s = let
isQuoted' = False
fields' = fields s ++ [collected s]
collected' = ""
in s
{ isQuoted = isQuoted'
, fields = fields'
, collected = collected'
}
dropFields :: Modifier
dropFields s = s { fields = [] }
dropField :: Modifier
dropField s = s
{ isQuoted = False
, collected = ""
}
setQuoted :: Modifier
setQuoted s = s { isQuoted = True }
-- Helpers
escapeField :: BS.ByteString -> BS.ByteString
escapeField field = let
bytes = BS.unpack field
in BS.concat
[ "\""
, BS.pack $ escapeLoop bytes
, "\""
]
escapeLoop :: [Word8] -> [Word8]
escapeLoop [] = []
escapeLoop (0x22:bs) = [0x22, 0x22] ++ escapeLoop bs -- escape quote
escapeLoop (b:bs) = b : escapeLoop bs
--jl