264 lines
6.6 KiB
Haskell
264 lines
6.6 KiB
Haskell
{-|
|
|
|
|
Module : Data.CSV.Sip
|
|
Description : works with CSV files
|
|
Copyright : (C) Jonathan Lamothe
|
|
License : GPL-3.0-or-later
|
|
Maintainer : jonathan@jlamothe.net
|
|
Stability : experimental
|
|
Portability : POSIX
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or (at
|
|
your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
-}
|
|
|
|
{-# LANGUAGE LambdaCase, OverloadedStrings #-}
|
|
|
|
module Data.CSV.Sip (
|
|
-- * Read an entire CSV file
|
|
slurpCSV,
|
|
slurpRawCSV,
|
|
slurpLabelledCSV,
|
|
-- * Conduits
|
|
labelFields,
|
|
decodeRows,
|
|
decodeRawRows,
|
|
decodeUTF8,
|
|
toBytes,
|
|
) where
|
|
|
|
import Conduit
|
|
( ConduitT
|
|
, MonadResource
|
|
, await
|
|
, mapC
|
|
, runConduit
|
|
, sourceFile
|
|
, yield
|
|
, (.|)
|
|
)
|
|
import Control.Monad (unless)
|
|
import Control.Monad.Trans.Class (lift)
|
|
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
|
|
import qualified Data.ByteString as BS
|
|
import Data.Conduit.List (consume)
|
|
import qualified Data.Map as M
|
|
import Data.Maybe (fromMaybe)
|
|
import qualified Data.Text as T
|
|
import Data.Text.Encoding (decodeUtf8')
|
|
import Data.Word (Word8)
|
|
|
|
-- | read an entire CSV file
|
|
slurpCSV
|
|
:: MonadResource m
|
|
=> FilePath
|
|
-- ^ the path to the file to read
|
|
-> m [[T.Text]]
|
|
slurpCSV file = runConduit $ sourceFile file .| decodeRows .| consume
|
|
|
|
-- | read an entire CSV file in raw mode
|
|
slurpRawCSV
|
|
:: MonadResource m
|
|
=> FilePath
|
|
-- ^ the path to the file to read
|
|
-> m [[BS.ByteString]]
|
|
slurpRawCSV file = runConduit $ sourceFile file .| decodeRawRows .| consume
|
|
|
|
-- | read an entire CSV file with a header
|
|
slurpLabelledCSV
|
|
:: MonadResource m
|
|
=> FilePath
|
|
-- ^ the path to the file to read
|
|
-> m [M.Map T.Text T.Text]
|
|
slurpLabelledCSV file = runConduit $
|
|
sourceFile file .| decodeRows .| labelFields .|consume
|
|
|
|
-- | read a CSV stream, using the first row as a header containing
|
|
-- field labels
|
|
labelFields :: (Monad m, Ord a) => ConduitT [a] (M.Map a a) m ()
|
|
labelFields = await >>= \case
|
|
Just headers -> labelLoop headers
|
|
Nothing -> return ()
|
|
|
|
-- | decode the rows from a stream of ByteStrings
|
|
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
|
|
decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8)
|
|
|
|
-- | decode the rows returning raw ByteStrings instead of text
|
|
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
|
|
decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState
|
|
|
|
-- | decode a raw ByteString into Text (if possible)
|
|
decodeUTF8 :: BS.ByteString -> Maybe T.Text
|
|
decodeUTF8 bs = case decodeUtf8' bs of
|
|
Left _ -> Nothing
|
|
Right txt -> Just txt
|
|
|
|
-- | convert a stream to ByteStrings to a string of bytes
|
|
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
|
|
toBytes = await >>= \case
|
|
Just bs -> do
|
|
let bytes = BS.unpack bs
|
|
mapM_ yield bytes
|
|
toBytes
|
|
Nothing -> return ()
|
|
|
|
-- Internal
|
|
|
|
data DecodeState = DecodeState
|
|
{ isQuoted :: Bool
|
|
, fields :: [BS.ByteString]
|
|
, collected :: BS.ByteString
|
|
} deriving (Eq, Show)
|
|
|
|
type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
|
|
type Modifier = DecodeState -> DecodeState
|
|
|
|
newDecodeState :: DecodeState
|
|
newDecodeState = DecodeState
|
|
{ isQuoted = False
|
|
, fields = []
|
|
, collected = ""
|
|
}
|
|
|
|
-- Conduits
|
|
|
|
labelLoop :: (Monad m, Ord a) => [a] -> ConduitT [a] (M.Map a a) m ()
|
|
labelLoop headers = await >>= \case
|
|
Just values -> do
|
|
yield $ M.fromList $ zip headers values
|
|
labelLoop headers
|
|
Nothing -> return ()
|
|
|
|
-- Decoders
|
|
|
|
decodeLoop :: Monad m => Decoder m
|
|
decodeLoop = lift await >>= \case
|
|
Just byte -> case byte of
|
|
0x22 -> processQuote
|
|
0x2c -> processComma
|
|
0x0d -> processCR
|
|
0x0a -> processLF
|
|
_ -> performAction $ addByte byte
|
|
Nothing -> cleanup
|
|
|
|
processQuote :: Monad m => Decoder m
|
|
processQuote = gets isQuoted >>= \case
|
|
True -> processQuotedQuote
|
|
False -> processUnquotedQuote
|
|
|
|
processComma :: Monad m => Decoder m
|
|
processComma = gets isQuoted >>= performAction . \case
|
|
True -> addByte 0x2c
|
|
False -> commitField
|
|
|
|
processCR :: Monad m => Decoder m
|
|
processCR = gets isQuoted >>= \case
|
|
True -> performAction $ addByte 0xd
|
|
False -> endRow
|
|
|
|
processLF :: Monad m => Decoder m
|
|
processLF = gets isQuoted >>= \case
|
|
True -> performAction $ addByte 0xa
|
|
False -> endRow
|
|
|
|
processQuotedQuote :: Monad m => Decoder m
|
|
processQuotedQuote = lift await >>= \case
|
|
Just byte -> case byte of
|
|
0x22 -> performAction $ addByte 0x22 -- quote
|
|
0x2c -> performAction commitField -- comma
|
|
0x0d -> commitRow -- carriage return
|
|
0x0a -> commitRow -- line feed
|
|
_ -> corruptedField
|
|
Nothing -> cleanup
|
|
|
|
processUnquotedQuote :: Monad m => Decoder m
|
|
processUnquotedQuote = gets (BS.null . collected) >>= \case
|
|
True -> performAction setQuoted
|
|
False -> corruptedField
|
|
|
|
endRow :: Monad m => Decoder m
|
|
endRow = do
|
|
s <- get
|
|
if null (fields s) && BS.null (collected s)
|
|
then decodeLoop
|
|
else commitRow
|
|
|
|
commitRow :: Monad m => Decoder m
|
|
commitRow = do
|
|
modify commitField
|
|
gets fields >>= lift . yield
|
|
performAction dropFields
|
|
|
|
corruptedField :: Monad m => Decoder m
|
|
corruptedField = do
|
|
modify dropField
|
|
ignoreField
|
|
|
|
ignoreField :: Monad m => Decoder m
|
|
ignoreField = lift await >>= \case
|
|
Just byte -> case byte of
|
|
0x2c -> performAction commitField -- comma
|
|
0x0d -> commitRow
|
|
_ -> ignoreField
|
|
Nothing -> cleanup
|
|
|
|
cleanup :: Monad m => Decoder m
|
|
cleanup = do
|
|
gets isQuoted >>= \case
|
|
True -> modify $ commitField . dropField
|
|
False -> gets (BS.null . collected) >>= \case
|
|
True -> return ()
|
|
False -> modify commitField
|
|
fs <- gets fields
|
|
unless (null fs) $
|
|
lift $ yield fs
|
|
|
|
performAction :: Monad m => Modifier -> Decoder m
|
|
performAction f = do
|
|
modify f
|
|
decodeLoop
|
|
|
|
-- Modifiers
|
|
|
|
addByte :: Word8 -> Modifier
|
|
addByte b s = let
|
|
collected' = BS.snoc (collected s) b
|
|
in s { collected = collected' }
|
|
|
|
commitField :: Modifier
|
|
commitField s = let
|
|
isQuoted' = False
|
|
fields' = fields s ++ [collected s]
|
|
collected' = ""
|
|
in s
|
|
{ isQuoted = isQuoted'
|
|
, fields = fields'
|
|
, collected = collected'
|
|
}
|
|
|
|
dropFields :: Modifier
|
|
dropFields s = s { fields = [] }
|
|
|
|
dropField :: Modifier
|
|
dropField s = s
|
|
{ isQuoted = False
|
|
, collected = ""
|
|
}
|
|
|
|
setQuoted :: Modifier
|
|
setQuoted s = s { isQuoted = True }
|
|
|
|
--jl
|