csv-sip/src/Data/CSV/Sip.hs

{-|

Module      : Data.CSV.Sip
Description : works with CSV files
Copyright   : (C) Jonathan Lamothe
License     : GPL-3.0-or-later
Maintainer  : jonathan@jlamothe.net
Stability   : experimental
Portability : POSIX

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

-}

{-# LANGUAGE LambdaCase, OverloadedStrings #-}

module Data.CSV.Sip (
  -- * Read an entire CSV file
  slurpCSV,
  slurpRawCSV,
  slurpLabelledCSV,
  slurpRawLabelledCSV,
  -- * Conduits
  -- ** Encoding
  encodeCSV,
  encodeRawCSV,
  encodeRows,
  encodeRawRows,
  -- ** Decoding
  labelFields,
  decodeRows,
  decodeRawRows,
  decodeUTF8,
  toBytes,
) where

import Conduit
  ( ConduitT
  , MonadResource
  , await
  , mapC
  , runConduit
  , sourceFile
  , yield
  , (.|)
  )
import Control.Monad (unless)
import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify)
import qualified Data.ByteString as BS
import Data.Conduit.List (consume, sourceList)
import qualified Data.Map as M
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Data.Word (Word8)

-- | read an entire CSV file
slurpCSV
  :: MonadResource m
  => FilePath
  -- ^ the path to the file to read
  -> m [[T.Text]]
slurpCSV file = runConduit $ sourceFile file .| decodeRows .| consume

-- | read an entire CSV file in raw mode
slurpRawCSV
  :: MonadResource m
  => FilePath
  -- ^ the path to the file to read
  -> m [[BS.ByteString]]
slurpRawCSV file = runConduit $ sourceFile file .| decodeRawRows .| consume

-- | read an entire CSV file with a header
slurpLabelledCSV
  :: MonadResource m
  => FilePath
  -- ^ the path to the file to read
  -> m [M.Map T.Text T.Text]
slurpLabelledCSV file = runConduit $
  sourceFile file .| decodeRows .| labelFields .|consume

-- | read an entire CSV file with a header
slurpRawLabelledCSV
  :: MonadResource m
  => FilePath
  -- ^ the path to the file to read
  -> m [M.Map BS.ByteString BS.ByteString]
slurpRawLabelledCSV file = runConduit $
  sourceFile file .| decodeRawRows .| labelFields .|consume

-- | encode an entire CSV file
encodeCSV
  :: Monad m
  => [[T.Text]]
  -- ^ the data being encoded, organized into rows and fields
  -> ConduitT () BS.ByteString m ()
encodeCSV csv = sourceList csv .| encodeRows

-- | encode an entire CSV file
encodeRawCSV
  :: Monad m
  => [[BS.ByteString]]
  -- ^ the data being encoded, organized into rows and fields
  -> ConduitT () BS.ByteString m ()
encodeRawCSV csv = sourceList csv .| encodeRawRows

-- | encode a CSV stream row by row, each element in the list read
-- represents a field, with the entire list representing a row
encodeRows :: Monad m => ConduitT [T.Text] BS.ByteString m ()
encodeRows = mapC (map encodeUtf8) .| encodeRawRows

-- | encode raw CSV stream row by row, each element in the list read
-- represents a field, with the entire list representing a row
encodeRawRows :: Monad m => ConduitT [BS.ByteString] BS.ByteString m ()
encodeRawRows = await >>= \case

  Just fs-> do
    encodeFields fs
    encodeRawRows

  Nothing -> return ()

-- | read a CSV stream, using the first row as a header containing
-- field labels
labelFields :: (Monad m, Ord a) => ConduitT [a] (M.Map a a) m ()
labelFields = await >>= \case
  Just headers -> labelLoop headers
  Nothing      -> return ()

-- | decode the rows from a stream of ByteStrings
decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m ()
decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8)

-- | decode the rows returning raw ByteStrings instead of text
decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m ()
decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState

-- | decode a raw ByteString into Text (if possible)
decodeUTF8 :: BS.ByteString -> Maybe T.Text
decodeUTF8 bs = case decodeUtf8' bs of
  Left _    -> Nothing
  Right txt -> Just txt

-- | convert a stream to ByteStrings to a string of bytes
toBytes :: Monad m => ConduitT BS.ByteString Word8 m ()
toBytes = await >>= \case
  Just bs -> do
    let bytes = BS.unpack bs
    mapM_ yield bytes
    toBytes
  Nothing -> return ()

-- Internal

data DecodeState = DecodeState
  { isQuoted  :: Bool
  , fields    :: [BS.ByteString]
  , collected :: BS.ByteString
  } deriving (Eq, Show)

type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) ()
type Modifier = DecodeState -> DecodeState

newDecodeState :: DecodeState
newDecodeState = DecodeState
  { isQuoted  = False
  , fields    = []
  , collected = ""
  }

-- Conduits

encodeFields
  :: Monad m
  => [BS.ByteString]
  -> ConduitT [BS.ByteString] BS.ByteString m ()
encodeFields []     = yield "\r\n"
encodeFields [f]    = yield $ escapeField f `BS.append` "\r\n"
encodeFields (f:fs) = do
  yield $ escapeField f `BS.append` ","
  encodeFields fs

labelLoop :: (Monad m, Ord a) => [a] -> ConduitT [a] (M.Map a a) m ()
labelLoop headers = await >>= \case
  Just values -> do
    yield $ M.fromList $ zip headers values
    labelLoop headers
  Nothing     -> return ()

-- Decoders

decodeLoop :: Monad m => Decoder m
decodeLoop = lift await >>= \case
  Just byte -> case byte of
    0x22 -> processQuote
    0x2c -> processComma
    0x0d -> processCR
    0x0a -> processLF
    _    -> performAction $ addByte byte
  Nothing -> cleanup

processQuote :: Monad m => Decoder m
processQuote = gets isQuoted >>= \case
  True  -> processQuotedQuote
  False -> processUnquotedQuote

processComma :: Monad m => Decoder m
processComma = gets isQuoted >>= performAction . \case
  True  -> addByte 0x2c
  False -> commitField

processCR :: Monad m => Decoder m
processCR = gets isQuoted >>= \case
  True  -> performAction $ addByte 0xd
  False -> endRow

processLF :: Monad m => Decoder m
processLF = gets isQuoted >>= \case
  True  -> performAction $ addByte 0xa
  False -> endRow

processQuotedQuote :: Monad m => Decoder m
processQuotedQuote = lift await >>= \case
  Just byte -> case byte of
    0x22 -> performAction $ addByte 0x22 -- quote
    0x2c -> performAction commitField    -- comma
    0x0d -> commitRow                    -- carriage return
    0x0a -> commitRow                    -- line feed
    _    -> corruptedField
  Nothing -> cleanup

processUnquotedQuote :: Monad m => Decoder m
processUnquotedQuote = gets (BS.null . collected) >>= \case
  True  -> performAction setQuoted
  False -> corruptedField

endRow :: Monad m => Decoder m
endRow = do
  s <- get
  if null (fields s) && BS.null (collected s)
    then decodeLoop
    else commitRow

commitRow :: Monad m => Decoder m
commitRow = do
  modify commitField
  gets fields >>= lift . yield
  performAction dropFields

corruptedField :: Monad m => Decoder m
corruptedField = do
  modify dropField
  ignoreField

ignoreField :: Monad m => Decoder m
ignoreField = lift await >>= \case
  Just byte -> case byte of
    0x2c -> performAction commitField -- comma
    0x0d -> commitRow
    _    -> ignoreField
  Nothing -> cleanup

cleanup :: Monad m => Decoder m
cleanup = do
  gets isQuoted >>= \case
    True  -> modify $ commitField . dropField
    False -> gets (BS.null . collected) >>= \case
      True  -> return ()
      False -> modify commitField
  fs <- gets fields
  unless (null fs) $
    lift $ yield fs

performAction :: Monad m => Modifier -> Decoder m
performAction f = do
  modify f
  decodeLoop

-- Modifiers

addByte :: Word8 -> Modifier
addByte b s = let
  collected' = BS.snoc (collected s) b
  in s { collected = collected' }

commitField :: Modifier
commitField s = let
  isQuoted'  = False
  fields'    = fields s ++ [collected s]
  collected' = ""
  in s
  { isQuoted  = isQuoted'
  , fields    = fields'
  , collected = collected'
  }

dropFields :: Modifier
dropFields s = s { fields = [] }

dropField :: Modifier
dropField s = s
  { isQuoted  = False
  , collected = ""
  }

setQuoted :: Modifier
setQuoted s = s { isQuoted = True }

-- Helpers
escapeField :: BS.ByteString -> BS.ByteString
escapeField field = let
  bytes = BS.unpack field
  in BS.concat
    [ "\""
    , BS.pack $ escapeLoop bytes
    , "\""
    ]

escapeLoop :: [Word8] -> [Word8]
escapeLoop []        = []
escapeLoop (0x22:bs) = [0x22, 0x22] ++ escapeLoop bs -- escape quote
escapeLoop (b:bs)    = b : escapeLoop bs

--jl