{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- | HTML entity decoding.

module Text.HTML.TagStream.Entities
  (Dec(..)
  ,isNameChar
  ,isNameStart
  ,decodeEntities)
  where

import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types

import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first,second)

-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: (Monad m
                  ,Monoid builder
                  ,Monoid string
                  ,IsString string
                  ,Eq string)
               => Dec builder string
               -> Conduit (Token' string) m (Token' string)
decodeEntities :: forall (m :: * -> *) builder string.
(Monad m, Monoid builder, Monoid string, IsString string,
 Eq string) =>
Dec builder string -> Conduit (Token' string) m (Token' string)
decodeEntities Dec builder string
dec =
    ConduitT (Token' string) (Token' string) m ()
start
  where
    start :: ConduitT (Token' string) (Token' string) m ()
start = ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
forall (m :: * -> *) i o. Monad m => ConduitT i o m (Maybe i)
await ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
-> (Maybe (Token' string)
    -> ConduitT (Token' string) (Token' string) m ())
-> ConduitT (Token' string) (Token' string) m ()
forall a b.
ConduitT (Token' string) (Token' string) m a
-> (a -> ConduitT (Token' string) (Token' string) m b)
-> ConduitT (Token' string) (Token' string) m b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT (Token' string) (Token' string) m ()
-> (Token' string -> ConduitT (Token' string) (Token' string) m ())
-> Maybe (Token' string)
-> ConduitT (Token' string) (Token' string) m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT (Token' string) (Token' string) m ()
forall a. a -> ConduitT (Token' string) (Token' string) m a
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (\Token' string
token -> Token' string -> ConduitT (Token' string) (Token' string) m ()
forall {m :: * -> *}.
Monad m =>
Token' string -> ConduitT (Token' string) (Token' string) m ()
start' Token' string
token ConduitT (Token' string) (Token' string) m ()
-> ConduitT (Token' string) (Token' string) m ()
-> ConduitT (Token' string) (Token' string) m ()
forall a b.
ConduitT (Token' string) (Token' string) m a
-> ConduitT (Token' string) (Token' string) m b
-> ConduitT (Token' string) (Token' string) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' string) (Token' string) m ()
start)
    start' :: Token' string -> ConduitT (Token' string) (Token' string) m ()
start' (Text string
t) = (string -> ConduitT (Token' string) string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
t ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
forall a b.
ConduitT (Token' string) string m a
-> ConduitT (Token' string) string m b
-> ConduitT (Token' string) string m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' string) string m ()
forall (m :: * -> *) string.
Monad m =>
Conduit (Token' string) m string
yieldWhileText) ConduitT (Token' string) string m ()
-> ConduitT string (Token' string) m ()
-> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= Dec builder string -> Conduit string m string
forall (m :: * -> *) string builder.
(Monad m, Monoid string, IsString string, Monoid builder,
 Eq string) =>
Dec builder string -> Conduit string m string
decodeEntities' Dec builder string
dec Conduit string m string
-> ConduitT string (Token' string) m ()
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= (string -> Maybe (Token' string))
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> Maybe b) -> ConduitT a b m ()
CL.mapMaybe string -> Maybe (Token' string)
forall {s}. (Eq s, IsString s) => s -> Maybe (Token' s)
go
    start' (TagOpen string
name [Attr' string]
attrs Bool
bool) = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (string -> [Attr' string] -> Bool -> Token' string
forall s. s -> [Attr' s] -> Bool -> Token' s
TagOpen string
name ((Attr' string -> Attr' string) -> [Attr' string] -> [Attr' string]
forall a b. (a -> b) -> [a] -> [b]
map ((string -> string) -> Attr' string -> Attr' string
forall b c d. (b -> c) -> (d, b) -> (d, c)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (d, b) (d, c)
second (Dec builder string -> string -> string
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder string
dec)) [Attr' string]
attrs) Bool
bool)
    start' Token' string
token = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Token' string
token

    go :: s -> Maybe (Token' s)
go s
t
        | s
t s -> s -> Bool
forall a. Eq a => a -> a -> Bool
== s
""   = Maybe (Token' s)
forall a. Maybe a
Nothing
        | Bool
otherwise = Token' s -> Maybe (Token' s)
forall a. a -> Maybe a
Just (s -> Token' s
forall s. s -> Token' s
Text s
t)

-- | Decode entities in a complete string.
decodeString
  :: (Eq a, IsString a, Monoid builder, Monoid a)
  => Dec builder a -> a -> a
decodeString :: forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder a
dec a
input =
  case Dec builder a -> a -> (a, a)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder a
dec a
input of
    (a
value', a
remainder)
      | a
value' a -> a -> Bool
forall a. Eq a => a -> a -> Bool
/= a
forall a. Monoid a => a
mempty -> a
value' a -> a -> a
forall a. Semigroup a => a -> a -> a
<> Dec builder a -> a -> a
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder a
dec a
remainder
      | Bool
otherwise -> a
input

decodeEntities' :: (Monad m
                   ,Monoid string
                   ,IsString string
                   ,Monoid builder
                   ,Eq string)
                => Dec builder string
                -> Conduit string m string
decodeEntities' :: forall (m :: * -> *) string builder.
(Monad m, Monoid string, IsString string, Monoid builder,
 Eq string) =>
Dec builder string -> Conduit string m string
decodeEntities' Dec builder string
dec =
    (string -> string) -> ConduitT string string m ()
forall {m :: * -> *}.
Monad m =>
(string -> string) -> ConduitT string string m ()
loop string -> string
forall a. a -> a
id
  where
    loop :: (string -> string) -> ConduitT string string m ()
loop string -> string
accum = do
        Maybe string
mchunk <- ConduitT string string m (Maybe string)
forall (m :: * -> *) i o. Monad m => ConduitT i o m (Maybe i)
await
        let chunk :: string
chunk = string -> string
accum (string -> string) -> string -> string
forall a b. (a -> b) -> a -> b
$ string -> Maybe string -> string
forall a. a -> Maybe a -> a
fromMaybe string
forall a. Monoid a => a
mempty Maybe string
mchunk
            (string
newStr, string
remainder) = Dec builder string -> string -> (string, string)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder string
dec string
chunk
        string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
newStr
        if Maybe string -> Bool
forall a. Maybe a -> Bool
isJust Maybe string
mchunk
            then (string -> string) -> ConduitT string string m ()
loop (string -> string -> string
forall a. Monoid a => a -> a -> a
mappend string
remainder)
            else string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
remainder

-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText :: forall (m :: * -> *) string.
Monad m =>
Conduit (Token' string) m string
yieldWhileText =
    ConduitT (Token' string) string m ()
forall {o}. ConduitT (Token' o) o m ()
loop
  where
    loop :: ConduitT (Token' o) o m ()
loop = ConduitT (Token' o) o m (Maybe (Token' o))
forall (m :: * -> *) i o. Monad m => ConduitT i o m (Maybe i)
await ConduitT (Token' o) o m (Maybe (Token' o))
-> (Maybe (Token' o) -> ConduitT (Token' o) o m ())
-> ConduitT (Token' o) o m ()
forall a b.
ConduitT (Token' o) o m a
-> (a -> ConduitT (Token' o) o m b) -> ConduitT (Token' o) o m b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT (Token' o) o m ()
-> (Token' o -> ConduitT (Token' o) o m ())
-> Maybe (Token' o)
-> ConduitT (Token' o) o m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT (Token' o) o m ()
forall a. a -> ConduitT (Token' o) o m a
forall (m :: * -> *) a. Monad m => a -> m a
return ()) Token' o -> ConduitT (Token' o) o m ()
go
    go :: Token' o -> ConduitT (Token' o) o m ()
go (Text o
t) = o -> ConduitT (Token' o) o m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield o
t ConduitT (Token' o) o m ()
-> ConduitT (Token' o) o m () -> ConduitT (Token' o) o m ()
forall a b.
ConduitT (Token' o) o m a
-> ConduitT (Token' o) o m b -> ConduitT (Token' o) o m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' o) o m ()
loop
    go Token' o
token = Token' o -> ConduitT (Token' o) o m ()
forall i o (m :: * -> *). i -> ConduitT i o m ()
leftover Token' o
token

-- | A decoder.
data Dec builder string = Dec
  { forall builder string. Dec builder string -> builder -> string
decToS     :: builder -> string
  , forall builder string.
Dec builder string -> (Char -> Bool) -> string -> (string, string)
decBreak   :: (Char -> Bool) -> string -> (string,string)
  , forall builder string. Dec builder string -> string -> builder
decBuilder :: string -> builder
  , forall builder string.
Dec builder string -> Int -> string -> string
decDrop    :: Int -> string -> string
  , forall builder string. Dec builder string -> string -> Maybe string
decEntity  :: string -> Maybe string
  , forall builder string.
Dec builder string -> string -> Maybe (Char, string)
decUncons  :: string -> Maybe (Char,string)
  }

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
                  => Dec builder string -> string -> (string, string)
makeEntityDecoder :: forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{string -> builder
string -> Maybe string
string -> Maybe (Char, string)
builder -> string
Int -> string -> string
(Char -> Bool) -> string -> (string, string)
decToS :: forall builder string. Dec builder string -> builder -> string
decBreak :: forall builder string.
Dec builder string -> (Char -> Bool) -> string -> (string, string)
decBuilder :: forall builder string. Dec builder string -> string -> builder
decDrop :: forall builder string.
Dec builder string -> Int -> string -> string
decEntity :: forall builder string. Dec builder string -> string -> Maybe string
decUncons :: forall builder string.
Dec builder string -> string -> Maybe (Char, string)
decToS :: builder -> string
decBreak :: (Char -> Bool) -> string -> (string, string)
decBuilder :: string -> builder
decDrop :: Int -> string -> string
decEntity :: string -> Maybe string
decUncons :: string -> Maybe (Char, string)
..} = (builder -> string) -> (builder, string) -> (string, string)
forall b c d. (b -> c) -> (b, d) -> (c, d)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first builder -> string
decToS ((builder, string) -> (string, string))
-> (string -> (builder, string)) -> string -> (string, string)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. string -> (builder, string)
go
  where
    go :: string -> (builder, string)
go string
s =
      case (Char -> Bool) -> string -> (string, string)
decBreak (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'&') string
s of
        (string
_,string
"") -> (string -> builder
decBuilder string
s, string
"")
        (string
before,restPlusAmp :: string
restPlusAmp@(Int -> string -> string
decDrop Int
1 -> string
rest)) ->
          case (Char -> Bool) -> string -> (string, string)
decBreak (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Char
c -> Char -> Bool
isNameChar Char
c Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#')) string
rest of
            (string
_,string
"") -> (string -> builder
decBuilder string
before, string
restPlusAmp)
            (string
entity,string
after) -> (builder
before1 builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> builder
before2, string
after')
              where
                before1 :: builder
before1 = string -> builder
decBuilder string
before
                (builder
before2, string
after') =
                  case Maybe string
mdecoded of
                    Maybe string
Nothing -> (builder -> builder) -> (builder, string) -> (builder, string)
forall b c d. (b -> c) -> (b, d) -> (c, d)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first ((string -> builder
decBuilder string
"&" builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> string -> builder
decBuilder string
entity) builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
                    Just (string -> builder
decBuilder -> builder
decoded) ->
                      case string -> Maybe (Char, string)
decUncons string
after of
                        Just (Char
';',string
validAfter) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall b c d. (b -> c) -> (b, d) -> (c, d)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
validAfter)
                        Just (Char
_invalid,string
_rest) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall b c d. (b -> c) -> (b, d) -> (c, d)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
                        Maybe (Char, string)
Nothing -> (builder
forall a. Monoid a => a
mempty, string
s)
                mdecoded :: Maybe string
mdecoded =
                  if string
entity string -> string -> Bool
forall a. Eq a => a -> a -> Bool
== string
forall a. Monoid a => a
mempty
                     then Maybe string
forall a. Maybe a
Nothing
                     else string -> Maybe string
decEntity string
entity

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart :: Char -> Bool
isNameStart Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
':' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_' Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiLower Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xC0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xD6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xD8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xF6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xF8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x370' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x37D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x37F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x1FFF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x200C' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x200D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x2070' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x218F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x2C00' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2FEF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x3001' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xD7FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xF900' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFDCF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xFDF0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFFFD') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x10000' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xEFFFF')

-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar :: Char -> Bool
isNameChar Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'-' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'.' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\xB7' Bool -> Bool -> Bool
||
  Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isNameStart Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x0300' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x036F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x203F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2040')