{-# OPTIONS_GHC -cpp -fglasgow-exts #-} -- -- Module : Data.ByteString.UTF8 -- Copyright : (c) Martin Norbäck 2006 -- License : BSD-style -- -- Maintainer : martin@norpan.org -- Stability : experimental -- Portability : same as Data.ByteString -- -- -- | Manipulate ByteStrings containing UTF-8 encoded characters. -- This is especially useful when doing mmapFile on a UTF-8 encoded file -- because it will allow using the full Unicode range on Chars. -- -- Behaviour when contents is not UTF-8 is undefined. -- -- This module is intended to be imported @qualified@, to avoid name -- clashes with Prelude functions. eg. -- -- > import qualified Data.ByteString.Char8 as B -- module UTF8 ( -- * The @ByteString@ type ByteString(..), -- instances: Eq, Ord, Show, Read, Data, Typeable -- * Introducing and eliminating 'ByteString's empty, -- :: ByteString packChar, -- :: Char -> ByteString pack, -- :: String -> ByteString unpack, -- :: ByteString -> String -- * Basic interface cons, -- :: Char -> ByteString -> ByteString snoc, -- :: ByteString -> Char -> ByteString null, -- :: ByteString -> Bool length, -- :: ByteString -> Int head, -- :: ByteString -> Char tail, -- :: ByteString -> ByteString last, -- :: ByteString -> Char init, -- :: ByteString -> ByteString append, -- :: ByteString -> ByteString -> ByteString -- * Reducing 'ByteString's foldl, -- :: (a -> Char -> a) -> a -> ByteString -> a foldr, -- :: (Char -> a -> a) -> a -> ByteString -> a foldl1, -- :: (Char -> Char -> Char) -> ByteString -> Char foldr1, -- :: (Char -> Char -> Char) -> ByteString -> Char -- ** Special folds concat, -- :: [ByteString] -> ByteString concatMap, -- :: (Char -> ByteString) -> ByteString -> ByteString -- ** Joining strings join, -- :: ByteString -> [ByteString] -> ByteString -- ** Searching for substrings isPrefixOf, -- :: ByteString -> ByteString -> Bool isSuffixOf, -- :: ByteString -> ByteString -> Bool isSubstringOf, -- :: ByteString -> ByteString -> Bool findSubstring, -- :: ByteString -> ByteString -> Maybe Int findSubstrings, -- :: ByteString -> ByteString -> [Int] -- ** Using ByteStrings as CStrings useAsCString, -- :: ByteString -> (CString -> IO a) -> IO a useAsCStringLen, -- :: ByteString -> (CString -> IO a) -> IO a unsafeUseAsCString, -- :: ByteString -> (CString -> IO a) -> IO a unsafeUseAsCStringLen, -- :: ByteString -> (CStringLen -> IO a) -> IO a -- ** Copying ByteStrings -- | These functions perform memcpy(3) operations copy, -- :: ByteString -> ByteString copyCString, -- :: CString -> ByteString copyCStringLen, -- :: CStringLen -> ByteString -- * I\/O with @ByteString@s -- ** Standard input and output #if defined(__GLASGOW_HASKELL__) getLine, -- :: IO ByteString #endif getContents, -- :: IO ByteString putStr, -- :: ByteString -> IO () putStrLn, -- :: ByteString -> IO () -- ** Files readFile, -- :: FilePath -> IO ByteString -- mmapFile, -- :: FilePath -> IO ByteString writeFile, -- :: FilePath -> ByteString -> IO () -- ** I\/O with Handles #if defined(__GLASGOW_HASKELL__) -- getArgs, -- :: IO [ByteString] hGetLine, -- :: Handle -> IO ByteString hGetNonBlocking, -- :: Handle -> Int -> IO ByteString #endif hGetContents, -- :: Handle -> IO ByteString hGet, -- :: Handle -> Int -> IO ByteString hPut, -- :: Handle -> ByteString -> IO () #if defined(__GLASGOW_HASKELL__) -- * Low level construction -- | For constructors from foreign language types see /Data.ByteString/ packAddress, -- :: Addr# -> ByteString unsafePackAddress, -- :: Int -> Addr# -> ByteString #endif -- simple list-using functions take, drop, unlines, group, reverse, inits, tails, sort, splitAt, index, map, filter, filterChar, filterNotChar, takeWhile, dropWhile, span, spanEnd, break, lines, lines', unlines', split, words, unwords, words', unwords', groupBy, intersperse, any, all, maximum, minimum, replicate, elem, notElem, find, elemIndex, elemIndexLast, findIndex, elemIndices, findIndices, lineIndices, breakChar, breakSpace, spanChar, breakFirst, splitWith, tokens, dropSpace, dropSpaceEnd, joinWithChar, zip, zipWith, count, unzip, transpose, hash ) where import Data.Int import qualified Prelude as P import Prelude hiding (reverse,head,tail,last,init,null ,length,map,lines,foldl,foldr,unlines ,concat,any,take,drop,splitAt,takeWhile ,dropWhile,span,break,elem,filter,unwords ,words,maximum,minimum,all,concatMap ,foldl1,foldr1,readFile,writeFile,replicate ,getContents,getLine,putStr,putStrLn ,zip,zipWith,unzip,notElem) import qualified Data.ByteString as B -- These functions are unchanged from Data.ByteString -- which means they work for (valid) UTF-8 byte strings too import Data.ByteString (empty,null,append ,concat,join -- for valid UTF-8 these functions work on the byte -- level ,isPrefixOf,isSuffixOf,isSubstringOf ,getContents, putStr, putStrLn ,readFile, {-mmapFile,-} writeFile ,hGetContents, hGet, hPut ,copy, copyCString, copyCStringLen ,singleton ) import Data.ByteString.Base ( ByteString(..), unsafeUseAsCString, unsafeUseAsCStringLen, unsafeCreate, memcpy ) import Data.ByteString.Char8 (getLine, hGetLine, hGetNonBlocking ,packAddress, unsafePackAddress ,useAsCStringLen, useAsCString ) import Foreign.ForeignPtr import Foreign.Ptr import Foreign.Storable import Data.Bits import Data.Word (Word8) import Control.Monad (when) import Control.Exception (assert) import Data.Maybe (listToMaybe) import GHC.Base (build, unsafeChr) import GHC.Prim (realWorld#) import GHC.IOBase import qualified Data.List as List import qualified Data.Char as Char #define STRICT1(f) f a | a `seq` False = undefined #define STRICT2(f) f a b | a `seq` b `seq` False = undefined #define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined #define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined #define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined newline, space :: Word8 newline = 0x0A space = 0x20 packChar :: Char -> ByteString packChar c = pack [c] pack :: String -> ByteString pack s = unsafeCreate (numBytesString s) $ \p -> go p s where go _ [] = return () go p (x:xs) = do l <- putUTF8 p x go (p `plusPtr` l) xs {-# INLINE unpack #-} unpack :: ByteString -> String unpack ps = build (unpackFoldr ps) {-# INLINE [0] unpackFoldr #-} unpackFoldr :: ByteString -> (Char -> a -> a) -> a -> a unpackFoldr (PS x s l) f c = withPtr x $ \p -> go (p `plusPtr` s) l where STRICT2(go) go q n | n <= 0 = touchForeignPtr x >> return c | otherwise = do (e, w) <- getUTF8 q n es <- unsafeInterleaveIO $ go (q `plusPtr` w) (n-w) return (e `f` es) {-# RULES "unpack-list" [1] forall p . unpackFoldr p (:) [] = unpackList p #-} unpackList :: ByteString -> String unpackList (PS x s l) = withPtr x $ \p -> go (p `plusPtr` s) l where STRICT2(go) go q n | n <= 0 = touchForeignPtr x >> return [] | otherwise = do (e, w) <- getUTF8 q n es <- unsafeInterleaveIO $ go (q `plusPtr` w) (n-w) return (e : es) {- -- ----------------------------------------------------------------------------- -- unpacking {-# INLINE unpack #-} unpack :: ByteString -> [Char] unpack ps = build (unpackFoldr ps) {-# INLINE [0] unpackFoldr #-} unpackFoldr (PS x s l) f c = withPtr x $ \p -> unpackFoldrCStringUtf8 (p `plusPtr` s) l f c unpackFoldrCStringUtf8 :: Ptr Word8 -> Int -> (Char -> a -> a) -> a -> IO a STRICT4(unpackFoldrCStringUtf8) unpackFoldrCStringUtf8 addr len f c = unpack 0 c where unpack nh acc | nh >= len = return acc unpack nh acc = do (c,w) <- getUTF8 (addr `plusPtr` nh) (len - nh) unpack (nh+w) (c `f` acc) unpackList (PS x s l) = withPtr x $ \p -> unpackCStringUtf8 (p `plusPtr` s) l unpackCStringUtf8 :: Ptr Word8 -> Int -> IO [Char] STRICT2(unpackCStringUtf8) unpackCStringUtf8 addr len = unpack 0 [] where unpack nh acc | nh >= len = return acc unpack nh acc = do (c,w) <- getUTF8 (addr `plusPtr` nh) (len - nh) unpack (nh+w) (c:acc) {-# RULES "unpack-list" [1] forall l . unpackFoldr l (:) [] = unpackList l #-} -} cons :: Char -> ByteString -> ByteString cons c (PS x s l) = let w = numBytes c in unsafeCreate (l + w) $ \p -> withForeignPtr x $ \f -> do memcpy (p `plusPtr` w) (f `plusPtr` s) (fromIntegral l) n <- putUTF8 p c assert (w == n) $ return () snoc :: ByteString -> Char -> ByteString snoc (PS x s l) c = let w = numBytes c in unsafeCreate (l + w) $ \p -> withForeignPtr x $ \f -> do memcpy p (f `plusPtr` s) (fromIntegral l) n <- putUTF8 (p `plusPtr` l) c assert (w == n) $ return () -- length taks O(n) now because we need to traverse -- find length by using full string length and subtract all "follow" bytes length :: ByteString -> Int length bs = B.length bs - P.length (B.findIndices isFollow bs) head :: ByteString -> Char head = fst . headTail tail :: ByteString -> ByteString tail = snd . headTail headTail :: ByteString -> (Char, ByteString) headTail (PS x s l) | l <= 0 = errorEmptyList "headTail" | otherwise = withPtr x $ \p -> do (c,l') <- getUTF8 (p `plusPtr` s) l return (c, PS x (s+l') (l-l')) last :: ByteString -> Char last (PS x s l) | l <= 0 = errorEmptyList "last" | otherwise = withPtr x $ \p -> do n <- backFollow (p `plusPtr` (s+l-1)) 0 (c,_) <- getUTF8 (p `plusPtr` (s+l-n-1)) (n+1) return c init :: ByteString -> ByteString init (PS x s l) | l <= 0 = errorEmptyList "init" | otherwise = withPtr x $ \p -> do n <- backFollow (p `plusPtr` (s+l-1)) 0 return (PS x s (l-n-1)) foldl :: (a -> Char -> a) -> a -> ByteString -> a foldl f e ps | null ps = e | otherwise = let (h,t) = headTail ps in foldl f (f e h) t foldl1 :: (Char -> Char -> Char) -> ByteString -> Char foldl1 f ps | null ps = errorEmptyList "foldl1" | otherwise = let (h,t) = headTail ps in foldl f h t foldr :: (Char -> a -> a) -> a -> ByteString -> a foldr f e ps | null ps = e | otherwise = let (h,t) = headTail ps in f h (foldr f e t) foldr1 :: (Char -> Char -> Char) -> ByteString -> Char foldr1 f ps | null ps = errorEmptyList "foldr1" | otherwise = foldr f (last ps) (init ps) concatMap :: (Char -> ByteString) -> ByteString -> ByteString concatMap f = foldr (append . f) empty -- return how many 10xxxxxx bytes there are backwards from the ptr backFollow :: Ptr Word8 -> Int -> IO Int backFollow p x = do when (x > 3) (fail "too many follow bytes") (c :: Word8) <- peekByteOff p (-x) if (not (isFollow c)) then return x else backFollow p (x+1) -- put char into memory area and return number of bytes written putUTF8 :: Ptr Word8 -> Char -> IO Int putUTF8 p0 char = if ch < 0x80 then do poke p0 (toEnum ch) return 1 else if ch < 0x800 then do poke p0 (toEnum (b11000000 .|. bits12)) poke p1 (toEnum (b10000000 .|. bits6)) return 2 else if ch < 0x10000 then do poke p0 (toEnum (b11100000 .|. bits18)) poke p1 (toEnum (b10000000 .|. bits12)) poke p2 (toEnum (b10000000 .|. bits6)) return 3 else do -- if ch < 0x110000 poke p0 (toEnum (b11110000 .|. bits24)) poke p1 (toEnum (b10000000 .|. bits18)) poke p2 (toEnum (b10000000 .|. bits12)) poke p3 (toEnum (b10000000 .|. bits6)) return 4 where -- all calculations are made in Int here, for speed and no risk of -- overflow, portability etc. ch = fromEnum char bits6 = ch .&. b00111111 bits12 = (ch `shiftR` 6) .&. b00111111 bits18 = (ch `shiftR` 12) .&. b00111111 bits24 = (ch `shiftR` 18) -- no mask, since Char is limited (p1 :: Ptr Word8) = p0 `plusPtr` 1 (p2 :: Ptr Word8) = p0 `plusPtr` 2 (p3 :: Ptr Word8) = p0 `plusPtr` 3 -- utf8 diagram -- UTF-8 Char -- 1 byte 0xxxxxxx 0xxxxxxx -- 2 bytes 110zzzzx 10xxxxxx 00000zzz zxxxxxxx -- 3 bytes 1110zzzz 10zxxxxx 10xxxxxx zzzzzxxx xxxxxxxx -- 4 bytes 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx 000zzzzz xxxxxxxx xxxxxxxx -- the digits marked with z may not all be zero (overlong) -- get one char from pointer location -- only allowed to read int number of words -- int must be at least 1 -- return bytes consumed getUTF8 :: Ptr Word8 -> Int -> IO (Char, Int) getUTF8 p m = do c <- fmap fromEnum (peek p) if c < b10000000 then -- 0xxxxxxx return (unsafeChr c, 1) else if c < b11000000 then -- 10xxxxxx fail "invalid first byte" else if c < b11100000 then do -- 110zzzzx r <- decodeFollow p 1 (c .&. b00011111) m when (r < 0x80) (fail "overlong") return (unsafeChr r, 2) else if c < b11110000 then do -- 1110xxxx r <- decodeFollow p 2 (c .&. b00001111) m when (r < 0x800) (fail "overlong") return (unsafeChr r, 3) else if c < b11111000 then do -- 11110xxx r <- decodeFollow p 3 (c .&. b00000111) m when (r < 0x10000) (fail "overlong") return (unsafeChr r, 4) else -- 11111xxx fail "unicode value too large" where decodeFollow :: (Ptr Word8) -> Int -> Int -> Int -> IO Int decodeFollow _ 0 c _ = return c decodeFollow _ _ _ 0 = fail "premature end of string" decodeFollow q n c o = do let q' = q `plusPtr` 1 b <- fmap fromEnum (peek q' :: IO Word8) when (not (isFollow b)) (fail "invalid follow byte") decodeFollow q' (n-1) ((c `shiftL` 6) .|. (b .&. b00111111)) (o-1) isFollow :: Bits a => a -> Bool isFollow b = (b .&. b11000000) == b10000000 numBytes :: Char -> Int numBytes c | fromEnum c < 0x80 = 1 | fromEnum c < 0x800 = 2 | fromEnum c < 0x10000 = 3 | otherwise = 4 numBytesString :: String -> Int numBytesString s = sum (P.map numBytes s) b00000111,b00001111,b00011111,b00111111,b11111000,b11110000,b11100000,b11000000,b10000000 :: Num a => a b00000111 = 0x07 b00001111 = 0x0f b00011111 = 0x1f b00111111 = 0x3f b11111000 = 0xf8 b11110000 = 0xf0 b11100000 = 0xe0 b11000000 = 0xc0 b10000000 = 0x80 -- Common up near identical calls to `error' to reduce the number -- constant strings created when compiled: errorEmptyList :: String -> a errorEmptyList fun = error ("Data.ByteString.UTF8." ++ fun ++ ": empty ByteString") {-# INLINE errorEmptyList #-} -- unlines unlines :: [ByteString] -> ByteString unlines [] = empty unlines ss = (concat $ List.intersperse nl ss) `append` nl -- half as much space where nl = singleton newline -- unlines' unlines' :: [ByteString] -> ByteString unlines' [] = empty unlines' ss = (concat $ List.intersperse nl ss) where nl = singleton newline -- Below are simple, via-list implementations -- ...should have great potential for optimization :-) viaList :: ([Char] -> [Char]) -> ByteString -> ByteString viaList f = pack . f . unpack viaList2 :: ([Char]->([Char],[Char])) -> ByteString -> (ByteString,ByteString) viaList2 f bs = let (a,b) = f (unpack bs) in (pack a, pack b) take :: Int -> ByteString -> ByteString take n = fst . splitAt n drop :: Int -> ByteString -> ByteString drop n = snd . splitAt n inits :: ByteString -> [ByteString] inits bs@(PS x s _) = [PS x s n | n <- rawIndices bs] tails :: ByteString -> [ByteString] tails bs = if null bs then [empty] else bs : tails (tail bs) sort :: ByteString -> ByteString sort = viaList List.sort -- can be optimized to ignore follow bytes splitAt :: Int -> ByteString -> (ByteString, ByteString) splitAt n ps@(PS x s l) | n <= 0 = (empty, ps) | otherwise = withPtr x $ \p -> go (p `plusPtr` s) 0 l where go q i m | m <= 0 = return (ps, empty) | i >= n = return (PS x s (l-m), PS x (s+l-m) m) | otherwise = do k <- rawLength q go (q `plusPtr` k) (i+1) (m-k) group :: ByteString -> [ByteString] group = groupBy (==) reverse :: ByteString -> ByteString reverse = pack . List.reverse . unpack -- index index :: ByteString -> Int -> Char index = (!!) . unpack -- map map :: (Char -> Char) -> ByteString -> ByteString map f = viaList (List.map f) -- filter filter :: (Char -> Bool) -> ByteString -> ByteString filter p = viaList (List.filter p) -- filterChar filterChar :: Char -> ByteString -> ByteString filterChar c = viaList (List.filter (==c)) -- filterNotChar filterNotChar :: Char -> ByteString -> ByteString filterNotChar c = viaList (List.filter (/=c)) -- takeWhile takeWhile :: (Char -> Bool) -> ByteString -> ByteString takeWhile p = fst . span p -- dropWhile dropWhile :: (Char -> Bool) -> ByteString -> ByteString dropWhile p = snd . span p -- span span :: (Char -> Bool) -> ByteString -> (ByteString, ByteString) span p = break (not . p) -- spanEnd spanEnd :: (Char -> Bool) -> ByteString -> (ByteString, ByteString) spanEnd _ = error "not implemented yet" -- lines -- TODO: implement other types of line breaks! lines :: ByteString -> [ByteString] lines ps | null ps = [] | otherwise = case search ps of Nothing -> [ps] Just n -> B.take n ps : lines (B.drop (n+1) ps) where search = B.elemIndex newline lines' :: ByteString -> [ByteString] lines' bs = lines bs ++ if (not (null bs) && B.last bs == newline) then [singleton newline] else [] -- split split :: Char -> ByteString -> [ByteString] split c = let i = Char.ord c in if i < 128 then B.split (fromIntegral i) else splitWith (== c) -- words words :: ByteString -> [ByteString] words = tokens Char.isSpace -- unwords unwords :: [ByteString] -> ByteString unwords = join (singleton space) words' :: ByteString -> [ByteString] words' = splitWith Char.isSpace unwords' :: [ByteString] -> ByteString unwords' = unwords -- groupBy groupBy :: (Char -> Char -> Bool) -> ByteString -> [ByteString] groupBy f (PS x s l) = withPtr x $ \p -> go (p `plusPtr` s) 0 l where STRICT3(go) go q r m | m <= 0 = touchForeignPtr x >> return [] | otherwise = do (c,w) <- getUTF8 q m mi <- findRawIndex (not . f c) (q `plusPtr` w) (m-w) case mi of Nothing -> return [PS x (s+r) m] Just i -> do rest <- unsafeInterleaveIO $ go (q `plusPtr` (w+i)) (r+w+i) (m-w-i) return (PS x (s+r) (i+w):rest) -- intersperse intersperse :: Char -> ByteString -> ByteString intersperse c = viaList (List.intersperse c) -- any any :: (Char -> Bool) -> ByteString -> Bool any f (PS x s l) = withPtr x $ \p -> do mi <- findRawIndex f (p `plusPtr` s) l return $ case mi of Nothing -> False Just _ -> True -- all all :: (Char -> Bool) -> ByteString -> Bool all p = not . any (not . p) -- maximum maximum :: ByteString -> Char maximum = foldl1 max -- minimum minimum :: ByteString -> Char minimum = foldl1 min -- replicate replicate :: Int -> Char -> ByteString replicate n c | n <= 0 = empty | Char.ord c < 128 = B.replicate n (fromIntegral $ Char.ord c) | otherwise = unsafeCreate (n * numBytes c) $ \p -> go n p where go 0 _ = return () go n p = do k <- putUTF8 p c go (n-1) (p `plusPtr` k) -- elem elem :: Char -> ByteString -> Bool elem c = if Char.ord c < 128 then B.elem (fromIntegral $ Char.ord c) else List.elem c . unpack -- notElem notElem :: Char -> ByteString -> Bool notElem c = not . elem c -- find find :: (Char -> Bool) -> ByteString -> Maybe Char find p = List.find p . unpack -- elemIndex elemIndex :: Char -> ByteString -> Maybe Int elemIndex c = List.elemIndex c . unpack -- elemIndexLast elemIndexLast :: Char -> ByteString -> Maybe Int elemIndexLast _ = undefined -- findIndex findIndex :: (Char -> Bool) -> ByteString -> Maybe Int findIndex p = List.findIndex p . unpack -- elemIndices elemIndices :: Char -> ByteString -> [Int] elemIndices c = List.elemIndices c . unpack -- findIndices findIndices :: (Char -> Bool) -> ByteString -> [Int] findIndices p = List.findIndices p . unpack -- lineIndices lineIndices :: ByteString -> [Int] lineIndices = elemIndices '\n' -- breakChar breakChar :: Char -> ByteString -> (ByteString, ByteString) breakChar c = viaList2 (List.break (==c)) -- breakSpace breakSpace :: ByteString -> (ByteString,ByteString) breakSpace = break Char.isSpace -- spanChar spanChar :: Char -> ByteString -> (ByteString, ByteString) spanChar c = viaList2 (List.span (==c)) -- splitWith splitWith :: (Char -> Bool) -> ByteString -> [ByteString] splitWith f ps | null ps = [] | otherwise = let (first, rest) = break f ps t = tail rest -- need to add extra empty string if char splitted on was the last one in first:if null rest then [] else if null t then [empty] else splitWith f t -- tokens tokens :: (Char -> Bool) -> ByteString -> [ByteString] tokens f = P.filter (not.null) . splitWith f -- dropSpace dropSpace :: ByteString -> ByteString dropSpace = dropWhile Char.isSpace -- dropSpaceEnd dropSpaceEnd :: ByteString -> ByteString dropSpaceEnd = undefined -- joinWithChar joinWithChar :: Char -> ByteString -> ByteString -> ByteString joinWithChar c b1 b2 = concat [b1, packChar c, b2] -- zip zip :: ByteString -> ByteString -> [(Char,Char)] zip = zipWith (,) -- zipWith zipWith :: (Char -> Char -> a) -> ByteString -> ByteString -> [a] zipWith f b1 b2 | null b1 || null b2 = [] | otherwise = f (head b1) (head b2) : zipWith f (tail b1) (tail b2) -- count count :: Char -> ByteString -> Int count c bs | Char.ord c < 128 = B.count (fromIntegral (Char.ord c)) bs | otherwise = List.length (findSubstrings (packChar c) bs) -- | Perform an operation with a temporary ByteString withPtr :: ForeignPtr a -> (Ptr a -> IO b) -> b withPtr fp io = inlinePerformIO (withForeignPtr fp io) {-# INLINE withPtr #-} unzip :: [(Char,Char)] -> (ByteString,ByteString) unzip ls = (pack (P.map fst ls), pack (P.map snd ls)) transpose :: [ByteString] -> [ByteString] transpose ps = P.map pack (List.transpose (P.map unpack ps)) break :: (Char -> Bool) -> ByteString -> (ByteString, ByteString) break f ps@(PS x s l) = withPtr x $ \p -> do mi <- findRawIndex f (p `plusPtr` s) l return $ case mi of Nothing -> (ps, empty) Just i -> (PS x s i, PS x (s+i) (l-i)) -- internal function -- find raw index in byte array for first char matching findRawIndex :: (Char -> Bool) -> Ptr Word8 -> Int -> IO (Maybe Int) findRawIndex f p l = go p 0 where STRICT2(go) go q i | i >= l = return Nothing | otherwise = do (e, w) <- getUTF8 q (l-i) if f e then return (Just i) else go (q `plusPtr` w) (i+w) -- assumes correct input rawLength :: Ptr Word8 -> IO Int rawLength p = do c <- fmap fromEnum (peek p) return $ if c < b10000000 then 1 else if c < b11100000 then 2 else if c < b11110000 then 3 else if c < b11111000 then 4 else 5 -- any number would do rawIndices:: ByteString -> [Int] rawIndices (PS x s l) = withPtr x $ \p -> go 0 (p `plusPtr` s) l where STRICT3(go) go k q m | m <= 0 = touchForeignPtr x >> return [k] | otherwise = do i <- rawLength q is <- unsafeInterleaveIO $ go (k+i) (q `plusPtr` i) (m-i) return (k:is) breakFirst :: Char -> ByteString -> Maybe (ByteString, ByteString) breakFirst c xs = let (x,y) = breakChar c xs in if null y then Nothing else Just (x, tail y) findSubstring :: ByteString -> ByteString -> Maybe Int findSubstring b1 b2 = listToMaybe (findSubstrings b1 b2) -- use regular findSubstrings and map results back findSubstrings :: ByteString -> ByteString -> [Int] findSubstrings b1 b2 | null b1 = [0 .. length b2] | otherwise = [ i | (i,b) <- P.zip [0..] (tails b2), b1 `isPrefixOf` b ] -- Just like inlinePerformIO, but we inline it. Big performance gains as -- it exposes lots of things to further inlining -- {-# INLINE inlinePerformIO #-} inlinePerformIO :: IO a -> a #if defined(__GLASGOW_HASKELL__) inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r #else inlinePerformIO = unsafePerformIO #endif hash :: ByteString -> Int32 hash (PS x s l) = inlinePerformIO $ withForeignPtr x $ \p -> go (0 :: Int32) (p `plusPtr` s) l where go :: Int32 -> Ptr Word8 -> Int -> IO Int32 STRICT3(go) go h _ 0 = return h go h p n = do w <- peek p go (fromIntegral w + rotateL h 8) (p `plusPtr` 1) (n-1)