-- -- The Computer Language Shootout -- http://shootout.alioth.debian.org/ -- -- Contributed by Don Stewart, Chris Kuklewicz and Alson Kemp. -- Updated for ByteString by Chris Kuklewicz February, 2007 -- -- Compile with: -O2 -package parsec -- -- An idiomatic Haskell entry using lazy regex combinators, described in the paper: -- -- Manuel M. T. Chakravarty, Lazy Lexing is Fast. -- In A. Middeldorp and T. Sato, editors, Proceedings of Fourth Fuji -- International Symposium on Functional and Logic Programming, -- Springer-Verlag, LNCS 1722, 1999. import Monad import Data.Array (Array, assocs, accumArray,bounds,listArray) import Data.Array.Base (unsafeAt) import Data.ByteString.Base import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Char8 as BC import Word import List hiding (delete) import qualified Data.Map as M import System import qualified Text.ParserCombinators.Parsec as P import Text.ParserCombinators.Parsec ((<|>),(),pzero) ------------------------------------------------------------------------ main = putStr . work =<< B.getContents work fileIn = fileIn `seq` unlines $ counts ++ [[],l0, l1, l2] where l0 = show $ B.length fileIn clean = B.concat . delete "\n|>[^\n]+\n" $ fileIn l1 = show $ B.length clean counts = [ re++" "++show (count re clean) | re <- variants ] l2 = show $ L.length (iubsExpand clean) iubsExpand = L.fromChunks . foldr1 (.) (map replace iubs) . return variants = [ "agggtaaa|tttaccct" ,"[cgt]gggtaaa|tttaccc[acg]","a[act]ggtaaa|tttacc[agt]t" ,"ag[act]gtaaa|tttac[agt]ct","agg[act]taaa|ttta[agt]cct","aggg[acg]aaa|ttt[cgt]ccct" ,"agggt[cgt]aa|tt[acg]accct","agggta[cgt]a|t[acg]taccct","agggtaa[cgt]|[acg]ttaccct"] iubs = map (\(s,new) -> (regex s,BC.pack new)) $ [("B","(c|g|t)"),("D","(a|g|t)"),("H","(a|c|t)"),("K","(g|t)") ,("M","(a|c)") ,("N","(a|c|g|t)"),("R","(a|g)") ,("S","(c|g)"),("V","(a|c|g)"),("W","(a|t)") ,("Y","(c|t)")] -- And that's it! ------------------------------------------------------------------------ -- external interface to regular expressions regex = (either (error.show) accept) . (\x -> P.parse p_regex x x) where accept re = re (Lexer True Done) -- Close a regular expression into a Lexer. (!) a b = a `seq` unsafeAt a (fromEnum $ (b - fst (bounds a))) delete s = del where r = regex s del b = pieces 0 (run r b 0) where pieces end x | end `seq` x `seq` False = undefined pieces end [] = unsafeDrop end b : [] pieces end ((start,stop):rest) = unsafeTake (start-end) (unsafeDrop end b) : pieces stop rest count s b = runOverlappingCount r b 0 where r = regex s replace (r,new) = rep where rep [] = [] rep (b:bs) = b `seq` pieces 0 (run r b 0) where pieces 0 [] = b : rep bs pieces end [] | B.length b > end = unsafeDrop end b : rep bs | otherwise = rep bs pieces end ((start,stop):rest) | start > end = unsafeTake (start-end) (unsafeDrop end b) : new : pieces stop rest | otherwise = new : pieces stop rest run :: Lexer -> B.ByteString -> Int -> [(Int,Int)] run lexerIn b offsetIn = b `seq` loop offsetIn where end = B.length b loop offset | offset == end = [] | otherwise = let t@(start,stop) = lexOne b lexerIn offset in if start == -1 then [] else t : loop stop runOverlappingCount :: Lexer -> B.ByteString -> Int -> Int runOverlappingCount lexerIn b offsetIn = b `seq` loop offsetIn 0 where end = B.length b loop offset c | offset `seq` c `seq` False = undefined loop offset c | offset == end = c | otherwise = let start = fst $ lexOne b lexerIn offset in if start == -1 then c else loop (succ start) (succ c) -- -- Construct a regex combinator from a string regex (use Parsec) -- Designed to follow "man re_format" (Mac OS X 10.4.4) -- -- The regular expressions accepted by the program include those using -- |, empty group (), grouping with ( and ), wildcard '.', backslach -- escaped characters "\.", greedy modifiers ? + and *, bracketed -- alternatives including ranges such as [a-z0-9] and inverted -- brackets such as [^]\n-]. Only 7-bit Ascii accepted. -- 'p_regex' is the only "exported" function, used by 'regex' above p_regex = liftM (foldr1 (>|<)) (P.sepBy1 p_branch (P.char '|')) p_branch = liftM (($ epsilon).(foldr (.) id)) (P.many1 (p_atom >>= p_post_atom)) p_atom = P.try (P.string "()" >> return epsilon) <|> P.between (P.char '(') (P.char ')') p_regex <|> p_bracket <|> p_dot <|> p_escaped_char <|> p_other_char <|> (pzero "cannot parse regexp atom") p_post_atom atom = (P.char '?' >> return (atom `quest`)) <|> (P.char '+' >> return (atom `plus`)) <|> (P.char '*' >> return (atom `star`)) <|> (return (atom +>)) p_bracket = (P.char '[') >> ( (P.char '^' >> p_set True) <|> (p_set False) ) p_set invert = do initial <- (P.option "" ((P.char ']' >> return "]") <|> (P.char '-' >> return "-"))) middle <- P.manyTill P.anyChar (P.char ']') let expand [] = [] expand ('-':[]) = "-" expand (a:'-':b:rest) | a /= '-' = (enumFromTo a b)++(expand rest) expand (x:xs) | x /= '-' = x:(expand xs) | otherwise = error "A dash is in the wrong place in a p_set" characters = nub ( sort (initial ++ (expand middle)) ) return $ if invert then alt ( ['\0'..'\127'] \\ characters ) else alt characters p_dot = P.char '.' >> return (alt ['\0'..'\127']) p_escaped_char = P.char '\\' >> liftM char P.anyChar p_other_char = liftM char (P.noneOf specials) where specials = "^.[$()|*+?\\" -- -- And everything else is the modified CTK library. -- -- Compiler Toolkit: Self-optimizing lexers -- Author : Manuel M. T. Chakravarty -- -- tree structure used to represent the lexer table data Lexer = Lexer !Bool !Cont -- represent the continuation of a lexer -- on top of the tree, where entries are dense, we use arrays data Cont = Dense !BoundsNum !(Array Word8 Lexer) -- further down, where the valid entries are sparse, we -- use association lists, to save memory | Sparse !BoundsNum !(M.Map Word8 Lexer) -- end of a automaton | Done type Regexp = Lexer -> Lexer infixr 4 `quest`, `star`, `plus` infixl 3 +> -- Empty lexeme (noop) epsilon = id :: Regexp -- One character regexp char c = (\l -> Lexer False (Dense (B 1 w w) (listArray (w,w) [l]))) where w = c2w c -- accepts a non-empty set of alternative characters -- Equiv. to `(foldr1 (>|<) . map char) cs', but much faster alt cs = \l -> let bnds = B (length ws) (minimum ws) (maximum ws) in Lexer False (aggregate bnds [(w, l) | w <- ws]) where ws = map c2w cs -- accept a character sequence string cs = (foldr1 (+>) . map char) cs -- Concatenation of regexps is just concatenation of functions (+>) = (.) :: Regexp -> Regexp -> Regexp -- disjunctive combination of two regexps, corresponding to x|y re1 >|< re2 = \l -> re1 l >||< re2 l -- x `quest` y corresponds to the regular expression x?y quest re1 re2 = (re1 +> re2) >|< re2 -- x `plus` y corresponds to the regular expression x+y plus re1 re2 = re1 +> (re1 `star` re2) -- x `star` y corresponds to the regular expression x*y star re1 re2 = \l -> let self = re1 self >||< re2 l in self -- Scan forwards searching for a match anywhere at or after -- startOffset. Return offsets of first matching character and after -- last matching character or (-1,-1) for failure lexOne b lexerIn startOffset | b `seq` startOffset `seq` False = undefined lexOne b lexerIn startOffset = let stop = oneLexeme lexerIn startOffset (-1) in if stop == -1 then if startOffset < end then lexOne b lexerIn (succ startOffset) else (-1,-1) else (startOffset,stop) where end = B.length b oneLexeme (Lexer win cont) offset last = offset `seq` last `seq` let last' = if win then offset else last in if offset == end then last' -- at end, has to be this action else oneChar cont (unsafeIndex b offset) (succ offset) last' -- keep looking oneChar tbl c offset' last = c `seq` offset' `seq` last `seq` case peek tbl c of (Lexer win Done) -> if win then offset' else last l' -> oneLexeme l' offset' last peek _ c | c `seq` False = undefined peek (Dense bn arr) c | c `inBounds` bn = arr ! c peek (Sparse bn cls) c | c `inBounds` bn = M.findWithDefault (Lexer False Done) c cls peek _ _ = (Lexer False Done) -- disjunctive combination of two lexers (longest match, right biased) (Lexer win c) >||< (Lexer win' c') = Lexer (win || win') (joinConts c c') -- represents the number of (non-error) elements and the bounds of a -- DFA transition table data BoundsNum = B !Int !Word8 !Word8 -- combine two bounds addBoundsNum (B n lc hc) (B n' lc' hc') = B (n + n') (min lc lc') (max hc hc') -- check whether a character is in the bounds inBounds c (B _ lc hc) = c >= lc && c <= hc -- combine two disjunctive continuations joinConts Done c' = c' joinConts c Done = c joinConts c c' = let (bn , cls ) = listify c (bn', cls') = listify c' -- note: `addsBoundsNum' can, at this point, only -- approx. the number of *non-overlapping* cases; -- however, the bounds are correct in aggregate (addBoundsNum bn bn') (cls ++ cls') where listify (Dense n arr) = (n, assocs arr) listify (Sparse n cls) = (n, M.toList cls) -- we use the dense representation if a table has at least the given -- number of (non-error) elements denseMin = 1 :: Int -- Note: `n' is only an upper bound of the number of non-overlapping cases aggregate bn@(B n lc hc) cls | n >= denseMin = Dense bn (accumArray (>||<) (Lexer False Done) (lc, hc) cls) | otherwise = Sparse bn (M.fromList (accum (>||<) cls)) -- combine the elements in the association list that have the same key accum _ [] = [] accum f ((c, el):ces) = let (ce, ces') = gather c el ces in ce : accum f ces' where gather k e [] = ((k, e), []) gather k e (ke'@(k', e'):kes) | k == k' = gather k (f e e') kes | otherwise = let (ke'', kes') = gather k e kes in (ke'', ke':kes')