-------------------------------------------------------------------------------
-- SICSA MultiCore Challenge Phase I: Concordance
-- For details visit: http://www.macs.hw.ac.uk/sicsawiki/index.php/Challenge_PhaseI
-- Haskell sequential implementation
-- Author: Thomas Horstmeyer
-----------------------------------------------------------------------------

{-# LANGUAGE ScopedTypeVariables #-}

import System(getArgs)
import Control.Monad(when)
import Data.Char
import Data.List
import qualified List
import Data.Ord(comparing)

-------------------------------------------------------------------------------

type Index a = (Int, [a]) -- (pos, following words)

toInt :: Index a -> Int
toInt = fst

indices :: [a] -> [Index a]
indices [] = []
indices xs = zip [0..length xs - 1] (iterate tail xs)
-- todo: perhaps indizes as fold? avoids first list traversal (for length).

compareByIndex :: (Ord a) => Index a -> Index a -> Ordering
compareByIndex = comparing (head.snd)

equalByIndex :: (Eq a) => Index a -> Index a -> Bool
equalByIndex (_, []) (_, []) = False
equalByIndex (_, x:xs) (_, y:ys) = x == y

get :: Index a -> a
get (_, x:xs) = x
get _ = error "no such element"

next :: Index a -> Index a
next (i, x:xs) = (i+1, xs)
next _ = error "no next index"

isValidIndex :: Index a -> Bool
isValidIndex (_, []) = False
isValidIndex _ = True

-----

genNextLength :: forall a. (Ord a, Eq a) =>  Bool -> ([a], [Index a]) -> [([a], [Index a])]
genNextLength keepSingletons (xs, is) = map f lists where
  f :: [Index a] -> ([a], [Index a])
  f js = (get (head js):xs, map next js)
  lists = if keepSingletons then indexLists
			    else filter hasMultipleElements indexLists
  indexLists :: [[Index a]]
  indexLists = groupBy equalByIndex $ sortBy compareByIndex $ filter isValidIndex is

hasMultipleElements :: [a] -> Bool
hasMultipleElements (_:_:_) = True
hasMultipleElements _ = False

transform :: ([a], [Index a]) -> ([a], (Int, [Int]))
transform (xs, is) = (reverse xs, (length is, map (flip (-) (length xs) . toInt) is))

-----------------------------------------------------------------------------
-- this section copied from reference implementation Concordance.hs

-- test input
l = ["half","a","league","half","a","league","half","a","league","onward"]

-- the output should be a newline-separated sequence of entries like this:
-- <"to be":(3,[60,113,545])>
-- i.e. the sequence "to be" has 3 occurrences, starting at indices 60, 113, 545
-- (counting words, starting with 0)

-- show result, in format: <se
showS [] = ""
showS xs = show (List.concat (List.intersperse " " (map id xs)))

showL [] = []
showL ((s1,i1):t) =
 "<"++showS s1++":"++show i1++">\n"++showL t

mySplit :: String -> [String] -> String -> [String]
mySplit acc accs [] = if (not (null acc))
                        then acc:accs
                        else accs
mySplit acc accs (c:cs) | isSpace c = if (not (null acc))
                                        then mySplit [] (acc:accs) cs
                                        else mySplit [] (accs) cs
               		| isPunctuation c = mySplit acc accs cs
               		| otherwise = mySplit (toLower c:acc) accs cs


-----------------------------------------------------------------------------


main =
 do 
   args <- getArgs
   when (length args < 2) $
     error "Usage: concordance <LEN> <FILE>"
   let n = read (args!!0) :: Int
   let keepSingletons = False
   input <- readFile (args!!1)
   let inps = reverse (map reverse (mySplit [] [] input)) 
   let res = map transform $ concat $ take n $ iterate (concat . map (genNextLength keepSingletons)) (genNextLength keepSingletons ([], indices inps))
   putStrLn (showL res)
