forked from composewell/streamly
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordClassifier.hs
More file actions
75 lines (68 loc) · 2.5 KB
/
Copy pathWordClassifier.hs
File metadata and controls
75 lines (68 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleInstances #-}
#if __GLASGOW_HASKELL__ >= 800
{-# OPTIONS_GHC -Wno-orphans #-}
#endif
-- compile with:
-- ghc -O2 -fspec-constr-recursive=10 -fmax-worker-args=16 word-classifier.hs
--
import qualified Data.Char as Char
import Data.Foldable
import Data.Function ((&))
import Data.Functor.Identity (Identity(..))
import qualified Data.HashMap.Strict as Map
import Data.Hashable
import Data.IORef
import qualified Data.List as List
import qualified Data.Ord as Ord
import Foreign.Storable (Storable(..))
import qualified Streamly.Data.Unicode.Stream as S
import qualified Streamly.Internal.Data.Unicode.Stream as S
import qualified Streamly.Data.Fold as FL
import qualified Streamly.Internal.Data.Fold as IFL
import qualified Streamly.Internal.Data.Unfold as IUF
import qualified Streamly.Internal.FileSystem.File as File
import qualified Streamly.Memory.Array as A
import qualified Streamly.Prelude as S
import System.Environment (getArgs)
instance (Enum a, Storable a) => Hashable (A.Array a) where
hash arr = runIdentity $ IUF.fold A.read IFL.rollingHash arr
hashWithSalt salt arr = runIdentity $
IUF.fold A.read (IFL.rollingHashWithSalt salt) arr
{-# INLINE toLower #-}
toLower :: Char -> Char
toLower c
| uc >= 0x61 && uc <= 0x7a = c
| otherwise = Char.toLower c
where
uc = fromIntegral (Char.ord c) :: Word
{-# INLINE isAlpha #-}
isAlpha :: Char -> Bool
isAlpha c
| uc >= 0x61 && uc <= 0x7a = True
| otherwise = Char.isAlpha c
where
uc = fromIntegral (Char.ord c) :: Word
main :: IO ()
main = do
inFile <- fmap head getArgs
-- Write the stream to a hashmap consisting of word counts
mp <-
let
alter Nothing = fmap Just $ newIORef (1 :: Int)
alter (Just ref) = modifyIORef' ref (+ 1) >> return (Just ref)
in File.toBytes inFile -- SerialT IO Word8
& S.decodeLatin1 -- SerialT IO Char
& S.map toLower -- SerialT IO Char
& S.words FL.toList -- SerialT IO String
& S.filter (all isAlpha) -- SerialT IO String
& S.foldlM' (flip (Map.alterF alter)) Map.empty -- IO (Map String (IORef Int))
-- Print the top hashmap entries
counts <-
let readRef (w, ref) = do
cnt <- readIORef ref
return (w, cnt)
in Map.toList mp
& mapM readRef
traverse_ print $ List.sortOn (Ord.Down . snd) counts
& List.take 25