-- See: L. Allison. Types and classes of machine learning and data mining.
-- 26th Australasian Computer Science Conference (ACSC) pp.207-215,
-- Adelaide, February 2003
-- L. Allison. Models for machine learning and data mining in
-- functional programming. doi:10.1017/S0956796804005301
-- J. Functional Programming, 15(1), pp.15-32, Jan. 2005
-- Author: Lloyd ALLISON lloyd at bruce cs monash edu au
-- http://www.csse.monash.edu.au/~lloyd/tildeFP/II/200309/
-- This program is free software; you can redistribute it and/or modify it
-- under the terms of the GNU General Public License (GPL) as published by
-- the Free Software Foundation; either version 2 of the License, or (at
-- your option) any later version. This program is distributed in the hope
-- that it will be useful, but without any warranty, without even the implied
-- warranty of merchantability or fitness for a particular purpose. See the
-- GNU General Public License for more details. You should have received a
-- copy of the GNU General Public License with this program; if not, write to:
-- Free Software Foundation, Inc., Boston, MA 02111, USA.
module SM_Utilities (module SM_Utilities) where -- export all
latticeK1 = 1/12 -- lattice
latticeK2 = 5/(36 * sqrt(3)) -- constants
-- see J.H.Conway and N.J.A.Sloane. Sphere Packings, Lattices and Groups.
-- Springer 1998 3rd edn.
data Throw = H | T deriving (Enum, Bounded, Eq, Ord, Show) -- Mr. Bernoulli
instance Splits Throw where splits = splitsBE -- Splits Throw
-- ----------------------------------------------------------------------------
assert p x = assert1 p x "assertion failed"
assert1 p x why = if p x then x else error why
log2 = log 2.0
-- logPlusBase b (-logBase b p1) (-logBase b p2) = -logBase b (p1+p2)
logPlusBase b nlPr1 nlPr2 = -- NB. this is "OK" but there are faster ways.
let (bigger, smaller) =
if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1)
diff = bigger - smaller -- is non-negative
eps = b ** (-diff) -- between 0 and 1
nits = log b
in if diff * nits > 30 then smaller else smaller - (logBase b (1+eps))
-- the cutoff of 30 nits is somewhat(!) arbitrary.
-- and base e ...
logPlus nlPr1 nlPr2 = -- NB. this is "OK" but there are faster ways.
let (bigger, smaller) =
if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1)
diff = bigger - smaller -- is non-negative
eps = exp (-diff) -- between 0 and 1
in if diff > 30 then smaller else smaller - (log (1+eps))
normalise xs = -- scale xs to make sum to 1.0
let (ans, finalTotal) = n xs 0
n [] total = ([], total)
n (x:xs) total =
let (rest, sum) = n xs (total+x)
in ((x/finalTotal):rest, sum)
in ans
stirling n = -- Stirling's approximation
let ans = (n + 0.5)*(log n) - n + (log(2*pi)) / 2 -- ~ logBase e (n!)
in max ans 0
stirlingBase b n = (stirling n) / (log b) -- and to base b
-- ----------------------------------------------------------------------------
prng seed = (seed * (1 + 4*37*109) + 9999) `mod` (32 * 1024)
-- Pseudo Random Number Generator
-- ([0 .. 32K-1], only a 32K cycle, really should use a better one!)
-- Linear Congruential Pseudo Random Number Generator,
-- see: D. E. Knuth, The Art of Computer Programming, Vol. 2.
-- ----------------------------------------------L.Allison--CSSE--Monash--.au--
-- The class Splits and associated type and functions are primarily to do
-- with partitioning data, as in classification-trees do, although they might
-- have more uses, e.g. in clustering/ unsupervised-classification.
class Splits t where
splits :: [t] -> [Splitter t] -- i.e. can find ways of partitioning t-lists;
-- these ways should be proposed in order of decreasing prior-probability.
data Splitter t = Splitter Int (t -> Int) (() -> String)
-- i.e. arity partition_fn description
instance Show (Splitter t) where
show (Splitter _ _ description) = description()
aritySplitter (Splitter n _ _) = n
applySplitter (Splitter _ f _) d = f d -- apply it to one datum
-- partition a data-set, ds, into n parts according to s
split s ds = partition (aritySplitter s) (map (applySplitter s) ds) ds
-- The following is not allowed :-(
-- instance (Bounded t, Enum t) => Splits t where splits = splitsBE
-- because ... The instance type must be of the form (T a b c) ...
-- Splits discrete_type ...
instance Splits Bool where splits = splitsBE -- e.g. Splits Bool
-- Splits continous type ...
instance Splits Float where splits = splitsOrd -- e.g. Splits Float
instance Splits Double where splits = splitsOrd -- e.g. Splits Double
splitsBE [] = [] -- calculate the Splitter's for a Bounded Enum type
splitsBE (d:ds) =
let { lwb = minBound `asTypeOf` d; lwbn = fromEnum lwb;
upb = maxBound `asTypeOf` d; upbn = fromEnum upb;
arity = upbn - lwbn + 1 }
in if all ((==) d) ds then [] -- all the same, no variety in data set
else [Splitter arity (\u -> (fromEnum (u `asTypeOf` d) - lwbn))
(\()->"="++(show lwb)++(if arity > 2 then ".." else "|")++(show upb))]
-- If the type is also Ord(ered, e.g. Bad|Poor|...|VG) and if
-- arity is quite big, then it might be better to use splitsOrd lwb..upb .
splitsOrd ds = -- calculate Splitter's for an Ord(ered, inc' continuous) type
let splitPoints [] = [] -- lines like the next make it all worthwhile
splitPoints ds = (medianEtc . tail . unique . msort) ds -- !
in map (\cut -> Splitter 2 (\y -> if y < cut then 0 else 1)
(\() -> "<|>=" ++ show cut)) (splitPoints ds)
-- Splits tuples ...
instance (Splits t1, Splits t2) => Splits (t1, t2) where -- e.g. Splits Pair
splits xys =
let (xs, ys) = unzip xys
in interleave (splitsAttr 0 fst xs)
(splitsAttr 1 snd ys)
-- Note, we get hierarchical dataspaces done for free.
instance (Splits t1, Splits t2, Splits t3) => Splits (t1,t2,t3) where -- Triple
splits xyzs =
let (xs, ys, zs) = unzip3 xyzs
sx = splitsAttr 0 (\(x,_,_) -> x) xs
sy = splitsAttr 1 (\(_,y,_) -> y) ys
sz = splitsAttr 2 (\(_,_,z) -> z) zs
in interleave3 sx sy sz
instance (Splits t1, Splits t2, Splits t3, Splits t4) =>
Splits (t1,t2,t3,t4) where -- Quad
splits wxyzs =
let (ws, xs, ys, zs) = unzip4 wxyzs
sw = splitsAttr 0 (\(w,_,_,_) -> w) ws
sx = splitsAttr 1 (\(_,x,_,_) -> x) xs
sy = splitsAttr 2 (\(_,_,y,_) -> y) ys
sz = splitsAttr 3 (\(_,_,_,z) -> z) zs
in interleave4 sw sx sy sz
splitsAttr n sel xs = -- selected attribute
-- get splits for nth attribute of multivariate dataset
-- NB. sel must select the nth attribute !
let prefix = "@" ++ (show n)
in map (\(Splitter n f d)->Splitter n (f.sel) (\()->prefix++d())) (splits xs)
-- ------------------------------------- make tuples instances of class Enum --
-- NB. tuples of Bounded types are already in Bounded
instance (Enum t1,Bounded t1, Enum t2,Bounded t2) => Enum (t1,t2) where -- Pair
fromEnum (v1,v2) = -- NB. (minBound, minBound') -> 0
let min1 = fromEnum (minBound `asTypeOf` v1)
min2 = fromEnum (minBound `asTypeOf` v2)
max2 = fromEnum (maxBound `asTypeOf` v2)
width = max2 - min2 + 1
in (fromEnum v1 - min1)*width + (fromEnum v2 - min2)
toEnum = -- had a big battle to remove 'ambiguous type' errors in this
let (mnB1, mnB2) = minBound `asTypeOf` (te 0) -- !
(mxB1, mxB2) = maxBound `asTypeOf` (mnB1, mnB2)
(min1, min2) = (fromEnum mnB1, fromEnum mnB2)
(max1, max2) = (fromEnum mxB1, fromEnum mxB2)
width = max2 - min2 + 1
te n = (toEnum(min1 + n `div` width), toEnum(min2 + n `mod` width))
in te
instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3) =>
Enum (t1,t2,t3) where -- triple
fromEnum (v1,v2,v3) = fromEnum (v1,(v2,v3))
toEnum n = let (v1,(v2,v3)) = toEnum n in (v1,v2,v3)
instance (Enum t1,Bounded t1, Enum t2,Bounded t2, -- quad
Enum t3,Bounded t3, Enum t4,Bounded t4) => Enum (t1,t2,t3,t4) where
fromEnum (v1,v2,v3,v4) = fromEnum (v1,(v2,v3,v4))
toEnum n = let (v1,(v2,v3,v4)) = toEnum n in (v1,v2,v3,v4)
instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3, -- five
Enum t4,Bounded t4, Enum t5,Bounded t5) => Enum (t1,t2,t3,t4,t5) where
fromEnum (v1,v2,v3,v4,v5) = fromEnum (v1,(v2,v3,v4,v5))
toEnum n = let (v1,(v2,v3,v4,v5)) = toEnum n in (v1,v2,v3,v4,v5)
-- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au--
-- some ordinary but useful List functions...
unzip4 = foldr (\(w,x,y,z) ~(ws,xs,ys,zs) -> (w:ws, x:xs, y:ys, z:zs))
([], [], [], []) -- standard prelude has up to unzip3
-- given a list of heads & a list of tails (lists), put each head on its tail.
prepend heads tails = zipWith (:) heads tails
addToList n x lsts = -- add x to the nth List
let add n [] = add n [[]] -- prem' end of Lists, so lengthen!
add 0 (lst:lsts) = (x:lst) : lsts -- add x to this lst
add n (lst:lsts) = lst:(add (n-1) lsts)
in add n lsts
partition n indxs elts = -- partition elts into n parts according to indxs
let p [] [] ans = ans
p (i:is) (x:xs) ans = p is xs (addToList i x ans) -- add x to i-th part
in p indxs elts (replicate n [])
interleave [] ys = ys
interleave xs [] = xs -- interleave the elements of two lists
interleave (x:xs) (y:ys) = x : y : (interleave xs ys)
interleave3 [] ys zs = interleave ys zs
interleave3 xs [] zs = interleave xs zs
interleave3 xs ys [] = interleave xs ys
interleave3 (x:xs) (y:ys) (z:zs) = x : y : z : (interleave3 xs ys zs)
interleave4 ws xs ys zs = interleave (interleave ws ys) (interleave xs zs)
unique [] = [] -- POST: no duplicates in output
unique (x:xs) = -- PRE: any duplicates in input are adjacent
let u _ [] = []
u z (x:xs) = if z == x then u z xs else x : (u x xs)
in x : (u x xs)
merge [] bs = bs
merge as [] = as -- merge two sorted(!) lists
merge (aas@(a:as)) (bbs@(b:bs)) =
if a <= b then a : (merge as bbs) else b : (merge aas bs)
msort [] = []
msort [x] = [x] -- merge sort
msort inList = -- an (unstable) merge-sort, cos it is simple!
let (as, bs) = splt inList [] []
splt [] as bs = (as, bs)
splt (x:xs) as bs = splt xs bs (x:as)
in merge (msort as) (msort bs)
-- NB. This code for medianEtc is an improvement on the 2002 test code both
-- in complexity and also in the order in which its results are produced.
medianEtc [] = [] -- returns [median, quartiles, octiles, ...]
medianEtc s = -- PRE: s is sorted
let add _ 0 _ 0 _ ss = ss -- n1 = n2 = 0
add _ n1 s1 0 _ ss = ((n1,s1):ss) -- n1 > 0, n2 = 0
add _ 0 _ n2 s2 ss = ((n2,s2):ss) -- n1 = 0, n2 > 0
add True n1 s1 n2 s2 ss = (n1,s1) : (n2,s2) : ss -- n1 > 0, n2 > 0
add False n1 s1 n2 s2 ss = (n2,s2) : (n1,s1) : ss -- switch _2, _1
select fwd [] [] = []
select fwd [] level = select (not fwd) level [] -- down a level
-- param 3 of select is an accum'ing buffer of next level of xyz-iles
select fwd ((n,s):l1) l2 = -- NB. n == |s| (usable part), n >= 1
let n1 = n `div` 2 -- 1->0, 2->1, 3->1 ... i.e. |small| (proper)
n2 = n - n1 -- 1->1, 2->1, 3->2 ... i.e. |median:big|
small = s -- or at any rate small's 1st n1 elements
(median:big) = drop n1 s
in median : (select fwd l1 (add fwd n1 small (n2-1) big l2))
-- note the breadth-first traversal
in select True [(length s, s)] []
-- Guaranteed "OK" behaviour, alternatively might try Hoare's Find, one day.
-- Neighbouring values in the result are as similar as possible.
-- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au--
test01 = print "-- test01 --"
>> print( "prng 17 ... = " ++ show( take 8 (iterate prng 17) ) )
>> (let sb1010 = stirlingBase 10 10
in print("stirlingBase 10 10 = "++show sb1010
++ ", 10**_="++show (10**sb1010) ))
>> print( "normalise [1,1,2,4] = " ++ show( normalise [1,1,2,4] ) )
>> print( "logPlus 7 7 = " ++ show( logPlus 7 7 ) )
>> print( "logPlusBase 2 7 7 = " ++ show( logPlusBase 2 7 7 ) )
>> print( "partition 2 ... ... = "
++ show( partition 2 [0,1,3,0,1] ["fst1","snd1","fth","fst2","snd2"] ) )
>> print( "msort [3,1,6,5,...] = " ++ show( msort [3,1,5,3,3,2,4] ) )
>> print( "medianEtc ... = "
++ show( medianEtc [1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15] ) )
>> print( "splits[(H,(True,..) = "
++ show( splits [(H,(True,4.0)),(H,(False,1.0 :: Double)),
(T,(True,3.0)),(H,(True, 2.0))]))
-- ----------------------------------------------------------------------------