library(rebmix)
library(stats)
library(ggplot2)
library(tidyr)
library(gridExtra)
library(cluster)
library(jpeg)
library(cluster.datasets)


data(all.mammals.milk.1956)


#Veiem com són les dades
str(all.mammals.milk.1956)

'data.frame':	25 obs. of  6 variables:
 $ name   : chr  "Horse" "Orangutan" "Monkey" "Donkey" ...
 $ water  : num  90.1 88.5 88.4 90.3 90.4 87.7 86.9 82.1 81.9 81.6 ...
 $ protein: num  2.6 1.4 2.2 1.7 0.6 3.5 4.8 5.9 7.4 10.1 ...
 $ fat    : num  1 3.5 2.7 1.4 4.5 3.4 1.7 7.9 7.2 6.3 ...
 $ lactose: num  6.9 6 6.4 6.2 4.4 4.8 5.7 4.7 2.7 4.4 ...
 $ ash    : num  0.35 0.24 0.18 0.4 0.1 0.71 0.9 0.78 0.85 0.75 ...


#Mostrem les 6 primeres observacions
head(all.mammals.milk.1956)


#Començem fem una representació dels valors que pren cada 
plot1 <- all.mammals.milk.1956 %>% 
    ggplot(aes(x = "all mammals", y = water)) + 
    geom_jitter(width = .025, height = 0, size = 2, alpha = .5, color = "blue") +
    labs(x = "", y="aigua")
  
plot2 <-  all.mammals.milk.1956 %>%
  ggplot(aes(x = "all mammals", y = protein)) + 
    geom_jitter(width = .02, height = 0, size = 2, alpha = .6,  color = "orange") +
    labs(x = "", y="proteïna")
  
plot3 <-  all.mammals.milk.1956 %>%
    ggplot(aes(x = "all mammals", y = fat)) + 
    geom_jitter(width = .02, height = 0, size = 2, alpha = .6,  color = "green") +
    labs(x = "", y="greix")
  
plot4 <-  all.mammals.milk.1956 %>%
    ggplot(aes(x = "all mammals", y = lactose)) + 
    geom_jitter(width = .02, height = 0, size = 2, alpha = .6,  color = "red") +
    labs(x = "", y="lactosa")
  
plot5 <-  all.mammals.milk.1956 %>%
    ggplot(aes(x = "all mammals", y = ash)) + 
    geom_jitter(width = .02, height = 0, size = 2, alpha = .6,  color = "violet") +
    labs(x = "", y="minerals")
  
grid.arrange(plot1, plot2, plot3, plot4, plot5)


obs <- all.mammals.milk.1956[,2:6]
wss <- (nrow(obs)-1)*sum(apply(obs,2,var))
for(i in 1:20){
    wss[i] <- sum(kmeans(obs, centers=i)$withinss)
    }
    plot(1:20, wss, type="b", xlab="No. of Clusters", ylab="wss")
round(wss,3)


set.seed(20)
allmamals.clu <- kmeans(obs, 4, nstart = 20)
allmamals.clu

K-means clustering with 4 clusters of sizes 7, 6, 10, 2

Cluster means:
             water           protein              fat          lactose
1 81.1857142857143  7.42857142857143  6.9000000000000 4.01428571428571
2 68.3333333333333  9.55000000000000 17.4166666666667 2.91666666666667
3 88.5000000000000  2.57000000000000  2.8000000000000 5.68000000000000
4 45.6500000000000 10.15000000000000 38.4500000000000 0.45000000000000
                ash
1 0.931428571428571
2 1.330000000000000
3 0.485000000000000
4 0.690000000000000

Clustering vector:
 [1] 3 3 3 3 3 3 3 1 1 1 1 3 3 1 3 1 1 2 2 2 2 2 2 4 4

Within cluster sum of squares by cluster:
[1]  63.5349142857143 191.9610000000000  59.4122500000000  27.1912000000000
 (between_SS / total_SS =  95.1 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"


#Nombre d'observacions assignades a cada clúster
table(allmamals.clu$cluster)

 1  2  3  4 
 7  6 10  2


sil <- silhouette(allmamals.clu$cluster, dist(obs))
plot(sil)


kmeanspropi <-function(data, K=4, tol=1e-5){
    # Inicialitzem els prototips (aleatòriament d'entre el conjunt dobservacions)
    prototips <-data[sample.int(nrow(data),K), 1:2]
    
    # La variable dif emmagatzema la diferència entre els prototips; la inicialitzem  a un valor arbitràriament gran (per entrar en el 'while')
    dif <-1000
    
    
    # Inicialitzem un vector per guardar les assignacions dels punts als clústers a cada iteració
    cluster <-rep(0,nrow(data))
    
   
    it<-1
    convergencia<-FALSE
    while(dif>=tol && convergencia==FALSE) {
        it=it+1
        if(dif<=tol){
            convergencia<-TRUE
        }
        prototips_old <-prototips
        
        # Assignem cada punt del conjunt de dades al clúster més proper (distància euclidiana)
        for (i in 1:nrow(data)){
            dist_min<-10e10
            for (prototip in 1:nrow(prototips)){
                dist_prototip <-sum((prototips[prototip,1:2]-data[i,1:2])^2)
                if (dist_prototip<=dist_min){
                    cluster[i] <-prototip
                    dist_min <-dist_prototip
                }
            }
        } 

        
        # Reestimem els nous prototips (mitjana dels punts que han estat assignats al clúster corresponent en el pas previ)
        for (alpha in 1:nrow(prototips)){
            prototips[alpha,1:2]=apply(as.matrix(data[cluster==alpha,1:2]),2,mean)
        }
        
        dif <-mean(abs(as.matrix(prototips_old-prototips)))
    }
    
    return(list(result=data.frame(data,cluster)))
}


allmamals.clu2 <-kmeanspropi(obs,4,1e-5)


sil <- silhouette(allmamals.clu2$result$cluster, dist(obs))
plot(sil)


allmamals.clu2$resul


#Llegim la imatge
img <- readJPEG("imagecompression.jpg")


#Mostrem la imatge
options(repr.plot.width=5,repr.plot.height=5)
plot(c(0,1),c(0,1),"n",xlab="",ylab="")
rasterImage(img,0,0,1,1)


#Extraiem la dimensió de la imatge
imgDm <- dim(img)
imgDm


#Separem la imatge en les seves components RGB (colors primaris)
imgRGB <- data.frame(
  x = rep(1:imgDm[2], each = imgDm[1]),
  y = rep(imgDm[1]:1, imgDm[2]),
  R = as.vector(img[,,1]),
  G = as.vector(img[,,2]),
  B = as.vector(img[,,3])
)


#Creem una funció per personalitzar la representació de les imatges
plotTheme <- function() {
  theme(
    panel.background = element_rect(
      size = 3,
      colour = "black",
      fill = "white"),
    axis.ticks = element_line(
      size = 2),
    panel.grid.major = element_line(
      colour = "gray80",
      linetype = "dotted"),
    panel.grid.minor = element_line(
      colour = "gray90",
      linetype = "dashed"),
    axis.title.x = element_text(
      size = rel(1.2),
      face = "bold"),
    axis.title.y = element_text(
      size = rel(1.2),
      face = "bold"),
    plot.title = element_text(
      size = 20,
      face = "bold",
      vjust = 1.5)
  )
}


#Usem la funicó que hem definit per mostrar primer la nostra imatge original
ggplot(data = imgRGB, aes(x = x, y = y)) + 
    geom_point(colour = rgb(imgRGB[c("R", "G", "B")])) +
    labs(title = paste("Imatge original")) +
    xlab("x") +
    ylab("y") +
plotTheme()


img10 <- kmeans(imgRGB[, c("R", "G", "B")], centers = 10)
colors <- rgb(img10$centers[img10$cluster,])


ggplot(data = imgRGB, aes(x = x, y = y)) + 
  geom_point(colour = colors) +
  labs(title = paste("k-mitjanes amb k = 10 colors")) +
  xlab("x") +
  ylab("y") + 
plotTheme()


# Definim un model de mixtura de gaussianes

n <- c(50, 150, 75, 125, 100) 

# La mixtura té cinc components. El total d'observacions és 500. La primera component en té 50, per tant π_1 = 0.1
# Anàlogament π_2 = 0.3 , π_3 = 0.15 , π_4 = 0.25 , π_5 = 0.2

Theta <- new("RNGMVNORM.Theta", c = 5, d = 2) 

#Theta contindrà els paràmetres de la mixtura de K = 5 (components) i  d = 2 (dimensió/variables observades)

# mu (mijtanes): vector d-dimensional
a.theta1(Theta, 1) <- c(2.7, 3.7)
a.theta1(Theta, 2) <- c(5.7, 9.1)
a.theta1(Theta, 3) <- c(2, 9)
a.theta1(Theta, 4) <- c(9.5, 6.6)
a.theta1(Theta, 5) <- c(6.3, 0.6)

# Sigma (variàncies): matriu real, simètrica i definida positiva, de dimensió d x d
a.theta2(Theta, 1) <- c(0.9, -0.1, -0.1, 0.4)
a.theta2(Theta, 2) <- c(2.8, -1.3, -1.3, 1.5)
a.theta2(Theta, 3) <- c(0.1, 0, 0, 0.3)
a.theta2(Theta, 4) <- c(1.3, -0.4, -0.4, 0.4)
a.theta2(Theta, 5) <- c(0.5, 0.3, 0.3, 2.5)

# Generem aleatòriament la mostra
mvnorm.simulated <- RNGMIX(model = "RNGMVNORM", Dataset.name = "mvnormdataset", rseed = -1, n = n, Theta = a.Theta(Theta))

RNGMIX Version 2.13.1

Dataset = mvnormdataset


#Vegem les primeres 6 observacions que ens ha generat aquest procés
head(mvnorm.simulated@Dataset$mvnormdataset)


#Fem un gràfic dels punts (observacions) de la mostra aleatòria que hem generat
plot(mvnorm.simulated)


#Usem la funicó REBMIX del paquet 'rebmix' per estimar el model de mixtura
mvnormest <- REBMIX(model = "REBMVNORM", Dataset = a.Dataset(mvnorm.simulated), Preprocessing = "histogram", cmax = 20, Criterion = "BIC")

REBMIX Version 2.13.1

Dataset = mvnormdataset


#Grafiquem el model de mixtura obtingut
plot(mvnormest)


mvnormest

An object of class "REBMVNORM"
Slot "w":
[1] 0.146895813464989 0.260051424291348 0.194002356457407 0.190199088554626
[5] 0.106088223921686 0.102763093309944
Slot "Theta":
$pdf1
[1] normal normal

$theta1.1
[1] 1.97479881153781 9.11311226419078

$theta2.1
[1] 0.1815090816359257 0.0171283190432554 0.0171283190432554 0.6059462229556800

$pdf2
[1] normal normal

$theta1.2
[1] 9.48473870286257 6.54916960016614

$theta2.2
[1]  2.107541502395406 -0.560617301737565 -0.560617301737565  0.470306432272665

$pdf3
[1] normal normal

$theta1.3
[1] 6.330867466978292 0.254603835905561

$theta2.3
[1] 0.555546717935158 0.461260108702528 0.461260108702528 2.196408687256392

$pdf4
[1] normal normal

$theta1.4
[1] 4.77224898612766 9.62350400277331

$theta2.4
[1]  1.919141324311902 -0.456668653190171 -0.456668653190171  0.852438489761127

$pdf5
[1] normal normal

$theta1.5
[1] 2.82166870872033 3.80276539727387

$theta2.5
[1] 1.375159426399008 0.102756529295187 0.102756529295187 0.586736883527212

$pdf6
[1] normal normal

$theta1.6
[1] 7.56192190706474 8.08442812866403

$theta2.6
[1]  1.798500367510549 -0.189029098392155 -0.189029098392155  0.386302896755296

Slot "summary":
        Dataset Preprocessing Criterion c v/k               IC
1 mvnormdataset     histogram       BIC 6  14 4251.10470170412
               logL  M
1 -2016.79670912967 35


EM <- new("EM.Control", strategy = "single", variant = "EM", acceleration = "fixed", acceleration.multiplier = 1, tolerance = 1e-04, maximum.iterations = 1000, K = 0)
K <- optbins(Dataset = a.Dataset(mvnorm.simulated), Rule = "Knuth equal", kmin = 2, kmax = 100)
mvnormest.em <- REBMIX(model = "REBMVNORM", Dataset = a.Dataset(mvnorm.simulated), Preprocessing = "histogram", cmax = 20, K = K, Criterion = "BIC", EMcontrol = EM)

REBMIX Version 2.13.1

Dataset = mvnormdataset


summary(mvnormest.em)

        Dataset Preprocessing Criterion c v/k               IC
1 mvnormdataset     histogram       BIC 5  11 4187.41470947874
               logL  M
1 -2003.59553731225 29
Maximum logL = -2003.59553731225 at pos = 1.


plot(mvnormest.em)


mvnormest.em

An object of class "REBMVNORM"
Slot "w":
[1] 0.152279635754161 0.275358384808801 0.199843621310076 0.100173516773880
[5] 0.272344841353083
Slot "Theta":
$pdf1
[1] normal normal

$theta1.1
[1] 2.02428656449457 9.07574734127331

$theta2.1
[1] 0.10981483707693809 0.00352370387753491 0.00352370387753491
[4] 0.46913200788209303

$pdf2
[1] normal normal

$theta1.2
[1] 9.44659813907218 6.63688235904895

$theta2.2
[1]  1.933553716924786 -0.623478526344136 -0.623478526344136  0.450227158988646

$pdf3
[1] normal normal

$theta1.3
[1] 6.269368825668520 0.324380333584456

$theta2.3
[1] 0.467421529302982 0.384510592759515 0.384510592759515 2.219716132871343

$pdf4
[1] normal normal

$theta1.4
[1] 2.68807681282499 3.83486159862061

$theta2.4
[1]  0.9110468749452292 -0.0287615211342961 -0.0287615211342961
[4]  0.4500137858474870

$pdf5
[1] normal normal

$theta1.5
[1] 5.68847779853623 9.13463414245424

$theta2.5
[1]  3.08963942399193 -1.16287620792059 -1.16287620792059  1.24108251562985

Slot "summary":
        Dataset Preprocessing Criterion c v/k               IC
1 mvnormdataset     histogram       BIC 5  11 4187.41470947874
               logL  M
1 -2003.59553731225 29


cat("Iteracions: ", a.summary.EM(mvnormest.em, pos = 1, col.name = "total.iterations.nbr") )

Iteracions:  207


EM.normal <- new("EM.Control", strategy = "exhaustive", variant = "EM", acceleration = "fixed", acceleration.multiplier = 1, tolerance = 1e-04, maximum.iterations = 1000, K = 0)
mvnormestest.em.normal <- REBMIX(model = "REBMVNORM", Dataset = a.Dataset(mvnorm.simulated), Preprocessing = "histogram", cmax = 15, Criterion = "BIC", EMcontrol = EM.normal)

REBMIX Version 2.13.1

Dataset = mvnormdataset


summary(mvnormestest.em.normal)

        Dataset Preprocessing Criterion c v/k              IC              logL
1 mvnormdataset     histogram       BIC 5  10 4187.3926805214 -2003.58452283358
   M
1 29
Maximum logL = -2003.58452283358 at pos = 1.


plot(mvnormestest.em.normal)


mvnormestest.em.normal

An object of class "REBMVNORM"
Slot "w":
[1] 0.152284985073041 0.274921060140380 0.199843614718997 0.100173515328916
[5] 0.272776824738665
Slot "Theta":
$pdf1
[1] normal normal

$theta1.1
[1] 2.02432486842025 9.07569168839743

$theta2.1
[1] 0.10982941869482202 0.00354384455037328 0.00354384455037328
[4] 0.46900870887927582

$pdf2
[1] normal normal

$theta1.2
[1] 9.44869793968046 6.63550861870828

$theta2.2
[1]  1.931305322008650 -0.622164115054210 -0.622164115054210  0.449014605989509

$pdf3
[1] normal normal

$theta1.3
[1] 6.269368819011667 0.324380140500414

$theta2.3
[1] 0.467421578809030 0.384510553145366 0.384510553145366 2.219715054978578

$pdf4
[1] normal normal

$theta1.4
[1] 2.68807677888187 3.83486155882545

$theta2.4
[1]  0.9110468106082343 -0.0287616236059882 -0.0287616236059882
[4]  0.4500136951479720

$pdf5
[1] normal normal

$theta1.5
[1] 5.69243712082065 9.13204634426902

$theta2.5
[1]  3.09671203987169 -1.16803398563731 -1.16803398563731  1.24419425992394

Slot "summary":
        Dataset Preprocessing Criterion c v/k              IC              logL
1 mvnormdataset     histogram       BIC 5  10 4187.3926805214 -2003.58452283358
   M
1 29


cat("Iteracions: ", a.summary.EM(mvnormestest.em.normal, pos = 1, col.name = "total.iterations.nbr"))

Iteracions:  775


#Usem paràmetre d'acceleració fixa, a_{EM}=1.5


EM.fixed1.5 <- new("EM.Control", strategy = "exhaustive", variant = "EM", acceleration = "fixed", acceleration.multiplier = 1.5, tolerance = 1e-04, maximum.iterations = 1000, K = 0)
mvnormest.em.fixed1.5 <- REBMIX(model = "REBMVNORM", Dataset = a.Dataset(mvnorm.simulated), Preprocessing = "histogram", cmax = 15, Criterion = "BIC", EMcontrol = EM.fixed1.5)

REBMIX Version 2.13.1

Dataset = mvnormdataset


summary(mvnormest.em.fixed1.5)

        Dataset Preprocessing Criterion c v/k               IC
1 mvnormdataset     histogram       BIC 5  11 4187.27258636969
               logL  M
1 -2003.52447575772 29
Maximum logL = -2003.52447575772 at pos = 1.


plot(mvnormest.em.fixed1.5)


mvnormest.em.fixed1.5

An object of class "REBMVNORM"
Slot "w":
[1] 0.152168395036592 0.272273714652508 0.199843564849612 0.100173511577744
[5] 0.275540813883544
Slot "Theta":
$pdf1
[1] normal normal

$theta1.1
[1] 2.02444840669804 9.07418964326810

$theta2.1
[1] 0.1098054366810460 0.0037007009177143 0.0037007009177143 0.4667908711983257

$pdf2
[1] normal normal

$theta1.2
[1] 9.46067428686192 6.62718684379548

$theta2.2
[1]  1.919741767875493 -0.614775405528496 -0.614775405528496  0.441646965635795

$pdf3
[1] normal normal

$theta1.3
[1] 6.269368697301642 0.324378681442089

$theta2.3
[1] 0.467421885396044 0.384509915585586 0.384509915585586 2.219706755536705

$pdf4
[1] normal normal

$theta1.4
[1] 2.68807669436544 3.83486141239794

$theta2.4
[1]  0.911046716178501 -0.028762019849457 -0.028762019849457  0.450013406051403

$pdf5
[1] normal normal

$theta1.5
[1] 5.71508350884029 9.11708067669072

$theta2.5
[1]  3.14792563371995 -1.20188916943485 -1.20188916943485  1.26318927352261

Slot "summary":
        Dataset Preprocessing Criterion c v/k               IC
1 mvnormdataset     histogram       BIC 5  11 4187.27258636969
               logL  M
1 -2003.52447575772 29


cat("Iteracions: ", a.summary.EM(mvnormest.em.fixed1.5, pos = 1, col.name = "total.iterations.nbr") )

Iteracions:  560

	1	2
	<dbl>	<dbl>
1	2.53656091344232	3.19628986315287
2	4.23254080375765	3.64640591888040
3	3.31962744405870	3.52091452476089
4	2.88418285207739	3.33881754963756
5	3.84251901887412	4.15033449722182
6	1.41400300222414	4.79735719197654

MIXTURES GAUSSIANES I ALGORISME EM¶

Andrea Baena Espejo, 2022-01-24¶

1. Implementació de l'algorisme K-mitjanes¶

Aplicació de l'algorisme K-mitjanes en compressió d'imatges¶

2. Algorisme EM¶

Paquet REBMIX¶

	name	water	protein	fat	lactose	ash
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	Horse	90.1	2.6	1.0	6.9	0.35
2	Orangutan	88.5	1.4	3.5	6.0	0.24
3	Monkey	88.4	2.2	2.7	6.4	0.18
4	Donkey	90.3	1.7	1.4	6.2	0.40
5	Hippo	90.4	0.6	4.5	4.4	0.10
6	Camel	87.7	3.5	3.4	4.8	0.71