#######################################################################

###  Fit a 2-component univariate normal mixture to a given        ####

###  density g(y) using the misspecified population-based EM       ####

###  algorithm.                                                    ####

###                                                                ####

###  Input:                                                        ####

###         l = lower bound on the support of g(y)                 ####

###         u = upper bound on the support of g(y)                 ####

###        g(y) = user defined density function.                   ####

###        err = convergence criterion (default = 1/10^15)         ####

###               err must be a positive number                    ####

###                                                                ####

###                                                                ####

###  Output:                                                       ####

###        * Parameters of a two component normal mixture:         ####

###          pi1, pi2 (prior probabilities with pi2=1-pi1)         ####

###          mu1, mu2, mixture component means                     ####

###          sigma1, sigma2, mixture component standard deviations ####

###        * loglike = Misspecified log-likelihood                 ####

###                  = int(g(y)log(f(y))dy                         ####

###                     where f(y) = mixture density.              ####

###                                                                ####

###                                                                ####

###  NOTES:                                                        ####

###     1. If the density g(y) is similar in shape to a normal     ####

###        density, then the misspecified log-likelihood may       ####

###        converge very slowly.                                   ####

###     2. The default density used is the gamma density.          ####

###     3. R's integrate function may give an error for infinite   ####

###        limits of integration (when l and/or u are infinite.    ####

###        In these cases, the user can specify finite but large   ####

###        values for l and/or u to approximate the integral over  ####

###        the infinite range.                                     ####

###     4. If the algorithm has not converged after maxiter, an    ####

###          message is printed saying the algorithm has not       ####

###          converged.                                            ####

###          Default value is maxiter = 20,0000                    ####

###     5.  This program can be used to approximate any function   ####

###         with a bounded range. One need only vertically shift   ####

###           the function so that it takes only positive values   ####

###           and normalize so that it integrates to one.          ####

###                                                                ####

###                                                                ####

####################################################################### 

#######################################################################  

 

 

err <- 1/10^15    # convergence criterion = squared difference between log-likelihood between successive iterations of the algorithm.

l <- 0            # lower limit on support of the density

u <- 40           # upper limit on support of the density

 

 

## Specify the density g(y)

## The default density is a gamma with shape parameter kappa and

##  scale parameter theta.

kappa <- 10  # shape parameter for gamma distribution.

theta <- 1   # scale parameter for the gamma density.

 

g <- function(y){1/(theta^kappa*gamma(kappa))*y^(kappa-1)*exp(-y/theta)}

 

 

 

# Specify initial values for the mixture component parameters

pi1 <- 0.5

pi2 <- 1-pi1

mu <- integrate(function(y){y*g(y)},l,u)$value

sigma <- sqrt(integrate(function(y){(y-mu)^2*g(y)},l,u)$value)

mu1 <- mu - sigma

mu2 <- mu + sigma

sigma1 <- sigma/2

sigma2 <- sigma/2

 

diff <- 100

llold <- 0  # Store previous log-likelihood value

iter <- 0   # iteration number

maxiter <- 20000 # maximum number of iterations allowed.

 

 

while (diff > err & iter < maxiter)

{

iter=iter+1

 

 

fpi1 <- function(y){g(y)*

           pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))/

          (pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))+

           pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))) }

 

pi1 <- integrate(fpi1, l,u)$value

pi2=1-pi1

 

fmu1 <- function(y) {y*g(y)*

        pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))/

    (pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))+

               pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))) }

 

fmu2 <- function(y) {y*g(y)*

        pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))/

    (pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))+

                 pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))) }

 

 

fsig1 <- function(y) {(y-mu1)^2*g(y)*

           pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))/

    (pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))+

          pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))) }

 

 

fsig2 <- function(y) {(y-mu2)^2*g(y)*

             pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))/

    (pi1/sigma1*exp(-(y-mu1)^2/(2*sigma1^2))+

             pi2/sigma2*exp(-(y-mu2)^2/(2*sigma2^2))) }

 

normmix <- function(y){pi1*dnorm(y, mean=mu1, sd=sigma1)+

               pi2*dnorm(y, mean=mu2, sd=sigma2)}

 

mu1  <-  integrate(fmu1, l, u)$value/pi1

mu2  <-  integrate(fmu2, l, u)$value/pi2

sigma1 <- sqrt(integrate(fsig1, l,u)$value/pi1)

sigma2 <- sqrt(integrate(fsig2, l,u)$value/pi2)

 

# Compute the log-likelihood

loglikeint <- function(y){g(y)*(log(pi1*dnorm(y,mean=mu1, sd=sigma1)

                          +(1-pi1)*dnorm(y,mean=mu2, sd=sigma2)))}

 

mixloglikeint <- function(y){

           normmix(y)*(log(pi1*dnorm(y,mean=mu1, sd=sigma1)

                          +(1-pi1)*dnorm(y,mean=mu2, sd=sigma2)))}

 

 

loglike=integrate(loglikeint,l,u)$value

diff = (loglike-llold)^2

llold = loglike    # Store old log-likelihood value

 

trueloglikeint <- function(y){g(y)*log(g(y))}

trueloglike <- integrate(trueloglikeint, l,u)$value

 

cat(iter, loglike, trueloglike, pi1, mu1, mu2, sigma1, sigma2, diff, "\n")

 

# Plot g(y) and the mixture approximation

lbound <- l

ubound <- u

if (l == -Inf){lbound = mu - 6*sigma}

if (u ==  Inf){ubound = mu + 6*sigma}

y=seq(lbound,ubound, length=200)

 

mmax <- as.matrix(sort(normmix(y)))

mmax <- min(mmax[dim(mmax)[1],1], Inf)

 

pts <- matrix(c(mu1, mu2, 0, 0), 2,2)

 

plot(y, g(y), type="l", ylim=c(-mmax/4,mmax), xlim=c(lbound,ubound),

       lwd=3, main="2-Component Mixture Approximation to g(y)",

       ylab="Density", xlab="")

lines(y, normmix(y), lty=2, col=2, lwd=3)

abline(h=0)

points(pts, cex=3, col=2, pch=19)

legend((lbound+ubound)*.25, -mmax/16, c("g(y)",

                   "2-Component Mixture"),

       col = c(1,2),

       lty = c(1,2),

       lwd=c(3,3), merge = TRUE)#, bg='gray90')

 

}

 

if (iter > maxiter){

  cat("Misspecified EM Algorithm did not converge in ", maxiter, " iterations", "\n")

}

 

cat(" pi1 = ", pi1, "\n",

    "pi2 = ", pi2, "\n",

    "mu1 = ", mu1, "\n",

    "mu2 = ", mu2, "\n",

    "sigma1 = ", sigma1, "\n",

    "sigma2 = ", sigma2, "\n",

    "Misspecified log-likelihood = ", loglike, "\n")