# Parametric Bootstrap Test for Self-Consistency
# of a parametric k-means solution (Tarpey 2006)
#
#  Assuming the input data is from a multivariate normal distribution,
#  the parametric k-means algorithm is implemented to estimate
#  k-principal points.  If the normality assumption is valid,
#  then the principal points from the parametric k-means algorithm
#  should be approximately self-consistent (Tarpey and Flury 1998)
#  for the sample data.  Partition the distribution into k sets according to
#  minimal distance to each of the estimated k principal points.  The
#  k sets in the partition are called the domains of attraction.
#  Self-consistency requires that the conditional means over each
#  domain of attraction is equal to the k principal point for that domain
#  of attraction.
#  
#  The test statistic used for self-consistency is T2,
#  the squared distance between the estimated
#  principal points from the parametric k-means algorithm and the
#  means of their respective domains of attraction.
#  Next, we simulate NSIM data sets with the same sample size
#  as the original data set from a N(xbar, S) distribution
#  where xbar = sample mean and S = sample covariance matrix.
#  The parametric k-means algorithm is applied to each simulated
#  data set and T2 is computed for each simulated data set.
#
#  If the normality assumption is valid, then T2 from the original
#  data should behave approximately like a T2 from the simulated data.
#
#  p-value = Proportion of simulated T2's exceed original T2.
#
#  Input:
#        x = n by p data matrix
#        k = number of principal points to estimate
#       ns = parametric k-means simulation size
#      nsim= number of data sets to simulate
#
#  Output:
#      pps = k by p matrix of estimated principal points from the parametric
#                    k-means algorithm.
#      pval= estimated p-value from parametric bootstrap test.
#
#  References:
#  "A Parametric k-Means Algorithm" by Tarpey (2006), to appear in
#     Computational Statistics.
#  "Self--Consistency: A Fundamental Concept in Statistics" by
#     Tarpey and Flury (1996), Statistical Science, 11, 229-243.
#
#
SCtest <- function(x,k,ns,nsim)
{
n <- dim(x)[1]    # sample size
p <- dim(x)[2]    # dimension of data
xbar <-apply(x,2,mean) # sample mean
S <- cov(x)      # sample covariance matrix
e <- eigen(S)    # eigen decomposition of the covariance matrix
# Simulate N(xbar, S) data with sample size ns and put the data in xsim:
if (p == 1)
{xfake=rnorm(ns, mean=mean(x), sd=sd(x))}
if (p > 1)
{
xfake <- sweep(matrix(rnorm(ns*p), ns, p)%*%diag(sqrt(e$value))%*%t(e$vector), 2, -xbar)
}
pps=kmeans(xfake,k)$centers     # cluster means from parametric k-means
# Compute the means of sample data of each Voronoi region induced by pps:

scpps=kmeans(x,pps, iter.max=1)$centers

# Compute the test statistic T2
T2 = sum((scpps-pps)^2)

T2sim=NULL
for (isim in 1:nsim)
{
if (p == 1)
{xsim=as.matrix(rnorm(n, mean=mean(x), sd=sd(x)))}
if (p > 1)
{
xsim <- sweep(matrix(rnorm(n*p), n, p)%*%diag(sqrt(e$value))%*%t(e$vector), 2, -xbar)
}


# Compute parametric k-means pp estimates for the simulated data set:
xbarsim <-apply(xsim,2,mean) # sample mean
Ssim <- cov(xsim)      # sample covariance matrix
esim <- eigen(Ssim)    # eigen decomposition of the covariance matrix

if (p == 1)
{xfakesim=as.matrix(rnorm(ns, mean=mean(xsim), sd=sd(xsim)))}
if (p > 1)
{
xfakesim <- sweep(matrix(rnorm(ns*p), ns, p)%*%diag(sqrt(esim$value))%*%t(esim$vector), 2, -xbarsim)
}
ppssim=kmeans(xfakesim,pps)$centers     # cluster means from parametric k-means


# Compute the test statistic T2
T2sim = rbind(T2sim,sum((kmeans(xsim,ppssim, iter.max=1)$centers-ppssim)^2))
}
pval = sum(T2sim > T2)*1/nsim


results <- list("principal points" = pps, "pvalue" = pval)
results
}