# Parametric Bootstrap Test for Self-Consistency
# of a parametric k-means solution (Tarpey 2006)
#
# Assuming the input data is from a multivariate normal
distribution,
# the parametric k-means algorithm is implemented to estimate
# k-principal points. If the normality assumption is valid,
# then the principal points from the parametric k-means algorithm
# should be approximately self-consistent (Tarpey and Flury 1998)
# for the sample data. Partition the distribution into k
sets according to
# minimal distance to each of the estimated k principal
points. The
# k sets in the partition are called the domains of attraction.
# Self-consistency requires that the conditional means over each
# domain of attraction is equal to the k principal point for that
domain
# of attraction.
#
# The test statistic used for self-consistency is T2,
# the squared distance between the estimated
# principal points from the parametric k-means algorithm and the
# means of their respective domains of attraction.
# Next, we simulate NSIM data sets with the same sample size
# as the original data set from a N(xbar, S) distribution
# where xbar = sample mean and S = sample covariance matrix.
# The parametric k-means algorithm is applied to each simulated
# data set and T2 is computed for each simulated data set.
#
# If the normality assumption is valid, then T2 from the original
# data should behave approximately like a T2 from the simulated
data.
#
# p-value = Proportion of simulated T2's exceed original T2.
#
# Input:
# x = n by p data matrix
# k = number of principal
points to estimate
# ns = parametric k-means
simulation size
# nsim= number of data sets to simulate
#
# Output:
# pps = k by p matrix of estimated
principal points from the parametric
#
k-means algorithm.
# pval= estimated p-value from parametric
bootstrap test.
#
# References:
# "A Parametric k-Means Algorithm" by Tarpey (2006), to appear in
# Computational Statistics.
# "Self--Consistency: A Fundamental Concept in Statistics" by
# Tarpey and Flury (1996), Statistical Science,
11, 229-243.
#
#
SCtest <- function(x,k,ns,nsim)
{
n <- dim(x)[1] # sample size
p <- dim(x)[2] # dimension of data
xbar <-apply(x,2,mean) # sample mean
S <- cov(x) # sample covariance matrix
e <- eigen(S) # eigen decomposition of the
covariance matrix
# Simulate N(xbar, S) data with sample size ns and put the data in xsim:
if (p == 1)
{xfake=rnorm(ns, mean=mean(x), sd=sd(x))}
if (p > 1)
{
xfake <- sweep(matrix(rnorm(ns*p), ns,
p)%*%diag(sqrt(e$value))%*%t(e$vector), 2, -xbar)
}
pps=kmeans(xfake,k)$centers # cluster means
from parametric k-means
# Compute the means of sample data of each Voronoi region induced by
pps:
scpps=kmeans(x,pps, iter.max=1)$centers
# Compute the test statistic T2
T2 = sum((scpps-pps)^2)
T2sim=NULL
for (isim in 1:nsim)
{
if (p == 1)
{xsim=as.matrix(rnorm(n, mean=mean(x), sd=sd(x)))}
if (p > 1)
{
xsim <- sweep(matrix(rnorm(n*p), n,
p)%*%diag(sqrt(e$value))%*%t(e$vector), 2, -xbar)
}
# Compute parametric k-means pp estimates for the simulated data set:
xbarsim <-apply(xsim,2,mean) # sample mean
Ssim <- cov(xsim) # sample covariance
matrix
esim <- eigen(Ssim) # eigen decomposition of the
covariance matrix
if (p == 1)
{xfakesim=as.matrix(rnorm(ns, mean=mean(xsim), sd=sd(xsim)))}
if (p > 1)
{
xfakesim <- sweep(matrix(rnorm(ns*p), ns,
p)%*%diag(sqrt(esim$value))%*%t(esim$vector), 2, -xbarsim)
}
ppssim=kmeans(xfakesim,pps)$centers # cluster
means from parametric k-means
# Compute the test statistic T2
T2sim = rbind(T2sim,sum((kmeans(xsim,ppssim,
iter.max=1)$centers-ppssim)^2))
}
pval = sum(T2sim > T2)*1/nsim
results <- list("principal points" = pps, "pvalue" = pval)
results
}