# R function for estimating the k-principal points (Flury 1990)
# of a multivariate normal distribution using the Parametric
k-means
# algorithm (Tarpey 2006, Computational Statistics).
#
# Input:
# x = n by p data matrix, where n is the sample size and p
# is the dimension of the
data.
# k = Number of principal points to estimate (1 < k < n-1)
# ns = Choose a large sample size for the simulated data (e.g.
ns=500000)
#
# Output:
# results = k by p matrix of estimated principal points
#
# Usage: results=parametrickmeans(x,k,ns)
#
# References:
# "Principal Points" (1990) by B. Flury,
Biometrika, 77, 33-41.
# "A Parametric k-Means Algorithm"
(2006), by T. Tarpey, to appear in Computational Statistics.
#
parametrickmeans <- function(x, k, ns)
{
n <- dim(x)[1] # sample size
p <- dim(x)[2] # dimension of data
xbar <-apply(x,2,mean) # sample mean
S <- cov(x) # sample covariance matrix
e <- eigen(S) # eigen decomposition of the
covariance matrix
# Simulate N(xbar, S) data with sample size ns and put the data in xsim:
xsim <- sweep(matrix(rnorm(ns*p), ns,
p)%*%diag(sqrt(e$value))%*%t(e$vector), 2, -xbar)
kmeans(xsim,k)$centers # Estimated principal points
using the parametric k-means
results <- list("kprincipalpoints" = kmeans(xsim,k)$centers)
results
}