Let's compare PCA and kPCA on the digits data from the textbook.
library(dplyr) # for data manipulation
library(purrr) # for functional programming (map)
library(ElemStatLearn)
#
iz <- sample(seq_len(7291), 500) # a random subset of 500 digit images
Nmat <- zip.train[iz, -1] # pixel values (column 1 is the digit label)
Nlab <- zip.train[iz, 1]  # digit labels
#
# Center a numeric vector by subtracting its mean (applied per image below).
center <- function(x) {
  x - mean(x)
}
#
# Row-wise centering; apply() returns results as columns, so transpose back.
Nmat <- t(apply(Nmat, 1, center))
Let's verify that we can move between the covariance space $X'X$ and the neighborhood space $XX'$ to obtain low rank representations of the data.
# If X = U D V', then X' = V D U', so the right singular vectors of X'
# are the left singular vectors of X. Multiplying X' U D^-1 gives
# V D U' U D^-1 = V, i.e. the loadings of X recovered from the SVD of X'
# (each component only determined up to sign).
ssp<-svd(Nmat) # SVD on X
ssp2<-svd(t(Nmat)) # SVD on X'
vfr2<-t(Nmat)%*%ssp2$v%*%diag(1/ssp2$d) # getting X'X pcs from XX' pcs
# X'W L^-1/2 - see intro notebook
# The two versions of each loading should fall on a line of slope +/-1;
# cor() picks the sign so the reference line matches the point cloud.
plot(ssp$v[,1],vfr2[,1], xlab="PC1 from XX'", ylab="Loading 1 from X'X")
abline(0,sign(cor(ssp$v[,1],vfr2[,1])))
#
plot(ssp$v[,2],vfr2[,2], xlab="PC2 from XX'", ylab="Loading 2 from X'X")
abline(0,sign(cor(ssp$v[,2],vfr2[,2])))
# So PCA from X'X can be obtained from PCA on XX'
Yes, we can get the loadings from either the covariance between features or the distance between observations (with rescaling and projections).
Let's look at just the 0s and see what PCA picks up.
#install.packages("kernlab")
library(kernlab)
#
zeroes <- zip.train[zip.train[, 1] == 0, -1] # looking at just the zero digits
#
ssn <- svd(zeroes) # SVD on the zero digits
plot(ssn$d, xlab = "components", ylab = "eigenvalues") # screeplot
plot(ssn$u[, 1:2], xlab = "PC1", ylab = "PC2") # scores on the first two PCs
#
# Pick one image from each extreme (low and high end) of PC1 and PC2.
aa <- order(ssn$u[, 1])
bb <- order(ssn$u[, 2])
ii <- c(sample(aa[1:20], 1), sample(bb[1:20], 1),
        sample(rev(aa)[1:20], 1), sample(rev(bb)[1:20], 1))
points(ssn$u[ii, 1:2], col = 2, cex = 1.5, pch = 16)
#
# Display the four selected digits as 16x16 grayscale images.
par(mfrow = c(2, 2))
for (kk in seq_len(4)) {
  image(t(matrix(as.numeric(zeroes[ii[kk], ]), 16, 16, byrow = TRUE))[, 16:1],
        col = gray.colors(33))
  # highlighting the structure that the 1st components identify and separate
}
PC1 and PC2 pick up different aspects of the 0 digits.
Run this a couple of times.
Let's try kernel PCA and see how that separates the data.
par(mfrow = c(1, 1))
# Kernel PCA with a Gaussian (RBF) kernel; sigma is the inverse bandwidth.
ssk <- kpca(zeroes, kernel = "rbfdot", kpar = list(sigma = 0.1),
            features = 50, th = 1e-4) # try with different bandwidths
#
plot(eig(ssk), xlab = "components", ylab = "eigenvalues") # corresponding screeplot
#
plot(pcv(ssk)[, 1:2], xlab = "kPC1", ylab = "kPC2") # first two kPC scores
# Pick one image from each extreme (low and high end) of kPC1 and kPC2.
aa <- order(pcv(ssk)[, 1])
bb <- order(pcv(ssk)[, 2])
ii <- c(sample(aa[1:20], 1), sample(bb[1:20], 1),
        sample(rev(aa)[1:20], 1), sample(rev(bb)[1:20], 1))
points(pcv(ssk)[ii, 1:2], col = 2, cex = 1.5, pch = 16)
#
# Display the four selected digits as 16x16 grayscale images.
par(mfrow = c(2, 2))
for (kk in seq_len(4)) {
  image(t(matrix(as.numeric(zeroes[ii[kk], ]), 16, 16, byrow = TRUE))[, 16:1],
        col = gray.colors(33))
}
Let's now apply to the full digits data set.
par(mfrow = c(1, 1))
iz <- sample(seq_len(7291), 2000) # 2000 digits
Nmat <- zip.train[iz, ]
#
# Linear PCA for comparison; points are drawn as the digit characters,
# shaded by digit value.
ssn <- svd(Nmat[, -1]) # PCA
plot(ssn$u[, 1:2], pch = as.character(Nmat[, 1]), col = gray(Nmat[, 1] / 10),
     xlab = "PC1", ylab = "PC2", main = "PCA")
# Let's try different bandwidths
ssk <- kpca(Nmat[, -1], kernel = "rbfdot", kpar = list(sigma = 0.0005),
            features = 50, th = 1e-4)
plot(pcv(ssk)[, 1:2], pch = as.character(Nmat[, 1]), col = gray(Nmat[, 1] / 10),
     xlab = "kPC1", ylab = "kPC2",
     main = "kPCA, sigma 0.0005, gaussian kernel")
#
ssk <- kpca(Nmat[, -1], kernel = "polydot", kpar = list(degree = 3),
            features = 50, th = 1e-4)
plot(pcv(ssk)[, 1:2], pch = as.character(Nmat[, 1]), col = gray(Nmat[, 1] / 10),
     xlab = "kPC1", ylab = "kPC2",
     main = "kPCA, degree 3 poly kernel")
#
ssk <- kpca(Nmat[, -1], kernel = "polydot", kpar = list(degree = 5),
            features = 50, th = 1e-4)
plot(pcv(ssk)[, 1:2], pch = as.character(Nmat[, 1]), col = gray(Nmat[, 1] / 10),
     xlab = "kPC1", ylab = "kPC2",
     main = "kPCA, degree 5 poly kernel")
#
Notice how your kernel and bandwidth choices change what is captured by the first kPCA components! It really depends on what you decide counts as similar or not — see how the zero or one digits are either "compressed" or "blown up".
You can look at more kPCA components of course.
Try this on some of the other data sets from class and see what happens!