Let's compare PCA and kPCA on the digits data from the textbook.
library(dplyr) # for data manipulation
library(purrr) # for functional programming (map)
library(ElemStatLearn)
#
iz<-sample(seq(1,7291),500) # a subset of digits
Nmat<-zip.train[iz,-1]
Nlab<-zip.train[iz,1]
#
center<-function(x) { x-mean(x) } # center a vector at zero mean
#
Nmat<-t(apply(Nmat,1,center)) # center each image (row); t() restores the n x p orientation
Let's verify that we can move between the covariance space $X'X$ and the neighborhood space $XX'$ to obtain low rank representations of the data.
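To spell out the algebra behind this check (a short derivation added here for reference): if $X = UDV'$ is the SVD, then $X'X = VD^2V'$ and $XX' = UD^2U'$ share the eigenvalues $D^2$, and the loadings $V$ can be recovered from the left singular vectors via $X'UD^{-1} = VDU'UD^{-1} = V$. That is exactly what vfr2 computes below (up to column signs), which is why the scatter plots fall on a line of slope $\pm 1$.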
ssp<-svd(Nmat) # SVD on X
ssp2<-svd(t(Nmat)) # SVD on X'
vfr2<-t(Nmat)%*%ssp2$v%*%diag(1/ssp2$d) # getting X'X pcs from XX' pcs
# X'W L^-1/2 - see intro notebook
plot(ssp$v[,1],vfr2[,1], xlab="PC1 from XX'", ylab="Loading 1 from X'X")
abline(0,sign(cor(ssp$v[,1],vfr2[,1])))
#
plot(ssp$v[,2],vfr2[,2], xlab="PC2 from XX'", ylab="Loading 2 from X'X")
abline(0,sign(cor(ssp$v[,2],vfr2[,2])))
# So PCA from X'X can be obtained from PCA on XX'
Yes, we can get the loadings from either the covariance between features or the inner products between observations (with the appropriate rescaling and projection).
Let's look at just the 0s and see what PCA picks up.
#install.packages("kernlab")
library(kernlab)
#
zeroes<-zip.train[zip.train[,1]==0,-1] # looking at just the zero digits
#
ssn<-svd(zeroes) # SVD on the zero digits
plot(ssn$d, xlab="components", ylab="eigenvalues")
plot(ssn$u[,1:2], xlab="PC1", ylab="PC2")
#
aa <- order(ssn$u[,1])
bb <- order(ssn$u[,2])
ii <- c(sample(aa[1:20],1),sample(bb[1:20],1),sample(rev(aa)[1:20],1),sample(rev(bb)[1:20],1))
points(ssn$u[ii,1:2],col=2,cex=1.5, pch=16)
#
par(mfrow=c(2,2))
for (kk in (1:4)) {
image(t(matrix(as.numeric(zeroes[ii[kk],]),16,16,byrow=T))[,16:1], col = gray.colors(33))
# highlighting the structure that the first components identify and separate
}
PC1 and PC2 pick up different aspects of the 0 digits.
Run this a couple of times.
Let's try kernel PCA and see how that separates the data.
par(mfrow=c(1,1))
ssk<-kpca(zeroes, kernel = "rbfdot", kpar = list(sigma = 0.1),
features = 50, th = 1e-4) # try with different bandwidths
#
plot(eig(ssk), xlab="components", ylab="eigenvalues") # corresponding screeplot
#
plot(pcv(ssk)[,1:2], xlab="kPC1", ylab="kPC2") #
aa <- order(pcv(ssk)[,1])
bb <- order(pcv(ssk)[,2])
ii <- c(sample(aa[1:20],1),sample(bb[1:20],1),sample(rev(aa)[1:20],1),sample(rev(bb)[1:20],1))
points(pcv(ssk)[ii,1:2],col=2,cex=1.5, pch=16)
#
par(mfrow=c(2,2))
for (kk in (1:4)) {
image(t(matrix(as.numeric(zeroes[ii[kk],]),16,16,byrow=T))[,16:1],col=gray.colors(33))
}
Let's now apply this to the full digits data set.
par(mfrow=c(1,1))
iz<-sample(seq(1,7291),2000) # 2000 digits
Nmat<-zip.train[iz,]
#
ssn<-svd(Nmat[,-1]) #PCA
plot(ssn$u[,1:2],pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10), xlab="PC1",ylab="PC2",
main="PCA")
# Let's try different bandwidths
ssk<-kpca(Nmat[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0005),
features = 50, th = 1e-4)
plot(pcv(ssk)[,1:2],pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10), xlab="kPC1",ylab="kPC2",
main="kPCA, sigma 0.0005, gaussian kernel")
#
ssk<-kpca(Nmat[,-1], kernel = "polydot", kpar = list(degree = 3),
features = 50, th = 1e-4)
plot(pcv(ssk)[,1:2],pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10), xlab="kPC1",ylab="kPC2",
main="kPCA, degree 3 poly kernel")
#
ssk<-kpca(Nmat[,-1], kernel = "polydot", kpar = list(degree = 5),
features = 50, th = 1e-4)
plot(pcv(ssk)[,1:2],pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10), xlab="kPC1",ylab="kPC2",
main="kPCA, degree 5 poly kernel")
#
Notice how your kernel and bandwidth choices change what is captured by the first kPCA components! It really depends on what you decide is similar and not - see how the zero or one digits are either "compressed" or "blown up" (a small bandwidth-scanning sketch follows below).
You can look at more kPCA components of course.
Try this on some of the other data sets from class and see what happens!
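If you want to scan bandwidths more systematically, a small sketch like this (my addition, reusing Nmat from above) loops over a few candidate sigma values and plots the first two kernel PCs for each:
sigmas <- c(0.0005, 0.005, 0.05) # candidate bandwidths - adjust to taste
par(mfrow=c(1,length(sigmas)))
for (s in sigmas) {
ssk_s <- kpca(Nmat[,-1], kernel = "rbfdot", kpar = list(sigma = s), features = 2)
plot(pcv(ssk_s)[,1:2], pch=as.character(Nmat[,1]), col=gray(.1+.9*Nmat[,1]/10),
xlab="kPC1", ylab="kPC2", main=paste("kPCA, sigma", s))
}
par(mfrow=c(1,1))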
Self-organizing maps were a way to summarize the data in a low-dimensional space by creating a grid representation in which each grid point is represented by a prototype (centroid), and neighboring grid prototypes are iteratively moved toward each other so that the grid captures some global structure in the data.
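As a reminder of what that looks like in code, here is a minimal sketch (my addition; it assumes the kohonen package is installed, which is not used elsewhere in this notebook):
library(kohonen) # assumed available - fits self-organizing maps
som_fit <- som(Nmat[,-1], grid = somgrid(6, 6, "hexagonal")) # 6x6 grid of prototypes
plot(som_fit, type = "counts") # how many digits land on each grid node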
MDS (multi-dimensional scaling) also tries to represent the data in a lower-dimensional space, but not through an artificial grid. Here we instead try to ensure that the distances in the original space are captured by the distances in the low-dimensional representation.
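One connection worth spelling out (a quick check added here, not in the original notebook): with Euclidean distances, classical MDS reproduces the PCA scores of the column-centered data, because cmdscale double-centers the squared distance matrix and thereby recovers the Gram matrix of the centered data.
Xc <- scale(Nmat[,-1], center = TRUE, scale = FALSE) # column-centered data
pcs <- svd(Xc) # PCA via SVD
mds2 <- cmdscale(dist(Xc), 2) # classical MDS on euclidean distances
cor(pcs$u[,1]*pcs$d[1], mds2[,1]) # first PC score vs first MDS coordinate: close to +/- 1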
# MDS
library(cluster) # a library of clustering methods
library(stats)
#
ddNbr<-daisy(Nmat[,-1]) # daisy (from the cluster package) computes dissimilarities, choosing a metric based on the variable types
# Here our data is numeric, so we could use euclidean distance just as well
ffNbr<-cmdscale(ddNbr,2) # 2 dimensions
plot(ffNbr,pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10),xlab="Dim1",ylab="Dim2",main="MDS - Daisy")
#
ffNbr2<-cmdscale(dist(Nmat[,-1]),2) # 2 dimensions
plot(ffNbr2,pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10),xlab="Dim1",ylab="Dim2",main="MDS - Euclidean")
#
ddNbr3<-as.dist(1-cor(t(Nmat[,-1]))) # using correlation based distance
ffNbr3<-cmdscale(ddNbr3,2) #
plot(ffNbr3,pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10),xlab="Dim1",ylab="Dim2",main="MDS - Correlation")
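Any dissimilarity matrix will do; as one more small example (my addition), Manhattan distances:
ffNbr4<-cmdscale(dist(Nmat[,-1],method="manhattan"),2) # 2 dimensions
plot(ffNbr4,pch=as.character(Nmat[,1]),col=gray(.1+.9*Nmat[,1]/10),xlab="Dim1",ylab="Dim2",main="MDS - Manhattan")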
Notice how we can feed MDS a distance matrix. Why not be a bit more adventurous then?
ISOMAP is based on the idea of turning distance matrices into graphs by only considering the smallest distances, i.e. the nearest neighbors of each observation. We compute the kNN graph and then measure the distance between objects as the path length between them on the graph (a sketch after the runs below makes this construction explicit). Depending on our choice of k we can be very local or more conservative in what we consider "close".
# isomap
library(vegan)
isoNbr<-isomap(dist(Nmat[,-1]),ndim=2,k=10) # use only 10 nearest neighbors to produce the 2dim representation
plot(isoNbr$points,col=gray(.1+.9*Nmat[,1]/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="ISOMAP k=10")
#
isoNbr<-isomap(dist(Nmat[,-1]),ndim=2,k=5) # use only 5 nearest neighbors to produce the 2dim representation
plot(isoNbr$points,col=gray(.1+.9*Nmat[,1]/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="ISOMAP k=5")
#
isoNbr<-isomap(dist(Nmat[,-1]),ndim=2,k=50) # use 50 nearest neighbors to produce the 2dim representation
plot(isoNbr$points,col=gray(.1+.9*Nmat[,1]/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="ISOMAP k=50")
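To make that construction explicit, here is a sketch (my addition) that reproduces ISOMAP by hand: vegan's isomapdist() computes the shortest-path distances on the kNN graph, and classical MDS is then run on those geodesic distances.
gd<-isomapdist(dist(Nmat[,-1]),k=10) # shortest-path distances on the 10-NN graph
isoManual<-cmdscale(gd,2) # classical MDS on the geodesic distances
plot(isoManual,col=gray(.1+.9*Nmat[,1]/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="MDS on geodesic distances (ISOMAP by hand, k=10)")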
###
# Try different number of dimensions here as well as neighbors - cluster in the low-dimensional space
isoNbr<-isomap(dist(Nmat[,-1]),ndim=2,k=5) # use only 5 nearest neighbors to produce the 2dim representation
plot(isoNbr$points,col=gray(.1+.9*Nmat[,1]/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="ISOMAP k=5")
kiso <- kmeans(isoNbr$points,10) # You can use a different clustering method of course. Try it.
plot(isoNbr$points,col=gray(.1+.9*(kiso$cluster-1)/10),pch=as.character(Nmat[,1]),xlab="Dim 1",ylab="Dim 2",main="ISOMAP k=5")
tSNE is a very popular method where local distances are also key to representing the data.
### tSNE
library(tsne)
ecb = function(x,y){ plot(x,t='n'); text(x,labels=Nmat[,1], col=gray(.1+.9*Nmat[,1]/10)) }
tsne_N = tsne(Nmat[,-1], epoch_callback = ecb, perplexity=25, max_iter=200, epoch = 50)
# plot trace of method every 50 iterations
#
# takes a while to run if you run many iterations
# perplexity - bandwidth choice
par(mfrow=c(1,1)) # final output
plot(tsne_N,col="white",xlab="Dim1",ylab="Dim2",main="tSNE perplexity 25")
text(tsne_N,labels=Nmat[,1],col=gray(.1+.9*Nmat[,1]/10))
tsne_N = tsne(Nmat[,-1], epoch_callback = ecb, perplexity=5, max_iter=200, epoch=50) # smaller perplexity - more local
par(mfrow=c(1,1))
plot(tsne_N,col="white",xlab="Dim1",ylab="Dim2",main="tSNE perplexity 5")
text(tsne_N,labels=Nmat[,1],col=gray(.1+.9*Nmat[,1]/10))
ktsne <- kmeans(tsne_N, 10)
par(mfrow=c(1,1))
plot(tsne_N,col="white",xlab="Dim1",ylab="Dim2",main="tSNE perplexity 5")
text(tsne_N,labels=Nmat[,1],col=gray(.1+.9*(ktsne$cluster-1)/10))
# kmeans in low-dimensional space
table(ktsne$cluster, Nmat[,1])
       0   1   2   3   4   5   6   7   8   9
  1    0 176   1   2   9   1  11   0   0   0
  2   14   0 137   0   3   1   1  45  12   0
  3    0   0   1 169   0  14   0   0   6   5
  4    0  47   7   3   7   0   3   0 124   0
  5    2   0   0  19   0 127   1   0   8   0
  6  150   0   1   0   0   6   2   0   7   0
  7    0  24   1   0   9   1 158   2   0   1
  8    0   0  11   1  14   3   0  88   1 151
  9  148   0  41   0   0   1   0  16   2   1
 10    0  45  10   1 120   2   3  11   0  12
From the lecture you saw that you could use a spectral decomposition of the similarity graphs to cluster data. Let's try this on the digits data and use the corresponding kPCA for visualization.
ssk<-kpca(Nmat[,-1], kernel = "rbfdot", kpar = list(sigma = 0.01),
features = 50, th = 1e-4)
specNat<-specc(Nmat[,-1],centers=10, kernel="rbfdot", kpar=list(sigma=0.01))
plot(pcv(ssk)[,1:2],pch=as.character(Nmat[,1]),col=gray(.1+.9*(specNat-1)/10), xlab="kPC1",ylab="kPC2",
main="kPCA, sigma 0.01, gaussian kernel")
plot(pcv(ssk)[,3:4],pch=as.character(Nmat[,1]),col=gray(.1+.9*(specNat-1)/10), xlab="kPC3",ylab="kPC4",
main="kPCA, sigma 0.01, gaussian kernel")
plot(pcv(ssk)[,5:6],pch=as.character(Nmat[,1]),col=gray(.1+.9*(specNat-1)/10), xlab="kPC5",ylab="kPC6",
main="kPCA, sigma 0.01, gaussian kernel")
plot(pcv(ssk)[,7:8],pch=as.character(Nmat[,1]),col=gray(.1+.9*(specNat-1)/10), xlab="kPC7",ylab="kPC8",
main="kPCA, sigma 0.01, gaussian kernel")
plot(pcv(ssk)[,9:10],pch=as.character(Nmat[,1]),col=gray(.1+.9*(specNat-1)/10), xlab="kPC9",ylab="kPC10",
main="kPCA, sigma 0.01, gaussian kernel")
# Note the spectral clustering is done in 10 dimensions, not 2, so the results may look surprising.
table(specNat,Nmat[,1])
specNat    0   1   2   3   4   5   6   7   8   9
      1    0   0   2   4   2   0   0 122   1  21
      2  128   0   0   0   0   3   3   0   1   0
      3   10   0 183   8   8   5  32   0   5   0
      4    4   0   8 161   0  79   0   0  13   0
      5   20   0   0   1   0  35 136   0   3   0
      6  145   0   2   0   0   0   2   0   2   0
      7    3   0   6   0  87  25   2  10  16  44
      8    3  42   6  17   3   9   2   3  88   6
      9    1   0   2   4  57   0   2  27  31  99
     10    0 250   1   0   5   0   0   0   0   0
ssk<-kpca(Nmat[,-1], kernel = "rbfdot", kpar = list(sigma = 0.005),
features = 50, th = 1e-4)
specNat<-specc(Nmat[,-1],centers=10, kernel="rbfdot", kpar=list(sigma=0.005))
table(specNat,Nmat[,1])
plot(pcv(ssk)[,1:2],pch=as.character(Nmat[,1]), col=gray(.1+.9*(specNat-1)/10),xlab="kPC1",ylab="kPC2",
main="kPCA, sigma 0.005, gaussian kernel")
plot(pcv(ssk)[,3:4],pch=as.character(Nmat[,1]), col=gray(.1+.9*(specNat-1)/10),xlab="kPC3",ylab="kPC4",
main="kPCA, sigma 0.005, gaussian kernel")
plot(pcv(ssk)[,5:6],pch=as.character(Nmat[,1]), col=gray(.1+.9*(specNat-1)/10),xlab="kPC5",ylab="kPC6",
main="kPCA, sigma 0.005, gaussian kernel")
plot(pcv(ssk)[,7:8],pch=as.character(Nmat[,1]), col=gray(.1+.9*(specNat-1)/10),xlab="kPC7",ylab="kPC8",
main="kPCA, sigma 0.005, gaussian kernel")
# try a different kernel
specNat    0   1   2   3   4   5   6   7   8   9
      1    0   0   1   2  47   0   0  31  28 104
      2    3   0   9 158   0  64   0   0  11   0
      3  133   0   1   4   0   7   7   0   1   0
      4  153   0   2   0   1   2   5   0   4   0
      5    7   0 180   6   9   3  26   1   5   0
      6    4  22   5  19   2  10   2   3  85   8
      7    0   0   5   3   1   1   0 119   1  15
      8   14   0   1   3   0  47 136   0   3   0
      9    0 270   1   0   5   0   0   0   0   0
     10    0   0   5   0  97  22   3   8  22  43
An alternative implementation in R can be found here: https://rpubs.com/nurakawa/spectral-clustering?msclkid=6ebd0ffccee211ecbc0e25ee7ddf2f70 The main function is pasted in the next cell.
You can easily change this function to work with other distance metrics by editing the mutual_knn_graph() helper (the kNN-graph construction); a small sketch of such a swap follows the function.
spectral_clustering <- function(X, # matrix of data points
nn = 10, # the k nearest neighbors to consider
n_eig = 2) # m, the number of eigenvectors to keep
{
mutual_knn_graph <- function(X, nn = 10)
{
D <- as.matrix( dist(X) ) # matrix of euclidean distances between data points in X
# initialize the knn matrix
knn_mat <- matrix(0,
nrow = nrow(X),
ncol = nrow(X))
# find the nn nearest neighbors for each point
for (i in 1: nrow(X)) {
neighbor_index <- order(D[i,])[2:(nn + 1)]
knn_mat[i,][neighbor_index] <- 1
}
# connect i and j if either lists the other among its nn nearest neighbors
knn_mat <- knn_mat + t(knn_mat) # symmetrize the kNN graph
knn_mat[ knn_mat == 2 ] = 1
return(knn_mat)
}
graph_laplacian <- function(W, normalized = TRUE)
{
stopifnot(nrow(W) == ncol(W))
g = colSums(W) # degrees of vertices
n = nrow(W)
if(normalized)
{
D_half = diag(1 / sqrt(g) )
return( diag(n) - D_half %*% W %*% D_half )
}
else
{
return( diag(g) - W )
}
}
W = mutual_knn_graph(X) # 1. matrix of similarities
L = graph_laplacian(W) # 2. compute graph laplacian
ei = eigen(L, symmetric = TRUE) # 3. Compute the eigenvectors and values of L
n = nrow(L)
return(ei$vectors[,(n - n_eig):(n - 1)]) # return the eigenvectors for the n_eig smallest eigenvalues (skipping the trivial smallest one)
}
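As a sketch of the distance swap mentioned above (my own variant, not from the rpubs post): the only line that fixes the metric is the dist() call inside mutual_knn_graph(), so a correlation-based version of that helper could look like this, with everything downstream (Laplacian, eigenvectors) unchanged.
correlation_knn_graph <- function(X, nn = 10)
{
D <- as.matrix( as.dist(1 - cor(t(X))) ) # correlation-based dissimilarity instead of euclidean
knn_mat <- matrix(0, nrow = nrow(X), ncol = nrow(X))
for (i in 1:nrow(X)) {
neighbor_index <- order(D[i,])[2:(nn + 1)]
knn_mat[i,][neighbor_index] <- 1
}
knn_mat <- knn_mat + t(knn_mat) # symmetrize the kNN graph
knn_mat[ knn_mat == 2 ] = 1
return(knn_mat)
}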
# do spectral clustering procedure
X_sc <- spectral_clustering(Nmat[,-1], 20, 10) # 20 neighbors, 10 eigenvectors
X_sc_kmeans <- kmeans(X_sc, 10)
table(X_sc_kmeans$cluster,Nmat[,1])
plot(X_sc[,1:2], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC1",ylab="sPC2",
main="spec Clust, 20 neighbors")
plot(X_sc[,3:4], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC3",ylab="sPC4",
main="spec Clust, 20 neighbors")
plot(X_sc[,5:6], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC5",ylab="sPC6",
main="spec Clust, 20 neighbors")
       0   1   2   3   4   5   6   7   8   9
  1    4   0  11 185   0 147   1 134  18   1
  2  167   0   4   0   0   1   2   0   3   0
  3    0  90   1   0   4   0   2   2   0   0
  4    0   0   2   0 150   1   4  26   4 167
  5    0   0   7   7   1   1   1   0 126   1
  6    0 116   0   0   1   0   2   0   7   1
  7    0  86   1   1   4   0   0   0   0   0
  8  140   0   2   0   0   2   5   0   1   0
  9    3   0   1   0   0   4 161   0   1   0
 10    0   0 181   2   2   0   1   0   0   0
# do spectral clustering procedure
X_sc <- spectral_clustering(Nmat[,-1], 50, 20) # 50 neighbors, 20 eigenvectors
# run kmeans on the 20 eigenvectors
X_sc_kmeans <- kmeans(X_sc, 10)
table(X_sc_kmeans$cluster,Nmat[,1])
plot(X_sc[,1:2], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC1",ylab="sPC2",
main="spec Clust, 50 neighbors")
plot(X_sc[,3:4], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC3",ylab="sPC4",
main="spec Clust, 50 neighbors")
plot(X_sc[,5:6], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC5",ylab="sPC6",
main="spec Clust, 50 neighbors")
       0   1   2   3   4   5   6   7   8   9
  1  118   0   2   0   0   1   3   0   2   0
  2    0   0   5   0  59   2   0 112   2 108
  3    1   0   0   0   0   4 160   0   1   0
  4    0   0   4 176   0   8   0   0  15   0
  5    0 101   1   0   6   0   0   1   0   0
  6  195   0  10   5   1   5   6   0 120   0
  7    0   1   3   1  92   1   5  48  13  62
  8    0   0   0  10   0 135   0   0   0   0
  9    0 123 185   3   4   0   3   1   0   0
 10    0  67   0   0   0   0   2   0   7   0
# do spectral clustering procedure
X_sc <- spectral_clustering(Nmat[,-1], 10, 10) # 10 neighbors, 10 eigenvectors
X_sc_kmeans <- kmeans(X_sc, 10)
table(X_sc_kmeans$cluster,Nmat[,1])
plot(X_sc[,1:2], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC1",ylab="sPC2",
main="spec Clust, 10 neighbors")
plot(X_sc[,3:4], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC3",ylab="sPC4",
main="spec Clust, 10 neighbors")
plot(X_sc[,5:6], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC5",ylab="sPC6",
main="spec Clust, 10 neighbors")
plot(X_sc[,7:8], pch=as.character(Nmat[,1]), col=gray(.1+.9*(X_sc_kmeans$cluster-1)/10), xlab="sPC7",ylab="sPC8",
main="spec Clust, 10 neighbors")
       0   1   2   3   4   5   6   7   8   9
  1    0   0   0   0 105   0   0   3   0  67
  2    0 172   2   0   5   0   7   2   8   0
  3    2   0   8   2  46   5   7  37  11 102
  4    0 120   1   1   4   0   0   0   0   0
  5    0   0   5 184   0 141   0   0  16   0
  6    0   0   1   0   0   0   0 120   0   1
  7  309   0   8   0   0   4   6   0   4   0
  8    3   0   0   0   0   5 158   0   1   0
  9    0   0 179   2   1   0   1   0   0   0
 10    0   0   6   6   1   1   0   0 120   0
Now, the methods we have talked about here are mainly for data exploration. It is sometimes not so straightforward to apply them to new data points (the out-of-sample problem). For MDS there are formulas for projecting new data onto the lower-dimensional space, and similar approaches have been derived for ISOMAP. The key is that you don't want the test data to require a re-computation of the neighborhood graph for the training data. Here is a well-cited paper that discusses some of these out-of-sample predictions: https://www.iro.umontreal.ca/~lisa/pointeurs/tr1238.pdf?msclkid=fd6aa8d2ced611ec9cc4e01568c6c722 In R there is the following setup for MDS/ISOMAP: https://recipes.tidymodels.org/reference/step_isomap.html In Python, ISOMAP predictions are available through the transform() method of sklearn.manifold.Isomap: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html?msclkid=e214b7edcedc11ec86522141b973c11d
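Here is a hedged sketch of the tidymodels route mentioned above (my addition; it assumes the recipes package plus its dimRed/RSpectra dependencies are installed). A recipe trained on part of the digits can then project held-out digits without recomputing the training neighborhood graph.
library(recipes) # assumed available
train_df <- as.data.frame(Nmat[1:1500,-1])
test_df <- as.data.frame(Nmat[1501:2000,-1])
iso_rec <- recipe(~ ., data = train_df) %>%
step_isomap(all_predictors(), num_terms = 2, neighbors = 10) %>%
prep()
head(bake(iso_rec, new_data = test_df)) # out-of-sample ISOMAP coordinates for the held-out digits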
For tSNE this is even more complicated. The author of the original tSNE method discusses some options here: https://lvdmaaten.github.io/tsne/ There is ongoing research toward making tSNE bridge local and global structure - check Google Scholar, there is lots of activity!