Demo on SVD, NMF, and SOMs

I will use the digits data and the heart disease data to illustrate the use of SVD, sparse SVD, NMF, and SOMs (self-organizing maps) for data representation and data exploration.

The first PCs do a pretty good job separating 0s and 1s from the rest.
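Here is a minimal sketch of that projection; the names `X` (one flattened image per row) and `y` (digit labels 0-9) are placeholders for however you have loaded the digits.

```r
## Assumes X: matrix of flattened digit images (one per row) and
## y: numeric digit labels 0-9 -- placeholders for your loaded data.
X_c <- scale(X, center = TRUE, scale = FALSE)   # center the pixels
s   <- svd(X_c)
Z   <- s$u %*% diag(s$d)                        # principal components Z = UD
plot(Z[, 1], Z[, 2], col = y + 1, pch = as.character(y),
     xlab = "PC 1", ylab = "PC 2",
     main = "Digits projected onto the first two PCs")
```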

Let's examine how well SVD approximates the digits with few components.
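A sketch of a rank-$k$ reconstruction, reusing `X_c` and `s` from above; the 16x16 raster layout (and hence the image orientation) is an assumption about how the digits are stored.

```r
k    <- 2
Xhat <- s$u[, 1:k] %*% diag(s$d[1:k], k) %*% t(s$v[, 1:k])  # rank-k approximation

## Helper to draw one flattened digit as a 16x16 image
## (the flip/transpose may need adjusting to your raster convention)
show_digit <- function(v) image(t(matrix(v, 16, 16)[16:1, ]),
                                col = grey(seq(0, 1, length.out = 256)),
                                axes = FALSE)
op <- par(mfrow = c(1, 2))
show_digit(X_c[1, ])   # original (centered) digit
show_digit(Xhat[1, ])  # rank-k reconstruction
par(op)
```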

With only two components you can still discern the digits. Try with more components to see how much that improves the reconstruction.

Sparse SVD

We can make SVD/PCA more interpretable by generating sparse loadings - that is, by asking which small subset of features can generate directions that capture the variation in the data. A couple of different variants of sparse SVD have been proposed.

We want to find a sparse SVD. For now, assume we have $X=UDV'$; call the principal components $Z=UD$ and the loadings $V$. Let's look at the $i$th PC, $Z_i = U_i D_{ii}$.

Ridge penalty: $$\min_{\beta} ||Z_i - X\beta||^2 + \lambda ||\beta||^2$$

Solve the ridge problem (using $X' = VDU'$ and $U'U_i = e_i$, the $i$th unit vector): $$\beta_r = (X'X+\lambda I)^{-1} X'Z_i = V(D^2+\lambda I)^{-1}V'\,VDU'\,U_i D_{ii} = V(D^2+\lambda I)^{-1} D\, e_i\, D_{ii} = V_i \frac{D_{ii}^2}{D_{ii}^2+\lambda},$$ which means that $V_i = \beta_r/||\beta_r||$: ridge regression of a PC on $X$ recovers the corresponding loading vector up to scale.
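We can verify the identity numerically on a small simulated matrix:

```r
set.seed(1)
Xsim <- matrix(rnorm(50 * 8), 50, 8)
s2   <- svd(Xsim)
i    <- 1; lambda <- 2
Zi   <- s2$u[, i] * s2$d[i]                     # i-th principal component
br   <- solve(t(Xsim) %*% Xsim + lambda * diag(8),
              t(Xsim) %*% Zi)                   # ridge solution
## beta_r should equal V_i * d_i^2 / (d_i^2 + lambda) ...
all.equal(as.numeric(br), s2$v[, i] * s2$d[i]^2 / (s2$d[i]^2 + lambda))
## ... so normalizing recovers the loading vector
all.equal(as.numeric(br / sqrt(sum(br^2))), s2$v[, i])
```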

Recall the elastic net formulation $$||Y-X\beta||^2 + (1-\alpha)\lambda ||\beta||^2 + \alpha \lambda ||\beta||_1.$$ Adding the L1 penalty to the ridge problem above yields sparse loadings. Of course, in this formulation we already need the SVD to form $Z_i$, so in practice this becomes an iterative method: alternate between solving for the sparse loadings and recomputing the components.

Alternatively, write the whole problem as $$\min_{A,B} \sum_{i=1}^N ||x_i - AB'x_i||^2 \quad \text{subject to } A'A=I,$$ with an elastic-net penalty on the columns of $B$; without the penalty, the columns of $B$ are proportional to the loadings $V$. We can solve this by alternating over $A$ and $B$.

For fixed $A$, solving for $B$ is just a bunch of independent elastic-net problems, one per column of $B$!

For fixed $B$, $A$ solves $$\min_{A:\ A'A=I} ||X - (XB)A'||^2,$$ a reduced-rank Procrustes problem with a closed-form solution: if $X'XB = UDV'$, then $\hat{A} = UV'$.

In R you can use the PMA and nsprcomp packages.
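A sketch of both, on the digit matrix `X` from before; the sparsity parameters (`sumabsv`, `k`) are arbitrary choices you should tune.

```r
library(PMA)       # penalized matrix decomposition (SPC)
library(nsprcomp)  # sparse (and non-negative) PCA

## PMA::SPC: 'sumabsv' is an L1 bound on each loading vector
fit_pma <- SPC(as.matrix(X), sumabsv = 3, K = 2)
fit_pma$v                        # sparse loadings

## nsprcomp: 'k' caps the number of non-zero loadings per component
fit_ns <- nsprcomp(as.matrix(X), ncomp = 2, k = 30)
fit_ns$rotation                  # sparse loadings
```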

Let's use sparse SVD on the full digits data set. This generates sparse loadings (subsets of pixels) onto which you can project the digit images. Of course, the price of sparsity is less explained variance for a fixed number of components, but you gain interpretability.

Let's see how the projection onto the sparse principal components worked for separating the digits.

Let's explore the heart disease data from the textbook.

Self-organizing maps

We can visualize large data sets by looking at the leading principal components. Self-organizing maps (SOMs) are a very different way of looking at data.

We're now going to update the prototypes to better summarize the data; this corresponds to bending the PC plane so that it can be mapped to a rectangular grid.

We can be a bit more clever with the updating, taking neighborhood distance into account.
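A minimal sketch of one online update with a Gaussian neighborhood (not the exact algorithm of any particular package): the best-matching prototype and its grid neighbors are all pulled toward the observation.

```r
## M: prototypes (units x p), g: grid coordinates (units x 2), x: one observation
som_step <- function(M, g, x, alpha = 0.05, sigma = 1) {
  bmu   <- which.min(colSums((t(M) - x)^2))        # best-matching unit
  dist2 <- rowSums((g - matrix(g[bmu, ], nrow(g), 2, byrow = TRUE))^2)
  h     <- exp(-dist2 / (2 * sigma^2))             # neighborhood weights on the grid
  M + alpha * h * (matrix(x, nrow(M), ncol(M), byrow = TRUE) - M)
}
```

In practice you loop over the data many times while shrinking `alpha` and `sigma`, so the map first organizes globally and then fine-tunes locally.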

We can also use supervised techniques, where some variables (dimensions) matter more in the distance calculation, and other distance metrics can be used (more appropriate for categorical data).
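In the kohonen package, `xyf` trains on the features and the class labels jointly (a supervised SOM); this sketch again assumes the placeholder matrix `X` and labels `y`, and the grid size is arbitrary.

```r
library(kohonen)
fit_xyf <- xyf(as.matrix(X), classvec2classmat(factor(y)),
               grid = somgrid(6, 6, "hexagonal"))
```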

PCA can fail miserably for mixed data, since variances and Euclidean distances are not meaningful for arbitrarily coded categorical variables.

Let's try SOM on this data set instead.

Let's try this on the digits data set as well.
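A sketch using the kohonen package with a 25 x 25 grid (625 units, matching the codebook size discussed below); plotting options vary a bit across package versions.

```r
library(kohonen)
set.seed(1)
som_fit <- som(as.matrix(X), grid = somgrid(25, 25, "hexagonal"), rlen = 100)
plot(som_fit, type = "mapping", col = y + 1, pch = ".",
     main = "Digits mapped onto the SOM grid")
```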

The codes are not as easy to interpret in a high-dimensional setting (each code is the raster scan of the representative image in the codebook). Let's look at these as images instead (not all 625, of course).
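Reusing `show_digit` from earlier, we can draw a handful of codebook vectors; the unit indices here are arbitrary picks across the grid.

```r
codes <- som_fit$codes[[1]]          # one prototype (codebook vector) per unit
op <- par(mfrow = c(2, 3))
for (u in c(1, 120, 250, 380, 500, 625)) show_digit(codes[u, ])
par(op)
```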

Now we have found representatives for the classes on the grid.

One can use these for prediction, for example.
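For instance (a sketch; `X_test` is a placeholder for held-out digit images): label each unit by the majority digit among the training points mapped to it, then classify a new point by the label of its best-matching unit.

```r
## Majority training label per unit (units with no training points give NA)
unit_label <- tapply(y, som_fit$unit.classif,
                     function(l) names(which.max(table(l))))
test_units <- map(som_fit, as.matrix(X_test))$unit.classif
pred       <- unit_label[as.character(test_units)]
```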

Non-negative Matrix Factorization

Pros and cons of (sparse) SVD: the loadings and components can take negative values even when the data are non-negative, which makes the components hard to interpret as additive "parts" of the data. This motivates a factorization with non-negativity constraints.


NMF: $$X \approx WH, \quad W\geq 0, \ \ H \geq 0 \ \ \text{(elementwise)},$$ where $W$ is $N \times K$ and $H$ is $K \times p$.
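A minimal sketch of the classical Lee-Seung multiplicative updates for the Frobenius loss (the CRAN package NMF provides tested implementations via `nmf()`); it assumes $X \ge 0$.

```r
nmf_mu <- function(X, K, iters = 200, eps = 1e-9) {
  stopifnot(all(X >= 0))
  W <- matrix(runif(nrow(X) * K), nrow(X), K)
  H <- matrix(runif(K * ncol(X)), K, ncol(X))
  for (it in seq_len(iters)) {     # multiplicative updates keep W, H >= 0
    H <- H * (t(W) %*% X) / (t(W) %*% W %*% H + eps)
    W <- W * (X %*% t(H)) / (W %*% H %*% t(H) + eps)
  }
  list(W = W, H = H)
}
```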

Example from the handwritten digits: Let's say we choose to approximate the data with rank $K$

Both SVD and NMF try to find a linear dimension reduction of the data to summarize the data well. The difference lies in the assumed structure of the dimension reduction.

Let's say we have found a rank-$K$ approximation. We can approximate the $j$th observation by $$ \hat{x}_j = \sum_{l=1}^K W_{jl} H_{l\cdot},$$ a non-negative combination of the $K$ non-negative rows of $H$.
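In code, using the `nmf_mu` sketch from above (`pmax` guards against any negative pixel values; `K = 10` is an arbitrary choice):

```r
fit  <- nmf_mu(pmax(as.matrix(X), 0), K = 10)
j    <- 1
xhat <- fit$W[j, ] %*% fit$H        # = sum_l W[j, l] * H[l, ]
```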

Try different $K$ at home and see which works best for SVD and NMF.

The world is low rank....

Let's try to summarize an image.
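As a stand-in for a photograph, the built-in `volcano` matrix (87 x 61 elevation values) works as a grayscale image; here is a rank-$k$ SVD summary of it.

```r
sv   <- svd(volcano)
k    <- 5
vhat <- sv$u[, 1:k] %*% diag(sv$d[1:k], k) %*% t(sv$v[, 1:k])
op <- par(mfrow = c(1, 2))
image(volcano, col = grey.colors(256), axes = FALSE, main = "original")
image(vhat,    col = grey.colors(256), axes = FALSE, main = paste("rank", k))
par(op)
```

Try a few values of `k` to see how quickly the reconstruction sharpens.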