I will use the digits data and the heart disease data to illustrate the use of SVD, sparse SVD, NMF and SOMs for data representation and data exploration.
# SVD of the zip-code digits data
library(ElemStatLearn)
library(nsprcomp)
data(zip.train)
set.seed(1000)
Numbers<-zip.train[sample(seq(1,7291),2000),] # a subset of the digits
ssn<-svd(Numbers[,-1])
plot(ssn$d,type="l",xlab="Number of components",ylab="Singular values")
# most of the variance is explained by the first 50 components (of 256)
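# A quick check of that claim (the squared singular values are proportional to the variances):
pvar<-cumsum(ssn$d^2)/sum(ssn$d^2)
print(pvar[50]) # fraction of variance captured by the first 50 components
print(min(which(pvar>0.9))) # smallest number of components reaching 90%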
plot(ssn$u[,1:2],xlab="PC1",ylab="PC2",pch=16) # Plotting PC1 vs PC2
#pp<-identify(ssn$u[,1:2]) # use this to mark some of the observations when running R interactively
aa<-rev(sort.list(ssn$u[,1]))
bb<-rev(sort.list(ssn$u[,2]))
au<-c(sample(aa[1:10],2),sample(aa[1991:2000],1))
bu<-c(sample(bb[1:10],2),sample(bb[1991:2000],1)) # Here I just pick some digits at the extremes of PC1 and PC2 (with 2000 rows, the tails are 1:10 and 1991:2000)
points(ssn$u[au,1:2],col=2,pch=16,cex=1.5) # and highlight them in the scatter plot
points(ssn$u[bu,1:2],col=3,pch=16,cex=1.5)
#
par(mfrow=c(3,3)) # Plotting the corresponding digits
for (bb in (1:length(au))) {
image(t(matrix(as.numeric(Numbers[au[bb],-1]),16,16,byrow=T))[,16:1], col = gray.colors(33)) }
for (bb in (1:length(bu))) {
image(t(matrix(as.numeric(Numbers[bu[bb],-1]),16,16,byrow=T))[,16:1], col = gray.colors(33)) }
par(mfrow=c(3,3)) # The digits can look quite different - run this a couple of times to see.
for (zz in (1:9)) {
iz<-sample(seq(1,dim(Numbers)[1])[Numbers[,1]==zz],1)
image(t(matrix(as.numeric(Numbers[iz,-1]),16,16,byrow=T))[,16:1], col = gray.colors(33))
}
plot(ssn$u[,1:2],xlab="PC1",ylab="PC2",pch=as.character(Numbers[,1]),col=Numbers[,1]+1) # Scatter plot with the labels
The first PCs do a pretty good job separating 0s and 1s from the rest.
Let's examine how well SVD approximates the digits with few components.
#######################
# Approximation of one image using svd - 2 components
nu<-2 # try more components to see how reconstruction error decreases.
Nmat<-as.matrix(Numbers[,-1])
iz<-sample(seq(1,dim(Numbers)[1]),4)
par(mfrow=c(1,2))
for (bb in (1:4)) {
ss<-svd(t(matrix(Nmat[iz[bb],],16,16,byrow=T))[,16:1])
image(ss$u[,1:nu]%*%diag(ss$d[1:nu])%*%t(ss$v[,1:nu]),col=gray.colors(33))
image(t(matrix(Nmat[iz[bb],],16,16,byrow=T))[,16:1],col=gray.colors(33)) }
With only two components you can still discern the digits. Try with more components to see how much that improves the reconstruction.
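To make "how much" concrete, here is a quick sketch tracking the relative reconstruction error of one of the sampled digits as the rank grows:
Im<-t(matrix(Nmat[iz[1],],16,16,byrow=T))[,16:1] # the first of the sampled digits
ss<-svd(Im)
for (nu in c(1,2,4,8,16)) {
  ImA<-ss$u[,1:nu,drop=F]%*%diag(ss$d[1:nu],nu)%*%t(ss$v[,1:nu,drop=F]) # rank-nu approximation
  print(c(nu,round(sum((Im-ImA)^2)/sum(Im^2),3))) } # relative error drops quickly with rank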
We can make SVD/PCA more interpretable by generating sparse loadings - that is, by asking which small subsets of features generate directions that capture the variation in the data. A couple of different variants of sparse SVD have been proposed.
We want to find a sparse SVD. Assume for now that we have $X=UDV'$, call the principal components $Z=UD$, and keep the loadings in $V$. Consider the $i$th PC, $Z_i = U_i D_{ii}$.
Ridge-penalty $$\min_{\beta} ||Z_i - X\beta||^2 + \lambda ||\beta||^2$$
Solve the ridge problem: $$\beta_r = (X'X+\lambda I)^{-1} X'U_i D_{ii} = V(D^2+ \lambda I)^{-1}V'VDU'U_iD_{ii} = V(D^2+\lambda I)^{-1}DU'U_iD_{ii} = V_i \frac{D_{ii}^2}{D_{ii}^2+\lambda},$$ which means that $V_i = \beta_r/||\beta_r||$: ridge regression of the $i$th PC on $X$ recovers the loading vector up to a scale factor.
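A quick numerical sanity check of this identity on the digits matrix (a sketch; any $\lambda$ works):
Xc<-scale(Nmat,center=T,scale=F) # centered data, with SVD Xc = U D V'
sv<-svd(Xc)
lambda<-1
beta_r<-solve(t(Xc)%*%Xc+lambda*diag(ncol(Xc)),t(Xc)%*%(sv$u[,1]*sv$d[1])) # ridge fit to PC 1
print(max(abs(beta_r/sqrt(sum(beta_r^2))-sv$v[,1]))) # ~0: the normalized ridge solution recovers V_1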
Recall the elastic net formulation $$||Y-X\beta||^2 + (1-\alpha)\lambda ||\beta||^2 + \alpha \lambda ||\beta||_1.$$ Now we add the L1 penalty to the ridge problem above to get sparse loadings. Of course, this formulation requires the SVD to begin with, so in practice the method is iterative.
Alternatively, write the whole problem as $$\min_{A,B} \sum_{i=1}^N ||x_i - AB' x_i||^2, \quad A'A=I,$$ with an elastic-net penalty on $B$; at the optimum, $B \propto V$.
For fixed $A$, this is just a set of independent elastic-net problems!
For $A$ given $B$: $$\min_A ||X - (XB)A'||^2 \quad \text{subject to } A'A=I,$$ a Procrustes problem whose solution is $A=UV'$ from the SVD of $X'XB$.
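Putting the two steps together gives the alternating algorithm below - a minimal sketch only, using glmnet for the elastic-net step (the helper name sparse_pca and all tuning values are my own choices, not a packaged routine):
library(glmnet)
sparse_pca<-function(X,K=2,lambda=0.01,alpha=0.5,niter=20) {
  A<-svd(X)$v[,1:K,drop=F] # initialize with the ordinary loadings
  B<-A
  for (it in 1:niter) {
    for (k in 1:K) { # B given A: K independent elastic-net regressions
      fit<-glmnet(X,as.numeric(X%*%A[,k]),alpha=alpha,lambda=lambda,standardize=F,intercept=F)
      B[,k]<-as.numeric(coef(fit))[-1] } # drop the (zero) intercept
    sv<-svd(t(X)%*%(X%*%B)) # A given B: Procrustes step, A = UV'
    A<-sv$u%*%t(sv$v) }
  apply(B,2,function(b) if (sum(b^2)>0) b/sqrt(sum(b^2)) else b) } # normalized sparse loadings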
In R you can use the PMA and nsprcomp packages.
#### For good reconstruction, use either many sparse components or a few dense ones -
#### the performance is approximately the same.
library(nsprcomp)
for (bb in (1:4)) {
Im<-t(matrix(Nmat[iz[bb],],16,16,byrow=T))[,16:1] # one object
ss<-nsprcomp(Im,k=c(rep(10,4)),ncomp=4,tol=.1,scale=F,center=F) #10 loadings in each of 4 components - try different numbers here
if (bb==1) {print(ss) } # Example of sparse rotation
#
UU<-Im%*%ss$rotation # scores on the sparse components
ImA<-UU%*%t(ss$rotation) # Reconstruction (scores times transposed loadings)
par(mfrow=c(1,2)) # Plot reconstruction and original for comparison
image(ImA,col=gray.colors(33))
image(Im,col=gray.colors(33)) }
Standard deviations (1, .., p=4):
[1] 1.2963833 1.2017984 0.9762602 0.7887812

Rotation (n x k) = (16 x 4):
             PC1         PC2        PC3         PC4
 [1,] 0.3223290  0.00000000  0.2741850  0.06229128
 [2,] 0.3169807  0.00000000  0.3017923  0.00000000
 [3,] 0.3259026  0.00000000  0.2554992  0.00000000
 [4,] 0.3344939 -0.03177669  0.0000000  0.00000000
 [5,] 0.3356788 -0.03356832  0.0000000  0.00000000
 [6,] 0.3327006  0.00000000  0.0000000  0.00000000
 [7,] 0.3204183  0.00000000 -0.2288247  0.08046779
 [8,] 0.3054834  0.04889731 -0.3355132  0.10801224
 [9,] 0.2648411  0.09305093  0.0000000 -0.10301058
[10,] 0.0000000  0.32167492  0.0000000 -0.74018759
[11,] 0.0000000  0.40906380 -0.4639541  0.00000000
[12,] 0.0000000  0.45380684 -0.2309139  0.24901640
[13,] 0.0000000  0.43856442  0.0000000  0.36879362
[14,] 0.0000000  0.41979663  0.3735810  0.21550655
[15,] 0.0000000  0.37616144  0.3329660 -0.27426867
[16,] 0.2966984  0.00000000 -0.2907414 -0.31591923
for (bb in (1:4)) {
Im<-t(matrix(Nmat[iz[bb],],16,16,byrow=T))[,16:1]
ss<-nsprcomp(Im,k=c(rep(5,8)),ncomp=8,tol=.1,scale=F,center=F) # only 5 loadings in each of 8 components
UU<-Im%*%ss$rotation
ImA<-UU%*%t(ss$rotation)
par(mfrow=c(1,2))
image(ImA,col=gray.colors(33))
image(Im,col=gray.colors(33)) }
Let's use sparse SVD on the full digits data set. This will generate sparse loadings (subsets of pixels) that you can project the digit images on. Of course, the price of sparsity is less explained variance for a fixed number of components, but you gain interpretability.
pp<-prcomp(Nmat) # Ordinary PCA (SVD of the centered data)
Nbr.svd.sparse<-nsprcomp(Nmat,k=rep(25,10),ncomp=10) # Sparse SVD: 10 components with 25 nonzero pixels each. This step takes quite some time to run
image(t(matrix(Nbr.svd.sparse$rotation[,1],16,16,byrow=T))[,16:1],col=gray.colors(33)) # First 3 loadings
image(t(matrix(Nbr.svd.sparse$rotation[,2],16,16,byrow=T))[,16:1],col=gray.colors(33))
image(t(matrix(Nbr.svd.sparse$rotation[,3],16,16,byrow=T))[,16:1],col=gray.colors(33))
plot(cumsum(pp$sdev^2)/sum(pp$sdev^2),ylim=c(0,1),xlim=c(1,10),type="l",xlab="components",ylab="% variance explained")
lines(cumsum(Nbr.svd.sparse$sdev^2)/sum(pp$sdev^2),col=2) # explained variance from the sparse components
Let's see how the projection onto the sparse principal components worked for separating the digits.
UU<-as.matrix(Nmat)%*%Nbr.svd.sparse$rotation # Projection
plot(UU[,1:2],col=as.numeric(Numbers[,1])+1,pch=as.character(Numbers[,1])) # +1 so the 0s get a visible color
plot(UU[,3:4],col=as.numeric(Numbers[,1])+1,pch=as.character(Numbers[,1]))
plot(UU[,5:6],col=as.numeric(Numbers[,1])+1,pch=as.character(Numbers[,1]))
plot(UU[,7:8],col=as.numeric(Numbers[,1])+1,pch=as.character(Numbers[,1]))
Let's explore the heart disease data from the text book.
SAdata<-SAheart
SAdata$famhist<-as.numeric(SAdata$famhist)-1 # turn family history into a 0/1 variable (Absent=0, Present=1)
#
pp<-prcomp(SAdata,scale=T) # PCA
plot(cumsum(pp$sdev^2)/sum(pp$sdev^2),ylim=c(0,1)) # Explained variance (squared sdevs)
sp<-nsprcomp(SAdata,k=rep(10,10),scale=T) # Sparse PCA with varying degrees of sparsity; no constraint yet
lines(cumsum(sp$sdev^2)/sum(sp$sdev^2))
sp<-nsprcomp(SAdata,k=c(3,rep(2,9)),scale=T) # at most 3 resp. 2 nonzero loadings
lines(cumsum(sp$sdev^2)/sum(sp$sdev^2),col=2)
sp<-nsprcomp(SAdata,k=rep(2,10),scale=T) # at most 2 nonzero loadings per component
lines(cumsum(sp$sdev^2)/sum(sp$sdev^2),col=3) # some loss of information due to sparsity
####
print(names(SAdata)) # Are the sparse loadings interpretable?
plot(sp$rotation[,1],xlab="features",ylab="sparse PC1") # PC1 contains adiposity, obesity
plot(sp$rotation[,2],xlab="features",ylab="sparse PC2") # PC2 contains tobacco, age
plot(pp$rotation[,1],xlab="features",ylab="original PC1") # Much less clear interpretation with non-sparse loadings
#
pp$rotation[,1:4] # Original loadings
sp$rotation[,1:4] # Sparse loadings
# Try different levels of sparsity.
# Also try this with standardized data
 [1] "sbp"       "tobacco"   "ldl"       "adiposity" "famhist"   "typea"
 [7] "obesity"   "alcohol"   "age"       "chd"
| | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| sbp | -0.306353655 | 0.07962060 | -0.30456703 | 0.101146784 |
| tobacco | -0.303266917 | 0.42468435 | -0.15581440 | 0.009733644 |
| ldl | -0.324056204 | -0.27411219 | 0.24864969 | -0.164196944 |
| adiposity | -0.481139710 | -0.30711103 | -0.09344212 | 0.099940021 |
| famhist | -0.207459194 | 0.22554993 | 0.41109922 | -0.247615625 |
| typea | 0.001496365 | 0.02182423 | 0.68327241 | 0.520782842 |
| obesity | -0.361036881 | -0.49744060 | 0.01471631 | 0.306996176 |
| alcohol | -0.115336964 | 0.44846380 | -0.16870103 | 0.647993822 |
| age | -0.448790047 | 0.12855346 | -0.15660315 | -0.180947607 |
| chd | -0.299273226 | 0.35694131 | 0.35119479 | -0.270763719 |
| | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| sbp | 0.0000000 | 0.0000000 | 0.0000000 | -0.6581029 |
| tobacco | 0.0000000 | -0.7120304 | 0.0000000 | 0.0000000 |
| ldl | 0.0000000 | 0.0000000 | -0.6876708 | 0.0000000 |
| adiposity | -0.7081240 | 0.0000000 | 0.0000000 | 0.0000000 |
| famhist | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 |
| typea | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 |
| obesity | -0.7060881 | 0.0000000 | 0.0000000 | 0.0000000 |
| alcohol | 0.0000000 | 0.0000000 | 0.0000000 | -0.7529280 |
| age | 0.0000000 | -0.7021486 | 0.0000000 | 0.0000000 |
| chd | 0.0000000 | 0.0000000 | -0.7260226 | 0.0000000 |
We can visualize large data sets by looking at the leading principal components. SOMs - self-organizing maps - are a very different way of looking at data.
The prototypes are updated iteratively to summarize the data better, which corresponds to bending the PC plane so that it can be mapped onto a rectangular grid.
We can be more clever with the updating by taking the neighborhood distance on the grid into account.
We can also use supervised variants, where some variables (dimensions) matter more in the distance calculation, and other distance metrics (more appropriate for categorical data) can be used; a sketch of the update rule follows below.
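A minimal sketch of the online SOM update rule, with a Gaussian neighborhood on a rectangular grid (the helper som_sketch is hypothetical, just to expose what the kohonen package implements far more carefully):
som_sketch<-function(X,gx=5,gy=5,nsteps=2000,alpha0=0.05,r0=max(gx,gy)/2) {
  gpos<-expand.grid(x=1:gx,y=1:gy) # positions of the prototypes on the map
  W<-X[sample(nrow(X),gx*gy),,drop=F] # initialize prototypes from the data
  for (t in 1:nsteps) {
    x<-X[sample(nrow(X),1),] # pick one observation
    bmu<-which.min(rowSums(sweep(W,2,x)^2)) # best-matching unit
    d2<-(gpos$x-gpos$x[bmu])^2+(gpos$y-gpos$y[bmu])^2 # grid distance to the BMU
    alpha<-alpha0*(1-t/nsteps) # learning rate decays over time
    r<-r0*(1-t/nsteps)+1e-6 # neighborhood radius shrinks over time
    h<-exp(-d2/(2*r^2)) # neighborhood weights
    W<-W+alpha*h*(matrix(x,nrow(W),ncol(W),byrow=T)-W) } # move nearby units toward x
  list(codes=W,grid=gpos) }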
############################################
#SOM
library(kohonen)
library(MASS)
#
data(crabs)
crabs2<-crabs
head(crabs) # mixed data types in this data set
crabs2[,1]<-as.numeric(crabs2[,1]) # encode species numerically so PCA can run
crabs2[,2]<-as.numeric(crabs2[,2]) # encode sex numerically
pp<-prcomp(crabs2[,-3],scale=T) # PCA
CC<-pp$x # PCA scores (uses the centering and scaling from prcomp)
plot(CC[,1],CC[,2],col=crabs2$sp,pch=crabs2$sex,xlab="PC1",ylab="PC2")
plot(CC[,3],CC[,4],col=crabs2$sp,pch=crabs2$sex,xlab="PC3",ylab="PC4")
| | sp | sex | index | FL | RW | CL | CW | BD |
|---|---|---|---|---|---|---|---|---|
| 1 | B | M | 1 | 8.1 | 6.7 | 16.1 | 19.0 | 7.0 |
| 2 | B | M | 2 | 8.8 | 7.7 | 18.1 | 20.8 | 7.4 |
| 3 | B | M | 3 | 9.2 | 7.8 | 19.0 | 22.4 | 7.7 |
| 4 | B | M | 4 | 9.6 | 7.9 | 20.1 | 23.1 | 8.2 |
| 5 | B | M | 5 | 9.8 | 8.0 | 20.3 | 23.0 | 8.2 |
| 6 | B | M | 6 | 10.8 | 9.0 | 23.0 | 26.5 | 9.8 |
PCA can fail miserably for mixed data.
Let's try SOM on this data set instead.
ss<-som(as.matrix(crabs2[,-3]),grid=somgrid(5,5))
plot(ss,type="mapping",col=crabs2$sp,pch=crabs2$sex) # where observations are co-located
plot(ss,type="codes") # the radius of each wedge is the magnitude of that feature in that grid cell.
som_model <- som(as.matrix(crabs2[,-3]),
grid=somgrid(5,5),
rlen=100,
alpha=c(0.05,0.01),
keep.data = TRUE)
plot(som_model,type="count") # where are the objects located
plot(som_model,type="dist.neighbours") # distance from each grid unit to its neighbors (large = cluster boundary)
plot(som_model,type="codes") # content of grid
# visualize one variable across the map
plot(som_model, type = "property",
property = som_model$codes[[1]][,4],
main=colnames(som_model$codes[[1]])[4])
plot(som_model, type = "property",
property = som_model$codes[[1]][,1],
main=colnames(som_model$codes[[1]])[1])
# Use the SOM grid data to cluster the grids into groups
som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), 6)
plot(som_model, type="mapping", bgcol = som_cluster, main = "Clusters")
add.cluster.boundaries(som_model, som_cluster) # cluster the prototypes
Let's try this on the digits data set as well.
iz<-sample(seq(1,7291),2000)
Nmat<-as.matrix(zip.train[iz,-1])
Nlab<-zip.train[iz,1]
# this takes some time...
som_modelN <- som(as.matrix(Nmat),
grid=somgrid(25,25),
rlen=100,
alpha=c(0.05,0.01),
keep.data = TRUE)
plot(som_modelN,type="changes")
plot(som_modelN,type="count")
plot(som_modelN,type="dist.neighbours")
som_cluster <- cutree(hclust(dist(som_modelN$codes[[1]])), 10)
plot(som_modelN, type="mapping", bgcol = som_cluster, main = "Clusters")
add.cluster.boundaries(som_modelN, som_cluster)
#
plot(som_modelN,type="mapping",pch=as.character(Nlab),col=as.numeric(Nlab)+1)
plot(som_modelN,type="mapping",pch=as.character(Nlab),bgcol = som_cluster)
plot(som_modelN,type="codes")
The codes are not as easy to interpret in a high-dimensional setting (it's the raster scan of the representative image in the code book). Let's look at these as images instead (not all 625 of course).
# 625 images to look at: space it out by 5
par(mfrow=c(5,5))
kvec<-sort(c(seq(1,25,by=5),seq(5*25+1,6*25,by=5),seq(10*25+1,11*25,by=5),seq(15*25+1,16*25,by=5),seq(20*25+1,21*25,by=5)))
for (kk in (1:25)) {
image(t(matrix(som_modelN$codes[[1]][kvec[kk],],16,16,byrow=T))[,16:1],col=gray.colors(33))
}
####### supervised SOM
Nbr.som2<-xyf(Nmat,Y=as.factor(Nlab),grid=somgrid(5,5))
plot(Nbr.som2)
plot(Nbr.som2,type="mapping",pch=as.character(Nlab))
# 25 code images to look at.
par(mfrow=c(5,5))
for (kk in (1:25)) {
image(t(matrix(Nbr.som2$codes[[1]][kk,],16,16,byrow=T))[,16:1],col=gray.colors(33))
}
# Heart disease data
sh<-som(as.matrix(SAdata[,-10]),grid=somgrid(5,5))
plot(sh)
plot(sh,type="count")
plot(sh,type="mapping",pch=as.character(SAdata[,10]))
plot(sh,type="dist.neighbours")
#
sh_cluster <- cutree(hclust(dist(sh$codes[[1]])), 6)
plot(sh, type="mapping", bgcol = sh_cluster, main = "Clusters")
add.cluster.boundaries(sh, sh_cluster)
# Unsupervised SOM does not separate the classes well.
plot(sh,type="mapping",pch=as.character(SAdata[,10]),col=as.numeric(SAdata[,10])+1)
# Let's try supervised
sh2<-xyf(X=as.matrix(SAdata[,-10]),Y=as.factor(SAdata[,10]),grid=somgrid(5,5))
plot(sh2)
plot(sh2,type="mapping",pch=as.character(SAdata[,10]),col=as.numeric(SAdata[,10])+1)
Now we have found representatives for the classes on the grid.
One can use these for prediction for example.
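For example, a rough sketch of class prediction by hand: send each observation to its best-matching unit and read off the dominant class stored in the second code-book layer (kohonen's predict() method does this properly, with held-out data; this in-sample version just exposes the idea):
codesX<-sh2$codes[[1]] # feature prototypes
codesY<-sh2$codes[[2]] # class profile of each unit
Xs<-as.matrix(SAdata[,-10])
bmu<-apply(Xs,1,function(x) which.min(colSums((t(codesX)-x)^2))) # best-matching unit per observation
pred<-max.col(codesY)[bmu]-1 # dominant class of that unit, back on the 0/1 scale
table(pred,SAdata[,10]) # in-sample confusion matrix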
Pros and cons of (sparse) SVD
Non-negative matrix factorization
NMF: $$X=WH, \quad W\geq 0, \ \ H \geq 0,$$ where, for $N$ observations on $p$ features, $W$ is $N \times K$ and $H$ is $K \times p$.
Example from the handwritten digits: say we choose to approximate the data with a rank-$K$ factorization.
Both SVD and NMF look for a linear dimension reduction that summarizes the data well; the difference lies in the structure assumed for the factors (orthogonality for SVD, nonnegativity for NMF).
Given a rank-$K$ fit, we approximate the $j$th observation by $$\hat{X}_j = \sum_{l=1}^K W_{jl} H_{l\cdot},$$ i.e. a nonnegative combination of the nonnegative basis rows $H_{l\cdot}$.
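How are $W$ and $H$ found? For squared error, the classical multiplicative updates of Lee and Seung keep both factors nonnegative at every step. A minimal sketch (the NMF package used below does this for you, with better initialization and stopping rules; nmf_sketch is a hypothetical helper):
nmf_sketch<-function(X,K,niter=200,eps=1e-9) {
  W<-matrix(runif(nrow(X)*K),nrow(X),K) # random nonnegative start
  H<-matrix(runif(K*ncol(X)),K,ncol(X))
  for (it in 1:niter) {
    H<-H*(t(W)%*%X)/(t(W)%*%W%*%H+eps) # multiplicative update: ratios of nonnegatives stay nonnegative
    W<-W*(X%*%t(H))/(W%*%H%*%t(H)+eps) }
  list(W=W,H=H) }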
library("NMF")
# a subset of handwritten digits
N<-500
# sample 500 digits to try NMF on
it<-sample(seq(1,7291),N)
Nmat<-zip.train[it,-1]
Nlab<-zip.train[it,1]
K<-10 # how many factors
aa<-nmf(Nmat-min(Nmat),K)
coefmap(aa) # this is what the codebook looks like (each is a raster scan of a 16*16 image)
plot(basis(fit(aa))[,1:2],col=gray(Nlab/10),
pch=as.character(Nlab),xlab="W1",ylab="W2") # this is the basis plot - are digits separated?
# Examples of code-book images
image(t(matrix(coef(aa)[1,],16,16,byrow=T))[,16:1],col=gray.colors(33))
image(t(matrix(coef(aa)[2,],16,16,byrow=T))[,16:1],col=gray.colors(33))
# Try a few of the digits and look what the K-rank representation captures
par(mfrow=c(1,2))
for (bb in (1:5)) {
ss<-sample(seq(1,length(it)),1)
image(t(matrix(Nmat[ss,],16,16,byrow=T))[,16:1],col=gray.colors(33))
image(t(matrix(fitted(aa)[ss,],16,16,byrow=T))[,16:1],col=gray.colors(33)) }
##### Try with different K at home and see how much this improves the reconstruction
# Summary of the NMF fit
summary(aa)
# percent of variance explained
print(c("Variance explained"))
round(1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2),3)
# Let's try fewer factors
K<-3
aa<-nmf(Nmat-min(Nmat),K)
par(mfrow=c(1,1))
coefmap(aa)
summary(aa)
print("Variance explained")
round(1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2),3)
# Explained variance decreases
####### can we recover the digit classes from the low-rank representation?
kk<-kmeans(fitted(aa),10) # cluster the rank-K reconstructions into 10 groups
table(kk$cluster,Nlab)
#
library(dendextend)
hc <- hclust(dist(fitted(aa)),method="complete")
dend <- as.dendrogram(hc)
labels_colors(dend) <- (as.numeric(Nlab)+1)[order.dendrogram(dend)] # +1 so the 0s get a visible color
plot(dend)
hcc<-cutree(hc,10)
table(hcc,Nlab)
    Nlab
      0  1  2  3  4  5  6  7  8  9
  1   5  0 21  1  3  6  9  0  2  0
  2  30  0  0  0  0  0  0  0  0  0
  3   2  0  0  2  3  6  0  1 25  6
  4   0 64  1  0  4  0 11  1  2  3
  5   0  0  0  4 12  2  0 17  2 21
  6   5  0 17  6  5  3 15  1  0  0
  7  44  0  5  0  0  1  4  0  0  0
  8   2  0  2 11  1  7  0  0  2  0
  9   1  0  5 24  2 17  0  1  1  3
  10  0  0  0  0  6  2  0 24  0 17
    Nlab
hcc   0  1  2  3  4  5  6  7  8  9
  1   2 25  1  2  3  1  7  0  9  1
  2  43  0  4  0  0  1  3  0  0  0
  3   0 39  7  1  6  1  8  1  5  5
  4   0  0  2 19 21  9  0 40 13 36
  5   3  0 10 21  1 22  1  0  0  0
  6   8  0 25  4  4  4 19  1  0  0
  7  30  0  0  0  0  0  0  0  0  0
  8   0  0  0  0  0  4  0  2  5  8
  9   3  0  1  0  1  0  1  1  1  0
  10  0  0  1  1  0  2  0  0  1  0
Try different K at home and see which works best for SVD and NMF.
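A sketch of such a comparison in terms of explained variance (slow for many values of K, since each nmf() call is expensive):
Xn<-Nmat-min(Nmat) # shifted to be nonnegative, as above
sv<-svd(Xn)
for (K in c(3,10)) {
  Xsvd<-sv$u[,1:K]%*%diag(sv$d[1:K])%*%t(sv$v[,1:K]) # rank-K SVD reconstruction
  Xnmf<-fitted(nmf(Xn,K)) # rank-K NMF reconstruction
  print(round(c(K=K,svd=1-sum((Xn-Xsvd)^2)/sum(Xn^2),nmf=1-sum((Xn-Xnmf)^2)/sum(Xn^2)),3)) }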
Let's try to summarize an image.
puppy<-as.matrix(read.table("puppy.txt"))
image(puppy,col=gray.colors(256,0,1))
ssp<-svd(puppy-mean(puppy))
varexp<-cumsum(ssp$d^2)/sum(ssp$d^2)
plot(varexp) # very few components needed to summarize this image
#### Low-rank approximations
par(mfrow=c(2,2))
k<-2
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
k<-5
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
k<-15
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
k<-25
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
#### Missing values (corrupted image) # Try even more missing values here and see for how long SVD imputation works
aa<-sample(seq(1,322),100,replace=T)
bb<-sample(seq(1,300),100,replace=T)
puppy2<-puppy
puppy2[aa,bb]<-NA # note: this blanks out the entire aa x bb grid of pixels, not just 100 of them
par(mfrow=c(1,1))
image(puppy2,col=gray.colors(256,0,1))
library(softImpute)
#### Using low-rank approximations to fill in missing values
pcomp<-softImpute(puppy2,rank=10,lambda=.85) # using very low rank approximation to fill in the gaps
image(pcomp$u%*%diag(pcomp$d)%*%t(pcomp$v),col=gray.colors(256,0,1))
pcomp<-softImpute(puppy2,rank=35,lambda=.9) # slightly higher rank
image(pcomp$u%*%diag(pcomp$d)%*%t(pcomp$v),col=gray.colors(256,0,1))
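To see for how long the imputation holds up, a quick sketch measuring the error on the held-out pixels as the corruption grows:
for (nmiss in c(50,100,200)) {
  ai<-sample(seq(1,322),nmiss,replace=T)
  bi<-sample(seq(1,300),nmiss,replace=T)
  pmiss<-puppy
  pmiss[ai,bi]<-NA # blank out an ai x bi grid of pixels
  fit<-softImpute(pmiss,rank=35,lambda=.9)
  rec<-fit$u%*%diag(fit$d)%*%t(fit$v)
  print(c(nmiss,round(mean((rec[is.na(pmiss)]-puppy[is.na(pmiss)])^2),4))) } # MSE on the missing pixels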
####
#### Note - this procedure also uses soft thresholding.
#### When lambda is 0 you use the leading components for imputation, when lambda > 0 you shrink the singular values toward 0
#### This works as denoising.
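The soft-thresholding step itself is one line: shrink every singular value by lambda and truncate at zero. A sketch on the complete image (soft_svd is my own helper; the value of lambda is arbitrary):
soft_svd<-function(X,lambda) {
  s<-svd(X)
  d<-pmax(s$d-lambda,0) # soft-threshold the singular values
  keep<-d>0
  s$u[,keep,drop=F]%*%diag(d[keep],sum(keep))%*%t(s$v[,keep,drop=F]) }
image(soft_svd(puppy,2),col=gray.colors(256,0,1)) # shrunken-rank reconstruction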
#### Denoising example
puppy3<-puppy+matrix(rnorm(322*300,sd=.2),322,300) # add Gaussian noise
image(puppy3,col=gray.colors(256,0,1))
ssp<-svd(puppy3)
k<-15
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
k<-35
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1))
### the low rank approximations of the puppy image
ssp<-svd(puppy-mean(puppy))
par(mfrow=c(4,4))
for (k in 1:16) {
if (k>1) {
image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) }
if (k==1) {
image(ssp$d[1]*matrix(ssp$u[,1:k],322,1)%*%matrix(ssp$v[,1:k],1,300),col=gray.colors(256,0,1)) }
}
# The SVD layers
par(mfrow=c(4,4))
for (k in 1:16) {
image(ssp$d[k]*matrix(ssp$u[,k],322,1)%*%matrix(ssp$v[,k],1,300),col=gray.colors(256,0,1)) }
### And now NMF
K<-16
aa<-nmf(puppy,K)
par(mfrow=c(1,1))
image(fitted(aa),col=gray.colors(256,0,1))
# the NMF layers
par(mfrow=c(4,4))
for (k in (1:16)) {
image(matrix((basis(aa))[,k],322,1)%*%matrix((coef(aa)[k,]),1,300),col=gray.colors(256,0,1))
}
# adding the layers up
par(mfrow=c(4,4))
for (k in (1:16)) {
if (k>1) {
image((basis(aa))[,1:k]%*%(coef(aa)[1:k,]),col=gray.colors(256,0,1))
}
if (k==1) {
image(matrix((basis(aa))[,k],322,1)%*%matrix((coef(aa)[k,]),1,300),col=gray.colors(256,0,1))
}
}
###########################
#### on a larger sample of digits
# this takes a while to run!
N<-500
itNbr<-sample(seq(1,7291),N)
Nmat<-zip.train[itNbr,-1]
Nlab<-zip.train[itNbr,1]
K<-16
aaNbr<-nmf(Nmat-min(Nmat),K)
#
par(mfrow=c(1,1))
coefmap(aaNbr)
basismap(aaNbr)
###############################
par(mfrow=c(4,4)) # Codebook
for (k in (1:16)) {
image(t(matrix(coef(aaNbr)[k,],16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
}
# how good is the approximation? Look at observation u (try different ones)
# here I try a couple of observations
par(mfrow=c(4,4))
su<-sample(seq(1,length(itNbr)),8)
for (zz in (1:length(su))) {
u<-su[zz]
s1<-(basis(aaNbr))[u,]%*%(coef(aaNbr))
image(t(matrix(s1,16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
image(t(matrix(Nmat[u,],16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
}
######
svdN<-svd(Nmat)
par(mfrow=c(4,4))
for (k in 1:16) {
image(t(matrix(svdN$v[,k],16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
}
for (zz in (1:length(su))) {
u<-su[zz]
s1<-svdN$u[u,1:16]%*%diag(svdN$d[1:16])%*%t(svdN$v[,1:16])
image(t(matrix(s1,16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
image(t(matrix(Nmat[u,],16,16,byrow=T))[,16:1],col=gray.colors(256,0,1))
}