First I will implement a simplified version of cluster prediction strength just to illustrate the logic behind it.
library(cluster)
library(datasets)
library(MASS)
data(iris)
# Manual cluster prediction strength for selecting the number of clusters.
# Cheating a bit since not trying all combos.
# The fpc package has many cluster validation functions in it - see further down.
ii<-sample(seq_len(150),100) # sample a subset of the iris data (row indices)
#
Kvec<-seq(2,10) # candidate numbers of clusters to investigate
pvec<-0*Kvec # vector to store the overlap score for each K
ij<-sample(seq_along(ii),round(length(ii)/2)) # divide the subsample into two halves
Aset<-iris[ii,][ij,]  # half A
Bset<-iris[ii,][-ij,] # half B
for (kk in seq_along(Kvec)) {
  ppA<-pam(Aset[,1:4],Kvec[kk]) # cluster part A with PAM
  ppB<-pam(Bset[,1:4],Kvec[kk]) # cluster part B with PAM
  if (Kvec[kk]<6) {
    # plot the cluster outcomes, colors from data set A clustering, symbols from B
    pairs(rbind(Aset,Bset)[,-5],col=c(ppA$clust,rep("grey",nrow(Bset))),pch=c(rep(1,nrow(Aset)),ppB$clust))
    p<-locator() } # interactive pause: push Finish to continue to next cluster number
  lA<-lda(Aset[,1:4],ppA$clust) # use A's clustering to generate a classification
  pA<-predict(lA,newdata=Bset[,1:4],type="class") # predict on set B
  lB<-lda(Bset[,1:4],ppB$clust) # and vice versa
  pB<-predict(lB,newdata=Aset[,1:4],type="class")
  #
  tAB<-table(pA$class,ppB$clust) # compare classification vs clustering
  tBA<-table(pB$class,ppA$clust)
  pvec[kk]<-sum(apply(tAB,1,max))+sum(apply(tBA,1,max)) } # compute approximate overlap
plot(Kvec,pvec,xlab="K",ylab="Overlap")
Compare the cluster overlap between data set A (color) and data set B (grey symbols). For which number of clusters do colors and symbols appear "pure" - i.e., not a mix of colors within a symbol or a mix of symbols within a color? For this data set the cluster overlap is maximized with 2 clusters.
Next I manipulate the iris data to make the Virginica data more separated from the rest. This is to illustrate that with stronger separation you might find 3 clusters.
# Shift Virginica along feature 1 so 3 clusters become easier to detect.
irisnew<-iris
irisnew[iris[,5]=="virginica",1]<-irisnew[iris[,5]=="virginica",1]+2 # make Virginica stand out more.
#
ii<-sample(seq(1,150),100)                        # random subsample of rows
Kvec<-seq(2,10)                                   # candidate cluster counts
pvec<-0*Kvec                                      # overlap score per K
ij<-sample(seq(1,length(ii)),round(length(ii)/2)) # split into halves
Aset<-irisnew[ii,][ij,]                           # half A
Bset<-irisnew[ii,][-ij,]                          # half B
for (kk in (1:length(Kvec))) {
  K<-Kvec[kk]
  ppA<-pam(Aset[,1:4],K)  # PAM clustering of each half
  ppB<-pam(Bset[,1:4],K)
  if (K<6) {
    # colors = A's clusters, plotting symbols = B's clusters
    pairs(rbind(Aset,Bset)[,-5],col=c(ppA$clust,rep("grey",dim(Bset)[1])),pch=c(rep(1,dim(Aset)[1]),ppB$clust))
    p<-locator() }        # pause until Finish is pressed
  lA<-lda(Aset[,1:4],ppA$clust)  # cross-classify: train on A, predict B
  pA<-predict(lA,newdata=Bset[,1:4],type="class")
  lB<-lda(Bset[,1:4],ppB$clust)  # and train on B, predict A
  pB<-predict(lB,newdata=Aset[,1:4],type="class")
  tAB<-table(pA$class,ppB$clust) # agreement tables
  tBA<-table(pB$class,ppA$clust)
  pvec[kk]<-sum(apply(tAB,1,max))+sum(apply(tBA,1,max)) } # overlap score
plot(Kvec,pvec,xlab="K",ylab="Overlap")
Next I split the first cluster into two with strong separation to see if I can detect 4 clusters.
### creating 4 separated clusters
# Start from iris, shift Virginica along feature 1, then shift half of the
# Virginica rows along feature 2 to create a 4th well-separated cluster.
irisnew<-iris
irisnew[iris[,5]=="virginica",1]<-irisnew[iris[,5]=="virginica",1]+2
# row indices of the Virginica samples
ss<-seq(1,dim(iris)[1])[iris[,5]=="virginica"]
irisnew[ss[1:25],2]<-irisnew[ss[1:25],2]+3
#
# Same prediction-strength procedure as above, now on the 4-cluster data.
ii<-sample(seq(1,150),100)
Kvec<-seq(2,10)
pvec<-0*Kvec
ij<-sample(seq(1,length(ii)),round(length(ii)/2))
Aset<-irisnew[ii,][ij,]
Bset<-irisnew[ii,][-ij,]
for (kk in (1:length(Kvec))) {
ppA<-pam(Aset[,1:4],Kvec[kk]) # PAM clustering of each half
ppB<-pam(Bset[,1:4],Kvec[kk])
if (Kvec[kk]<6) {
# colors = half A's clusters, symbols = half B's clusters
pairs(rbind(Aset,Bset)[,-5],col=c(ppA$clust,rep("grey",dim(Bset)[1])),pch=c(rep(1,dim(Aset)[1]),ppB$clust))
p<-locator() } # interactive pause: press Finish to continue
lA<-lda(Aset[,1:4],ppA$clust) # train classifier on A's clustering
pA<-predict(lA,newdata=Bset[,1:4],type="class") # predict on half B
lB<-lda(Bset[,1:4],ppB$clust)
pB<-predict(lB,newdata=Aset[,1:4],type="class")
#
tAB<-table(pA$class,ppB$clust) # classification vs clustering agreement
tBA<-table(pB$class,ppA$clust)
pvec[kk]<-sum(apply(tAB,1,max))+sum(apply(tBA,1,max)) } # overlap score for this K
plot(Kvec,pvec,xlab="K",ylab="Overlap")
Instead of doing this from scratch let's use packages instead.
#### Package for cluster validation
#install.packages("fpc")
library(fpc)
# Read the helpfile about the package
#library(help="fpc")
# online manual also
ii<-sample(seq(1,150),100) # random subsample of the iris rows
# Prediction strength: M=10 random splits, K from Gmin to Gmax,
# kmeans clustering with centroid-based classification between halves.
pp<-prediction.strength(iris[ii,-5], Gmin=2,Gmax=5,clustermethod=kmeansCBI, classification="centroid",M=10)
# You can try other clustering methods like hierarchical etc and other classifiers like kNN etc
# Check help(prediction.strength) for more info
# drop the first entry of mean.pred (K=1) so the x-axis aligns with seq(2,5)
plot(seq(2,5),pp$mean.pred[-1],xlab="Number of clusters", ylab="Prediction strength")
#
Clearly, the overlap is best for 2 clusters.
Now, try this for the data set with more separation between 3 clusters and an added 4th from splitting Virginica.
# Rebuild the modified iris data: shift Virginica and split half of it off.
irisnew<-iris
vmask<-iris[,5]=="virginica"
irisnew[vmask,1]<-irisnew[vmask,1]+2        # separate Virginica along feature 1
ss<-which(vmask)                            # row indices of Virginica
irisnew[ss[1:25],2]<-irisnew[ss[1:25],2]+3  # split half of Virginica along feature 2
ii<-sample(seq(1,150),100)                  # random subsample
# Prediction strength with kmeans clustering and centroid classification.
pp<-prediction.strength(irisnew[ii,-5], Gmin=2,Gmax=10,clustermethod=kmeansCBI, classification="centroid",M=10)
plot(seq(2,10),pp$mean.pred[-1],xlab="Number of clusters", ylab="Prediction strength")
# Do a couple of times and check how many clusters are selected
Your results here may vary a bit from sample to sample - run a couple of times to see.
Next, try this with a different clustering and classification method (here hierarchical clustering and knn).
##
# You can also try different clustering and classification methods...
# Rebuild the 4-cluster iris variant (note: the second shift is +4 here, not +3 as above).
irisnew<-iris
irisnew[iris[,5]=="virginica",1]<-irisnew[iris[,5]=="virginica",1]+2
ss<-seq(1,dim(iris)[1])[iris[,5]=="virginica"]
irisnew[ss[1:25],2]<-irisnew[ss[1:25],2]+4
ii<-sample(seq(1,150),100)
# Hierarchical clustering (complete linkage, cut by level) with kNN (nnk=5) classification.
pp<-prediction.strength(irisnew[ii,-5], Gmin=2,Gmax=10,clustermethod=hclustCBI, method="complete",cut="level",classification="knn",nnk=5,M=10)
plot(seq(2,10),pp$mean.pred[-1],xlab="Number of clusters", ylab="Prediction strength")
Warning message in max(which(mean.pred > cutoff)): "no non-missing arguments to max; returning -Inf"
With these sets of methods, it seems more clear that 4 clusters are detectable.
A key component to cluster selection and validation is having metrics to compare clusters. One popular metric to compare 2 cluster results is the \emph{Rand Index} defined as
$$RI = \frac{a+b}{a+b+c+d}$$ where $a$ is the number of pairs that are together in both cluster results and $b$ is the number of pairs that are apart in both cluster results. $c$ and $d$ are the numbers of pairs that are together in the first cluster result but not together in the other, and vice versa. A high Rand Index means that the clusterings overlap a lot. The Adjusted RI (or corrected RI) is a metric that adjusts for the cluster overlap that could be expected just by chance.
A second commonly used metric is the \emph{Jaccard Index} defined as the ratio between the intersection and the union of two sets. Here, one can use this at the cluster level to measure how much in agreement a cluster allocation is between two cluster results.
Another metric, used below, is the \emph{Variation of Information}. Two cluster methods result in two different partitions of the data, $A_1,\cdots,A_K$ and $B_1,\cdots,B_J$. For partition $A$ you can compute the cluster proportions $a_k, k=1,\cdots,K$, and similarly for $B$ we get $b_j, j=1,\cdots,J$. Let $o_{kj}$ be the proportion of observations that fall in both $A_k$ and $B_j$, i.e., $o_{kj} = |A_k \cap B_j|/n$, for all pairs of clusters between the two partitions. The Variation of Information is then defined as
$$ VI = - \sum_{k,j} o_{kj} \bigl(\log(\frac{o_{kj}}{a_{k}}) + \log(\frac{o_{kj}}{b_{j}})\bigr) $$ which is a measure of distance between the clusterings. A small value indicates that there is strong agreement between the clusterings.
# Compare kmeans and hierarchical clustering, 3 clusters each, on iris.
kk<-kmeans(iris[,-5],centers=3)
dmat<-dist(iris[,-5])
hh<-cutree(hclust(dmat),k=3)
# comparison indices (Rand, VI, ...) between the two partitions
cluster.stats(clustering=kk$cluster,alt.clustering=hh,compareonly = TRUE)
# cross-tabulate the two labelings
table(kk$cluster,hh)
pairs(iris[,-5],col=kk$cluster,pch=as.numeric(iris[,5]), main = "Overlap kmeans and true labels")
pairs(iris[,-5],col=hh,pch=as.numeric(iris[,5]), main = "Overlap hierarchical and true labels")
pairs(iris[,-5],col=hh,pch=kk$cluster, main = "Overlap hierarchical and kmeans")
hh 1 2 3 1 33 0 0 2 0 72 24 3 17 0 4
The kmeans and hierarchical cluster results with 3 clusters do not overlap as you can see from the indices and the scatter plots. Try with more methods, different distances and see what happens.
What about 2 clusters?
# Repeat the comparison with 2 clusters.
# NOTE(review): this uses irisnew (the modified data with Virginica shifted),
# not the original iris - confirm that is intended given the "2 clusters" prose.
kk<-kmeans(irisnew[,-5],2)
hh<-cutree(hclust(dist(irisnew[,-5])),2)
cluster.stats(clustering=kk$cluster,alt.clustering=hh,compareonly = T)
table(kk$cluster,hh)
hh 1 2 1 100 0 2 0 50
Now we have perfect agreement between kmeans and hierarchical clustering.
Let's try resampling to assess how much clusters may vary for a random subset of observations. Are clusters stable?
# Stability of clustering results - using bootstrap to assess
# clusterboot resamples the data B times, re-clusters each resample, and reports
# per-cluster Jaccard agreement with the clusters found on the full data.
par(mfrow=c(2,2))
# B=2 is only for illustration; showplots=T draws each bootstrap clustering.
pp<-clusterboot(iris[,-5],B=2,showplots=T,clustermethod=kmeansCBI,krange=2,multipleboot = F)
print(pp)
pp<-clusterboot(iris[,-5],B=2,showplots=T,clustermethod=kmeansCBI,krange=3,multipleboot = F)
print(pp)
pp<-clusterboot(iris[,-5],B=2,showplots=T,clustermethod=kmeansCBI,krange=4,multipleboot = F)
print(pp)
boot 1 boot 2 * Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 2 Number of clusters found in data: 2 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 1 1 dissolved: [1] 0 0 recovered: [1] 2 2 boot 1 boot 2
* Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 2 Number of clusters found in data: 3 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.9720280 1.0000000 0.9502315 dissolved: [1] 0 0 0 recovered: [1] 2 2 2 boot 1 boot 2
* Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 2 Number of clusters found in data: 4 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 1.0000000 0.7991071 0.8987179 0.9705882 dissolved: [1] 0 0 0 0 recovered: [1] 2 2 2 2
For the iris data, bootstrap results are not stable beyond 2 clusters. Try this on the iris data where you created greater separation between the clusters.
Let's run this on more bootstrap examples.
# Now with B=100 bootstrap resamples on the modified (more separated) data;
# showplots=F and count=F keep the output quiet. krange sets the number of clusters.
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=kmeansCBI,krange=2,multipleboot = F,count=F)
print(pp)
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=kmeansCBI,krange=3,multipleboot = F,count=F)
print(pp)
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=kmeansCBI,krange=4,multipleboot = F,count=F)
print(pp)
* Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 2 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.9280267 0.9521509 dissolved: [1] 1 0 recovered: [1] 91 96 * Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 3 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.9686008 0.9004731 0.8710462 dissolved: [1] 0 0 12 recovered: [1] 100 78 73 * Cluster stability assessment * Cluster method: kmeans Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 4 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.8234300 0.5033564 0.4587607 0.7015930 dissolved: [1] 6 54 68 1 recovered: [1] 73 4 2 23
Looks pretty stable for 2 to 3 clusters with kmeans, but not with 4.
What about hierarchical clustering?
# Same stability assessment with hierarchical clustering (complete linkage);
# hclustCBI takes the number of clusters via k= rather than krange=.
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=hclustCBI,k=2,method="complete",multipleboot = F,count=F)
print(pp)
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=hclustCBI,k=3,method="complete",multipleboot = F,count=F)
print(pp)
pp<-clusterboot(irisnew[,-5],B=100,showplots=F,clustermethod=hclustCBI,k=4,method="complete",multipleboot = F,count=F)
print(pp)
* Cluster stability assessment * Cluster method: hclust/cutree Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 2 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.9938185 0.9921167 dissolved: [1] 0 0 recovered: [1] 98 98 * Cluster stability assessment * Cluster method: hclust/cutree Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 3 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 0.9805493 0.9825098 0.9730529 dissolved: [1] 0 1 4 recovered: [1] 96 96 95 * Cluster stability assessment * Cluster method: hclust/cutree Full clustering results are given as parameter result of the clusterboot object, which also provides further statistics of the resampling results. Number of resampling runs: 100 Number of clusters found in data: 4 Clusterwise Jaccard bootstrap (omitting multiple points) mean: [1] 1.0000000 0.9917513 1.0000000 0.9777238 dissolved: [1] 0 0 0 0 recovered: [1] 100 100 100 94
The clusters are fairly stable up to 4 clusters with hierarchical clustering with complete linkage. Remember, though, what the clusters look like: stable clusters, but with limited overlap with the kmeans clusters. Choice of method matters.
Let's explore the cancer data I posted on canvas. There are 6 different cancer samples, 2887 in total, each sample measured over 20000+ genes.
#install.packages("irlba")
#install.packages("plot3D")
#install.packages("rgl")
library(irlba)
# fast svd for large data sets
library(plot3D)
library(rgl)
#### Let's look at a very large genome cancer data set
#### 6 cancer classes: breast, glioblastoma (brain), lung, ovarian, kidney and uterus
#### 20000+ genes are the features
# Loads TCGA (expression matrix, samples x genes) and TCGAclassstr (class labels).
load("TCGAdata.RData")
ss<-irlba(TCGA,10,10) # keep the leading 10 singular vectors
# samples plotted in the leading component space, colored by true class
pairs(ss$u[,1:5],col=as.numeric(as.factor(TCGAclassstr)))
par(xpd = TRUE) # allow drawing the legend outside the plot region
legend("bottomright", fill = unique(as.factor(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))], legend = c(levels(as.factor(TCGAclassstr))))
#plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
#legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# 3D plot if you prefer.
You can see the cancer classes fairly well separated in the first leading eigencomponents. However, there is quite some imbalance between the classes here with many more breast cancer samples and very few uterine cases. Also, the uterine seems to be positioned between the other classes.
# kmeans with 6 clusters on the 10 leading component scores
kk<-kmeans(ss$u[,1:10],6)
#
library(mda)
# cross-tabulate cluster labels against the true cancer classes
mda::confusion(kk$cluster,TCGAclassstr)
Warning message: "package 'mda' was built under R version 4.1.3" Loading required package: class Loaded mda 0.5-2
true predicted BC GBM KI LU OV U 1 0 0 0 0 262 1 2 1 170 505 0 0 0 3 0 0 0 531 0 0 4 943 0 2 0 0 0 5 271 2 6 40 4 56 6 0 0 93 0 0 0
Kmeans on the 10 leading components splits the breast cancer data set into 2 clusters while combining glioblastoma and kidney cancer. As expected, we don't retrieve all 6 classes when we cluster.
Does it work if we use partitioning around medoids (PAM)?
library(cluster)
# PAM (partitioning around medoids) with 6 clusters on the component scores
pp<-pam(ss$u,6)
# per-observation silhouette widths, colored by cluster assignment
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
mda::confusion(pp$clustering,TCGAclassstr)
true predicted BC GBM KI LU OV U 1 0 169 0 0 0 0 2 223 3 5 23 3 50 3 992 0 10 0 0 5 4 0 0 0 0 263 1 5 0 0 7 548 0 1 6 0 0 584 0 0 0
While the clusters look good in the silhouette width plot, the confusion matrix shows that they don't overlap with the cancer classes - breast cancer is split into two groups. PAM was, however, better at separating glioblastoma and kidney cancer.
Let's try a hacky version of consensus clustering from scratch.
#### consensus clustering - using a random subset of genes to cluster
#### tabulate how often each pair of samples co-clusters across runs
nn<-nrow(TCGA)                    # number of samples (avoids hard-coding 2887)
DD<-matrix(0,nn,nn)               # co-clustering count matrix
nz<-50                            # number of resampling runs
for (zz in seq_len(nz)) {
  gsel<-sample(ncol(TCGA),2500)   # random gene subset (avoids hard-coding 20530)
  ss<-irlba(TCGA[,gsel],10,10)    # pca on a random subset of genes
  kk<-kmeans(ss$u[,1:10],6)       # clustering on the leading pca components
  dd<-as.matrix(dist(kk$cluster)) # 0 exactly when two samples share a cluster label
  dd[dd!=0]<-1                    # who is NOT in the same cluster = 1
  DD<-DD+(1-dd)                   # same +1 for those who ARE in the same cluster
  cat(zz)}                        # progress indicator
##
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
I resample 50 times. You can try fewer or more and see what happens. I now have a consensus matrix, DD, that tabulates how pairs of observations co-cluster across the different runs. I use this as new distance metric into hierarchical clustering.
# Turn co-clustering counts into a dissimilarity: pairs that always co-cluster
# get distance 0, pairs that never co-cluster get distance 1. Use nz (the number
# of resampling runs defined above) instead of a hard-coded 50, so changing nz
# keeps the distances correct.
hc<-hclust(as.dist(1-DD/nz)) # Use the co-clustering results as a
# new distance metric in e.g. hierarchical clustering.
# plot(hc,labels=TCGAclassstr)
# nicer dendrogram via dendextend
library(dendextend)
dend<-as.dendrogram(hc)
cols<-as.numeric(as.factor(TCGAclassstr))         # integer color code per true class
labels_colors(dend)<-cols[order.dendrogram(dend)] # color leaf labels by class
plot(dend)
Warning message: "package 'dendextend' was built under R version 4.1.3" --------------------- Welcome to dendextend version 1.15.2 Type citation('dendextend') for how to cite the package. Type browseVignettes(package = 'dendextend') for the package vignette. The github page is: https://github.com/talgalili/dendextend/ Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues You may ask questions at stackoverflow, use the r and dendextend tags: https://stackoverflow.com/questions/tagged/dendextend To suppress this message use: suppressPackageStartupMessages(library(dendextend)) --------------------- Attaching package: 'dendextend' The following object is masked from 'package:stats': cutree
From the dendrogram you can see that there are some samples that form tighter clusters than others (stay together more frequently) and others are scattered between the clusters. You see clear, stable subclusters in the breast cancer (black) class. Try this with different clustering methods, different amount of random features or observation sets and see what happens.
# Cut the consensus dendrogram at 6, 8 and 7 clusters and compare to the classes.
k<-cutree(hc,k=6)
mda::confusion(k,TCGAclassstr)
#
k<-cutree(hc,k=8)
mda::confusion(k,TCGAclassstr)
#
k<-cutree(hc,k=7)
mda::confusion(k,TCGAclassstr)
# how reproducible are the clusters?
#image(DD)
# consensus matrix reordered by the dendrogram: stable clusters show as solid blocks
image(DD[hc$order,hc$order]) #
true predicted BC GBM KI LU OV U 1 251 172 7 15 3 56 2 964 0 0 0 0 0 3 0 0 0 0 262 1 4 0 0 96 0 1 0 5 0 0 0 556 0 0 6 0 0 503 0 0 0
true predicted BC GBM KI LU OV U 1 0 169 0 0 0 0 2 251 3 7 15 3 56 3 666 0 0 0 0 0 4 298 0 0 0 0 0 5 0 0 0 0 262 1 6 0 0 96 0 1 0 7 0 0 0 556 0 0 8 0 0 503 0 0 0
true predicted BC GBM KI LU OV U 1 0 169 0 0 0 0 2 251 3 7 15 3 56 3 964 0 0 0 0 0 4 0 0 0 0 262 1 5 0 0 96 0 1 0 6 0 0 0 556 0 0 7 0 0 503 0 0 0
We see as before that it's hard to retrieve all the classes, and the clustering method favors splitting breast cancer into multiple clusters. Increasing to 7 clusters finds all the classes except the small uterine cancer, while splitting breast cancer into two subgroups. The consensus matrix depicts how stable the clusters are. You see that there are 4 fairly stable clusters in the bottom left corner of the consensus matrix that are well separated. Of the 3 at the top right corner, one looks stable, but the larger ones are clearly not as stable. This is the breast cancer class mixed with a few other samples, and the uterine class.
Feature filtering
Let's sort the features by their variance and pick some of the top and bottom ones and see what their distribution looks like.
# Rank genes by spread (sd; gives the same ordering as variance since sd is monotone).
vv<-apply(TCGA,2,sd)
vv<-rev(sort.list(vv)) # column indices sorted by decreasing sd
# some examples of filtered features
par(mfrow=c(2,2))
hist(TCGA[,vv[2]],50)     # a top-variance gene
hist(TCGA[,vv[130]],50)
hist(TCGA[,vv[530]],50)
hist(TCGA[,vv[18500]],50) # a near-bottom-variance gene
What you see is that for most observations a feature may have 0 or near-0 values. This means that a gene may not be expressed for some of the cancer classes but highly expressed for others. If the features look multimodal this might indicate they contain cluster information. Feature 530 looks nicely bimodal whereas feature 18500 looks unimodal. Remember, this is sorted by decreasing variance.
Let's use the leading 250 features and perform clustering. Here, I use a bi-clustering approach so I also cluster the features. The distance metric is euclidean.
guse<-vv[1:250]
# use top 250 genes
library(gplots)
# Bi-clustering heatmap: rows (samples) and columns (genes) are both clustered
# with complete-linkage hierarchical clustering and the default euclidean distance.
# RowSideColors marks each sample's true class; scale="column" standardizes genes.
heatmap.2(as.matrix(TCGA[,guse]),
col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
hclust=function(x) hclust(x,method="complete"),scale="column",trace="none")
#euclidean distance
Warning message: "package 'gplots' was built under R version 4.1.3" Attaching package: 'gplots' The following object is masked from 'package:stats': lowess
You see a nice patch work here. This is the sorted data matrix, sorted by observation (row) clustering with hierarchical clustering, euclidean distance and sorted by column clustering for the features. The color labels on the left are the true classes. You see a clear overlap with some clusters and classes. But the black class, breast cancer, is clearly split into two groups. You see that there are clusters of genes as well, groups that are either high (blue) or red(low) for a subset of cancers in different patterns. This is a very dense and informative summary of the data. However, always remember that the results are driven by your choices for clustering metric and method.
What if we try a different distance metric. Below, I use correlation as a similarity metric instead.
# Same heatmap but with a correlation-based distance: d = (1 - Pearson cor)/2.
heatmap.2(as.matrix(TCGA[,guse]),
col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
hclust=function(x) hclust(x,method="complete"),scale="column",trace="none",
distfun=function(x) as.dist((1-cor(t(x),method="pearson",use="pairwise.complete.obs"))/2))
#correlation
Notice how the results changed. Now, breast cancer is actually forming a slightly tighter group.
Let's investigate how feature selection may impact clustering (also a topic for project 2).
First I use the top 2000 varying genes and then use the leading eigenvectors from a PCA on those 2000 features.
guse<-vv[1:2000]
# use top 2000 genes
# BUG FIX: irlba was previously run on the full TCGA matrix, leaving guse unused;
# the surrounding text says the PCA should be on the top-2000-variance genes.
ss<-irlba(TCGA[,guse],10,10)
pairs(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
par(xpd = TRUE)
legend("bottomright", fill = unique(as.factor(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))], legend = c(levels(as.factor(TCGAclassstr))))
This gives us a good separation for some of the larger classes but clearly also some overlap. If you plot this without the colors you see what I mean.
# Same leading components without class colors - the cluster count is much less obvious.
pairs(ss$u[,1:3])
Now it's not so easy to see how many clusters there are or where they are separated.
Let's now only use the top-varying features and see what kind of cluster separation we get then.
# Scatter plots of the raw top-3-variance genes (no PCA), colored by true class.
pairs(TCGA[,vv[1:3]],col=as.numeric(as.factor(TCGAclassstr)))
par(xpd = TRUE)
legend("bottomright", fill = unique(as.factor(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))], legend = c(levels(as.factor(TCGAclassstr))))
Clearly, the top-varying genes are mostly separating breast and lung cancer from the rest. It appears that lung cancer is well separated in the subspace spanned by the 1st and 3rd most varying genes whereas breast cancer is separated along the 2nd most varying gene.
# The next three top-variance genes, again colored by true class.
pairs(TCGA[,vv[4:6]],col=as.numeric(as.factor(TCGAclassstr)))
par(xpd = TRUE)
legend("bottomright", fill = unique(as.factor(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))], legend = c(levels(as.factor(TCGAclassstr))))
If you look at the next few features, it is clear that some cancers dominate if you use variance as a feature selection.
Let's use the ConsensusClusterPlus package and investigate cluster stability and selection.
#BiocManager::install("ConsensusClusterPlus")
#browseVignettes("ConsensusClusterPlus")
# Cannot install directly to Rstudio depending on version - go via Bioconductor
library(ConsensusClusterPlus)
# this can be a bit slow depending on the number of features and observations!
ii<-sample(seq(1,2887),1000) # random subset of samples to keep runtime down
options(warn=-1) # silence warnings during the resampling runs
# 100 resampling runs, each using 60% of samples (pItem) and 60% of features
# (pFeature), kmeans clustering, K from 2 to maxK. Input is features x samples.
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,
clusterAlg="km")
options(warn=0) # restore warning reporting
Note: The km (kmeans) option only supports a euclidean distance metric when supplying a data matrix. If you want to cluster a distance matrix, use a different algorithm such as 'hc' or 'pam'. Changing distance to euclidean end fraction clustered clustered
clustered
clustered
clustered
clustered
clustered
ConsensusClusterPlus outputs the consensusmatrices for your selected range of number of clusters. It also provides the empirical cdf of the consensusmatrices that helps you assess what a reasonable number of clusters may be. See more on this under Project 2.
Let's have a look at one of the outputs. Do the consensus cluster overlap with the classes?
# cc[[k]] holds the consensus result for k clusters; compare its class
# assignments to the true cancer classes of the sampled rows.
ccu<-cc[[3]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
ccu<-cc[[5]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
ccu<-cc[[6]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
true predicted BC GBM KI LU OV U 1 6 67 203 0 0 5 2 413 0 1 0 0 1 3 0 0 2 198 92 12
true predicted BC GBM KI LU OV U 1 6 67 6 1 0 5 2 358 0 0 0 0 0 3 55 0 2 1 92 13 4 0 0 0 196 0 0 5 0 0 198 0 0 0
true predicted BC GBM KI LU OV U 1 0 0 2 0 0 0 2 412 0 0 0 0 1 3 7 67 3 1 0 5 4 0 0 0 196 0 0 5 0 0 199 0 0 0 6 0 0 2 1 92 12
You see that by increasing the number of clusters we don't retrieve stable clusters for all the classes.
Let's try a different clustering method, correlation based similarity and hierarchical clustering.
# Consensus clustering again, now with Pearson-correlation distance and
# hierarchical clustering ("hc") instead of kmeans.
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="hc")
end fraction clustered clustered
clustered
clustered
clustered
clustered
clustered
true predicted BC GBM KI LU OV U 1 0 0 203 0 0 0 2 411 0 0 0 0 1 3 1 67 0 0 0 0 4 0 0 0 196 0 0 5 0 0 0 0 92 5 6 1 0 3 1 0 0 7 0 0 0 1 0 12 8 6 0 0 0 0 0
# Compare the hierarchical consensus results at 3, 5 and 6 clusters to the classes.
ccu<-cc[[3]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
ccu<-cc[[5]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
ccu<-cc[[6]]
mda::confusion(ccu$consensusClass,TCGAclassstr[ii])
true predicted BC GBM KI LU OV U 1 8 67 206 2 0 12 2 411 0 0 0 92 6 3 0 0 0 196 0 0
true predicted BC GBM KI LU OV U 1 0 0 203 0 0 0 2 411 0 0 0 0 1 3 8 67 3 2 0 12 4 0 0 0 196 0 0 5 0 0 0 0 92 5
true predicted BC GBM KI LU OV U 1 0 0 203 0 0 0 2 411 0 0 0 0 1 3 7 67 0 1 0 12 4 0 0 0 196 0 0 5 0 0 0 0 92 5 6 1 0 3 1 0 0
With 5 clusters you almost retrieve the classes. Try this with different methods, different sampling of observations and different features.
We have talked about the inverse covariance matrix in class, mainly as to how it's an essential component of DA classification since the inverse appears in our decision rules. The problem with this was that for large numbers of features, strong correlations between features etc., the inverse may become numerically unstable. We "fixed" this by regularizing, for example by adding something to the diagonal of the covariance before taking the inverse. This constitutes a form of shrinkage, mainly along the directions of small variation.
In class I briefly discussed another variant of regularized variance estimation called graphical lasso. This results in a sparse estimate of the inverse covariance where only features that are strongly, directly correlated have a non-zero entry in the matrix (see lecture notes).
The output from graphical lasso is a so-called adjacency matrix with non-zero entries between objects/features that are directly dependent and zero otherwise. The more heavily we regularize the inverse, the fewer direct connections are estimated as non-zero. This controls the sparsity, i.e., the number of direct links between observations/features.
Let's try this on the digit data set.
# NOTE(review): ElemStatLearn has been archived from CRAN - may need to install
# from an archived source.
library(ElemStatLearn)
data(zip.train)
Nmat<-as.matrix(zip.train[,-1]) # pixel features only (column 1 is the digit label)
Numbers<-as.matrix(zip.train)   # keeps the label in column 1 (used for plot labels below)
#
library(huge)
# The network modeling package using sparse gaussian graphical models
#help(huge)
#
its<-sample(seq(1,dim(Nmat)[1]),1000)
# select 1000 digits at random
# fit the sparse graphical model; t() so the sampled digits become the variables
gg<-huge(t(Nmat[its,]))
names(gg)
plot(gg)
Warning message: "package 'huge' was built under R version 4.1.3"
Conducting Meinshausen & Buhlmann graph estimation (mb)....done
I prefer to use other packages to actually depict the resulting networks or graphs.
library(network)
par(mfrow=c(1,1))
# gg$path is a list of adjacency matrices, one per regularization level;
# [[4]] picks one sparsity level. displayisolates=F hides unconnected nodes.
plot.network(network(gg$path[[4]]),usearrows=F)
plot.network(network(gg$path[[4]]),usearrows=F,displayisolates=F,label=Numbers[its,1],label.pos=5,label.cex=.5,vertex.col=Numbers[its,1])
# plotting the graphical lasso output
<sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
With the network package I can add labels, arrows etc to my graph for richer visualizations.
Change the number in the double square brackets to look at networks of a different sparsity.
Notice how this method does a good job at separating the digits from one another while also illustrating that some individual numbers may be close to a digit from a different class.
We can apply this to the cancer data as well.
vv<-apply(TCGA,2,sd)
vv<-rev(sort.list(vv))
# Sort by top variance genes
library(irlba)
ss<-irlba(TCGA[,vv[1:5000]],nu=25,nv=25)
# leading 25 components (nu=25, nv=25)
# graphical model over the samples; nodes labeled and colored by cancer class
gg<-huge(t(ss$u))
plot.network(network(gg$path[[3]]),usearrows=F,displayisolates=F,label=TCGAclassstr,label.pos=5,label.col="white",label.cex=.5,vertex.col=as.numeric(as.factor(TCGAclassstr)))
Conducting Meinshausen & Buhlmann graph estimation (mb)....done
<sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
Again, you can see a clear separation of the cancer classes but also an indication that breast cancer may be split into two groups. Try looking at different levels of sparsity, with different numbers of PC component inputs etc.
You can apply this method to the features (genes) instead and construct a network of features. For the gene data application this is actually a very common technique. We are looking for gene sub-networks, or pathways, that form a functional component in the cell.
# network of genes instead
vv<-apply(TCGA,2,sd)
vv<-rev(sort.list(vv))
# NOTE(review): the gene-level fit below is commented out, so the plot reuses gg
# from the previous sample-level fit - uncomment the line to get an actual gene
# network; confirm which is intended.
#gg<-huge(TCGA[,vv[1:5000]])
plot.network(network(gg$path[[2]]),vertex.cex=.5,usearrows=F,displayisolates=F)
<sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
You see some subnetworks of different sizes and complexity. To get clusters (or modules) from a graph you can use various graph cutting algorithms. We will talk more about this and sparse modeling after the easter break.