######################################################################### ## Clustering tools f.clus <- function(LNT, eps=log(1,2), name='', labels=dimnames(LNT)[[2]]) { # Input is a data frame of log-transformed normalized expression data. # Produces three hierarchical clusters, one from Euclidean distance, # one from correlation, and one from manhattan distance of on-or-off # genes. jj <- LNT > eps plot(c(0,4), c(0,3), xaxt='n', yaxt='n', type='n', xlab='', ylab='') text(2, 2, name, cex=1.2) text(2, 1, 'Cluster Analysis', cex=1.2) plclust(hclust(dist(t(LNT))), labels=labels) title(paste('Clustering on Euclidean distance')) plclust(hclust((1-cor(LNT))/2), labels=labels) title('Clustering on correlation') plclust(hclust(dist(t(jj))), labels=labels) title('Clustering on presence or absence of genes') invisible(LNT) } gene.pc <- function(gene.data) { # gene.pc represents each gene as a point in sample space, # with axes given by principal components. spl.mean <- apply(gene.data, 2, mean) num.gene <- dim(gene.data)[1] centered <- sweep(gene.data, 2, spl.mean, '-') decomp <- svd(centered) variances <- decomp$d^2/num.gene components <- decomp$v scores <- decomp$u %*% diag(decomp$d) val <- list(scores=scores, variances=variances, components=components) class(val) <- 'gene.pc' val } plot.gene.pc <- function(pc, split=0) { plot(pc$scores[,1], pc$scores[,2], xlab='Comp1', ylab='Comp2') if (length(split)>1) { points(pc$scores[split,1], pc$scores[split,2], col=8, pch=16) points(pc$scores[!split,1], pc$scores[!split,2], col=4, pch=15) } invisible(pc) } sample.pc <- function(gene.data, splitter=0, usecor=F, center=T) { # sample.pc represents each sample as a point in gene space, # with axes given by principal components. num.sample <- dim(gene.data)[2] centered <- as.matrix(gene.data) if (center == T) { gene.mean <- apply(centered, 1, mean) centered <- sweep(centered, 1, gene.mean, '-') } if (usecor == T) { if(max(dim(centered)) > 2000) { gene.sd <- sqrt(apply(centered, 1, var)) } else { gene.sd <- sqrt(diag( centered %*% t(centered)/(num.sample-1) )) } gene.sd[gene.sd==0] <- 1 centered <- sweep(centered, 1, gene.sd, '/') } decomp <- svd(centered) variances <- decomp$d^2/num.sample components <- decomp$u scores <- t(decomp$d * t(decomp$v)) val <- list(scores=scores, variances=variances, components=components, splitter=splitter, usecor=usecor, raw=centered) class(val) <- 'sample.pc' val } plot.sample.pc <- function(pc, split=pc$splitter, name='', which=1:2, ...) { .my.plot <- function(z, colors, i, j, ...) { print(length(colors)) plot(color.coded.pair(z[,i], z[,j], colors), xlab=paste('Component', i), ylab=paste('Component', j), ...) } z <- pc$scores if (split == 0) { ccl <- list(color.coding(rep(T, length(pc$variances)), 1, 15)) } else if (is.logical(split)) { ccl <- list(color.coding(split, 1, 15), color.coding(!split, 2, 16)) } else { labels <- levels(split) ccl <- lapply(1:length(labels), function(x, f) { fc <- as.character(f) f1 <- attr(f, 'levels')[x] v <- fc == f1; color.coding(v, x+1, 15) }, split) } if (which == 'all') { plot(c(0,4), c(0,3), xaxt='n', yaxt='n', type='n', xlab='', ylab='') text(2, 2, name, cex=1.2) text(2, 1, 'Principal Components', cex=1.2) .my.plot(z, ccl, 1, 2, main='PCA', ...) .my.plot(z, ccl, 2, 3, main='PCA', ...) .my.plot(z, ccl, 1, 3, main='PCA', ...) } else { .my.plot(z, ccl, 1, 2, main=name, ...) } invisible(pc) } screeplot.sample.pc <- function(ob, cum=F, ...) { if (cum) { barplot(cumsum(ob$variances)/sum(ob$variances), ...) } else { barplot(ob$variances/sum(ob$variances), ...) } }