## simulations for pcanova-paper ## Kevin Coombes, May 2002 # First part simulates one structured and on structured matrix # Second part simulates 500 of each to estimaqte the distributions # of the PC correlation statistics # Uses three other sets of routines: # color.coded.pair (for plotting objects with consistent # colors and symbols in multiple plots) # gene.sample.pc (for PCA computations without constructing the # variance matrix) # pcanova (for the actual implementation of PCANOVA) ############################################################### # we need some big matrices, so we increase default size limits options(object.size=10000000) coltbl <- "0,0,0|255,0,255|192,0,192|0,128,0|255,0,0|0,128,255|128,64,0|255,128,0|128,0,255|0,0,255|192,192,1 92|255,209,143|220,255,200|169,226,255|255,255,195|255,195,255" use.splus.graphics <- F # set up the correct size and color maps for the figures if (use.splus.graphics) { startfig <- function(name, horiz, ...) { graphsheet(color.table=coltbl, ...) } endfig <- function() {} } else { startfig <- function(name, horiz=F, ...) { postscript(paste('//mene/bioinfo/Private/KRC-Temp/', name, '.eps', sep=''), horizontal=horiz, colors=c(0, 0.1, 0.15, 0.2, 0.1, 0.15, 0.2, 0.1, 0.15, 0.2, 1), ...) } endfig <- function() { dev.off() } } ############################## # Figure 5 # Note: this will only work if the NCI60 data has been loaded and analyzed # you should probably skip this if you got this file off the web site. startfig('fig5', horiz=T, width=9.75, height=7) par(mfrow=c(2,2)) plot(pe.out.all) endfig() ############################## # function that yields column vectors with all ones ones <- function(rows, cols=1) { matrix(1, rows, cols) } # basic simulation parameters n.gene <- 1000 n.sample <- 100 n.class <- 6 # randomly assign the classes to samples class.vec <- sample(n.class, n.sample, T) coloring <- (lapply(1:n.class, function(k,v) { color.coding(v==k, 1+k, LETTERS[k]) }, class.vec)) # class indicator matrix qq <- matrix(0, n.class, n.sample) for(i in 1:n.sample) { qq[class.vec[i], i] <- 1 } # column vector: number of samples of each class nn <- qq %*% ones(n.sample) ############################################################### # simulate a highly structured example # First, est up storage space in a matrix x.tight <- matrix(rep(0, n.gene*n.sample), n.gene, n.sample) # now generate the vectors of class means x0 <- matrix(rnorm(n.class*n.gene), n.gene, n.class) # now generate samples with normal noise around the class means eps <- 3 for (i in 1:n.sample) { x.tight[,i] <- x0[, class.vec[i]] + rnorm(n.gene, 0, eps) } # explanation of these matrix manipulations is contained in the paper xsums <- x.tight %*% t(qq) # matrix of class sums cc <- xsums %*% diag(as.vector(1/nn)) # matrix of class means xm <- x.tight %*% ones(n.sample)/n.sample # vector of overall mean zz <- x.tight - (xm %*% t(ones(n.sample))) # now we center the rows. ss <- cc %*% qq - xm %*% t(ones(n.sample)) # between-groups rr <- x.tight - (cc %*% qq) # matrix of residuals # sum of squares matrices (total, between, within) # need these to select princiapl components using the canonical variate criterion tight.tmat <- zz %*% t(zz) tight.bmat <- ss %*% t(ss) tight.wmat <- rr %*% t(rr) # use our code to perfrom principal components analysis tight.pca <- sample.pc(x.tight, usecor=F, center=T) # extract the components, then select in the correct order comp <- tight.pca$components aba <- diag(t(comp) %*% tight.bmat %*% comp) awa <- diag(t(comp) %*% tight.wmat %*% comp) ata <- diag(t(comp) %*% tight.tmat %*% comp) tight.canonical.variates <- rev(order((aba/awa)[1:n.sample-1])) # use our code for a complete PCANOVA tight.pe <- pe(x.tight, LETTERS[1:n.class], LETTERS[class.vec], class.vec+1, usecor=F, center=T) ############################################################### # alternate example with absolutely no structure # follows the same pattern as the "tight" example, # except that the data matrix is completely random x.random <- matrix(rnorm(n.gene*n.sample), n.gene, n.sample) xsums <- x.random %*% t(qq) # matrix of class sums cc <- xsums %*% diag(as.vector(1/nn)) # matrix of class means xm <- x.random %*% ones(n.sample)/n.sample # vector of overall mean zz <- x.random - (xm %*% t(ones(n.sample))) # now we center the rows. ss <- cc %*% qq - xm %*% t(ones(n.sample)) # between-groups rr <- x.random - (cc %*% qq) # matrix of residuals # sum of squares matrices (total, between, within) random.tmat <- zz %*% t(zz) random.bmat <- ss %*% t(ss) random.wmat <- rr %*% t(rr) # use our code to perfrom principal components analysis random.pca <- sample.pc(x.random, usecor=F, center=T) # extract the components, then select in the correct order comp <- random.pca$components aba <- diag(t(comp) %*% random.bmat %*% comp) awa <- diag(t(comp) %*% random.wmat %*% comp) ata <- diag(t(comp) %*% random.tmat %*% comp) random.canonical.variates <- rev(order((aba/awa)[1:(n.sample-1)])) # use our code for a complete PCANOVA random.pe <- pe(x.random, LETTERS[1:n.class], LETTERS[class.vec], 1+class.vec, usecor=F, center=T) ############################################################### # plots of the results. These are figures 1, 2, and 3 of the paper. # plotting routines are extracted from the guts of the PCANOVA routines # so we can separate pieces out for individual figures. ############################## # Figure 1 # Note that there are two identical halves, with the random simulation # on the left half and the structured simulation on the right half startfig('fig1', height=9, width=7) par(mfcol=c(3,2)) ob <- random.pe obc <- random.pca cv <- random.canonical.variates .plot.lab.col(ob$orig.pca[,1], ob$orig.pca[,2], ob$labels, ob$colors, xlab='First PC', ylab='Second PC') title('Full PCA: Random') plot(color.coded.pair(obc$scores[,cv[1]], obc$scores[,cv[2]], coloring), xlab='First Canonical PC', ylab='Second Canonical PC') title('Canonical Variates') groups <- sort(unique(ob$labels)) plot(c(ob$mixed.pca[, 1], ob$class.pca[, 1]), c(ob$mixed.pca[, 2], ob$class.pca[, 2]), type = "n", xlab = "First Component", ylab = "Second Component") .points.lab.col(ob$mixed.pca[, 1], ob$mixed.pca[, 2], ob$labels, ob$colors) .points.lab.col(ob$class.pca[, 1], ob$class.pca[, 2], labs = groups, cols = rep(1, length(groups)), cex = par("cex") * 1.5) title("Group Means PCA") ob <- tight.pe obc <- tight.pca cv <- tight.canonical.variates .plot.lab.col(ob$orig.pca[,1], ob$orig.pca[,2], ob$labels, ob$colors, xlab='First PC', ylab='Second PC') title('Full PCA: Structured') plot(color.coded.pair(obc$scores[,cv[1]], obc$scores[,cv[2]], coloring), xlab='First Canonical PC', ylab='Second Canonical PC') title('Canonical Variates') groups <- sort(unique(ob$labels)) plot(c(ob$mixed.pca[, 1], ob$class.pca[, 1]), c(ob$mixed.pca[, 2], ob$class.pca[, 2]), type = "n", xlab = "First Component", ylab = "Second Component") .points.lab.col(ob$mixed.pca[, 1], ob$mixed.pca[, 2], ob$labels, ob$colors) .points.lab.col(ob$class.pca[, 1], ob$class.pca[, 2], labs = groups, cols = rep(1, length(groups)), cex = par("cex") * 1.5) title("Group Means PCA") endfig() ############################## # Figure 2 startfig('fig2', height=4, width=9, horiz=T) par(mfrow=c(1,2)) ob <- random.pe .plot.lab.col(ob$resid.pca[, 1], ob$resid.pca[, 2], ob$labels, ob$colors, xlab = "First Component", ylab = "Second Component") title("Residual PCA: Random") ob <- tight.pe .plot.lab.col(ob$resid.pca[, 1], ob$resid.pca[, 2], ob$labels, ob$colors, xlab = "First Component", ylab = "Second Component") title("Residual PCA: Structured") endfig() ############################## # Figure 3 startfig('fig3', horiz=T, height=4, width=9) par(mfrow=c(1,2)) ob <- random.pe plot(c(0, ob$n), c(0, 2), type = "n", ylim = c(0, 1), xlab = "Number of Components", ylab = "PC Correlation") points(cumsum(ob$class2resid)/(1:ob$n), type='b', col=4, pch=15) points(cumsum(ob$class2orig)/(1:ob$n), type='b', col=8, pch=16) points(cumsum(ob$orig2resid)/(1:ob$n), type='b', col=6, pch=17) abline(h = 1/2) legend(1.83, 1.04, c('d(Class, Residual)', 'd(Total, Class)', 'd(Total, Residual)'), col=c(4, 8, 6), marks=c(15, 16, 17), cex=0.6) title("PCANOVA: Random") ob <- tight.pe plot(c(0, ob$n), c(0, 2), type = "n", ylim = c(0, 1), xlab = "Number of Components", ylab = "PC Correlation") points(cumsum(ob$class2resid)/(1:ob$n), type='b', col=4, pch=15) points(cumsum(ob$class2orig)/(1:ob$n), type='b', col=8, pch=16) points(cumsum(ob$orig2resid)/(1:ob$n), type='b', col=6, pch=17) abline(h = 1/2) legend(-0.24, 1.04, c('d(Class, Residual)', 'd(Total, Class)', 'd(Total, Residual)'), col=c(4, 8, 6), marks=c(15, 16, 17), cex=0.6) title("PCANOVA: Structured") endfig() ############################################################### # In the second part, we simulate 500 pairs of structured and # unstructured data in order to estimate the distributions of # the PC correlations n.sim <- 500 # set up matrices to store the simulation results group2full <- matrix(0, n.sim, n.class) group2resid <- matrix(0, n.sim, n.class) resid2full <- matrix(0, n.sim, n.class) R.group2full <- matrix(0, n.sim, n.class) R.group2resid <- matrix(0, n.sim, n.class) R.resid2full <- matrix(0, n.sim, n.class) # perform the simulations. This will take a while, so go get # a cup of coffee of start reading your email. I really hate # "for" loops in S-Plus. for (j in 1:n.sim) { print(j) # so we can tell the program is still alive x.tip <- matrix(rep(0, n.gene*n.sample), n.gene, n.sample) x0t <- matrix(rnorm(n.class*n.gene), n.gene, n.class) eps <- 3 for (i in 1:n.sample) { x.tip[,i] <- x0t[, class.vec[i]] + rnorm(n.gene, 0, eps) } tip.pe <- pe(x.tip, LETTERS[1:n.class], LETTERS[class.vec], class.vec, usecor=F, center=T) group2full[j,] <- tip.pe$class2orig group2resid[j,] <- tip.pe$class2resid resid2full[j,] <- tip.pe$orig2resid x.rip <- matrix(rnorm(n.gene*n.sample), n.gene, n.sample) rip.pe <- pe(x.rip, LETTERS[1:n.class], LETTERS[class.vec], class.vec, usecor=F, center=T) R.group2full[j,] <- rip.pe$class2orig R.group2resid[j,] <- rip.pe$class2resid R.resid2full[j,] <- rip.pe$orig2resid } # The PCANOVA routines plot the individual statistics (called s_i in the paper) # the default plotting routines know enough to convert these to the cumulative # statistics (called m_i in the paper). Some day that may change. For now, # however, we need to "cumulate" them by hand. dum.gf <- t(apply(group2full, 1, function(x) {cumsum(x)/(1:n.class)})) rum.gf <- t(apply(R.group2full, 1, function(x) {cumsum(x)/(1:n.class)})) dum.rf <- t(apply(resid2full, 1, function(x) {cumsum(x)/(1:n.class)})) rum.rf <- t(apply(R.resid2full, 1, function(x) {cumsum(x)/(1:n.class)})) # specialized plotting routine for the densities. bp <- function(i, first, second, nb=49, cols=c(6,8), ...) { h1 <- hist(first[,i], breaks=(0:nb)/nb, plot=F, probability=T) h2 <- hist(second[,i], breaks=(0:nb)/nb, plot=F, probability=T) x <- rbind(h1$counts, h2$counts) barplot(x, width=h1$breaks/2, histo=T, beside=T, col=cols, ...) d1 <- density(first[,i]) d2 <- density(second[,i]) lines(d1, lwd=3) lines(d2, lwd=3) invisible(i) } ############################## # Figure 4 startfig('fig4', horiz=T) opar <- par(mfrow=c(2,3)) bp(1, dum.gf, rum.gf, col=c(2, 16), nb=25, main='PCC 1, Full vs Group') text(0.2, 6, 'random', adj=0) text(0.6, 2, 'structured', adj=0) bp(2, dum.gf, rum.gf, col=c(2, 16), nb=25, main='PCC 2, Full vs Group') text(0.2, 7, 'random', adj=0) text(0.6, 2, 'structured', adj=0) bp(3, dum.gf, rum.gf, col=c(2, 16), nb=25, main='PCC 3, Full vs Group') text(0.2, 8, 'random', adj=0) text(0.6, 3, 'structured', adj=0) bp(1, dum.rf, rum.rf, col=c(2, 16), nb=25, main='PCC 1, Full vs Residual') text(0.6, 6, 'random', adj=0) text(0.13, 12, 'structured', adj=0) bp(2, dum.rf, rum.rf, col=c(2, 16), nb=25, main='PCC 2, Full vs Residual') text(0.6, 5, 'random', adj=0) text(0.13, 10, 'structured', adj=0) bp(3, dum.rf, rum.rf, col=c(2, 16), nb=25, main='PCC 3, Full vs Residual') text(0.6, 6, 'random', adj=0) text(0.13, 12, 'structured', adj=0) par(opar) endfig()