## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, error = FALSE, tidy = TRUE, eval = TRUE, dev = c("png", "pdf"), cache = TRUE ) ## ----run.examples, fig.height=2.5, fig.width=9, fig.cap ="**Figure 1: Intuition for data whitening transformation**. Given the singular value decomposition of $Y/\\sqrt{n}$ after centering columns is $UDV^T$, the steps in the transformation are: **A)** Original data, **B)** Data rotated along principal components, **C)** Data rotated and scaled, **D)** Data rotated, scaled and rotated back to original axes. Green arrows indicate principal axes and lengths indicate eigen-values.", echo=FALSE---- library(decorrelate) library(mvtnorm) library(ggplot2) library(cowplot) library(colorRamps) library(latex2exp) set.seed(2) Sigma <- matrix(c(1, .9, .9, 2), ncol = 2) Y <- rmvnorm(200, c(0, 0), Sigma) dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values lim <- range(Y) value <- (Y %*% U)[, 1] plot_data <- function(Y, rotation, rank, lim) { dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values if (rotation == 1) { X <- Y H <- with(eigen(cov(Y)), vectors %*% diag(values)) } else if (rotation == 2) { X <- Y %*% U H <- with(eigen(cov(X)), vectors %*% diag(values)) } else if (rotation == 3) { X <- Y %*% U %*% diag(1 / sqrt(lambda)) H <- matrix(c(1, 0, 0, 1), ncol = 2) } else if (rotation == 4) { X <- Y %*% U %*% diag(1 / sqrt(lambda)) %*% t(U) H <- with(eigen(cov(Y)), vectors) } df <- data.frame(X, rank = scale(rank)) H <- whitening:::makePosDiag(H) ggplot(df, aes(X1, X2, color = rank)) + geom_segment(x = 0, y = -lim[2], xend = 0, yend = lim[2], color = "black") + geom_segment(x = -lim[2], y = 0, xend = lim[2], yend = 0, color = "black") + geom_point(size = .8) + theme_void() + scale_color_gradient2(low = "blue2", mid = "grey60", high = "red2") + theme(aspect.ratio = 1, legend.position = "none", plot.title = element_text(hjust = 0.5)) + xlim(lim) + ylim(lim) + geom_segment(x = 0, y = 0, xend = H[1, 1], yend = H[2, 1], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) + geom_segment(x = 0, y = 0, xend = H[1, 2], yend = H[2, 2], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) } fig1 <- plot_data(Y, rotation = 1, rank(value), lim) + ggtitle(TeX("$Y$")) fig2 <- plot_data(Y, rotation = 2, rank(value), lim) + ggtitle(TeX("$YV$")) fig3 <- plot_data(Y, rotation = 3, rank(value), lim) + ggtitle(TeX("$YVD^{-1}$")) fig4 <- plot_data(Y, rotation = 4, rank(value), lim) + ggtitle(TeX("$YVD^{-1}V^T$")) plot_grid(fig1, fig2, fig3, fig4, ncol = 4, labels = "AUTO") ## ----example.1, fig.height=5, fig.width=5------------------------------------- library(decorrelate) library(Rfast) n <- 500 # number of samples p <- 200 # number of features # create correlation matrix Sigma <- autocorr.mat(p, .9) # draw data from correlation matrix Sigma Y <- rmvnorm(n, rep(0, p), sigma = Sigma * 5.1, seed = 1) rownames(Y) <- paste0("sample_", 1:n) colnames(Y) <- paste0("gene_", 1:p) # eclairs decomposition implements GIW-EB method: # *E*stimate *c*ovariance/correlation with *l*ow *r*ank and *s*hrinkage ecl <- eclairs(Y) # decorrelate data using eclairs decomposition Y_whitened <- decorrelate(Y, ecl) # the same whitening can be performed with one command where # the eigen-value shrinkage is performed internally Y_whitened2 <- whiten(Y) ## ----plots, fig.height=3.2, fig.width=9, cache=TRUE--------------------------- oldpar <- par(mfrow = c(1, 3)) # plot shrinkage of eigen-values plot(ecl) # correlation between variables in observed data image(cor(Y), axes = FALSE, main = "Correlation of observed data") # decorrelate data using eclairs decomposition image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data") par(oldpar) ## ----whitening---------------------------------------------------------------- # compute whitening matrix from eclairs decomposition W <- getWhiteningMatrix(ecl) # transform observed data using whitening matrix Z <- tcrossprod(Y, W) # evalute difference between whitened computed 2 ways max(abs(Z - Y_whitened)) ## ----cov.cor, eval=FALSE------------------------------------------------------ # # compute correlation matrix from eclairs # getCor(ecl) # # # compute covariance matrix from eclairs # getCov(ecl) ## ----multivariate------------------------------------------------------------- # draw from multivariate normal n <- 1000 mu <- rep(0, ncol(Y)) # using eclairs decomposition X.draw1 <- rmvnorm_eclairs(n, mu, ecl) ## ----example.2---------------------------------------------------------------- # use low rank decomposition with 50 components ecl <- eclairs(Y, k = 60) # decorrelate data using eclairs decomposition Y_whitened <- decorrelate(Y, ecl) ## ----plots2, fig.height=3.2, fig.width=9, echo=FALSE-------------------------- oldpar <- par(mfrow = c(1, 3)) # plot shrinkage of eigen-values plot(ecl) # decorrelate data using eclairs decomposition image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data") par(oldpar) ## ----kappa-------------------------------------------------------------------- kappa(ecl) ## ----examples3---------------------------------------------------------------- library(clusterGeneration) # generate covariance matrix, where the diagonals (i.e. variances) vary Sigma <- genPositiveDefMat(p, rangeVar=c(1, 1e6))$Sigma Y <- rmvnorm(n, rep(0, p), sigma = Sigma) # examine variances of the first 5 variables apply(Y, 2, var)[1:5] # transform removes covariance between columns # so variance of transformed features are *approximately* equal ecl_cov <- eclairs(Y, compute = "covariance") Z1 <- decorrelate(Y, ecl_cov) # variance are *approximately* equal apply(Z1, 2, var)[1:5] # transform removes **correlation** between columns # but variables are not scaled ecl_cor <- eclairs(Y, compute = "correlation") Z2 <- decorrelate(Y, ecl_cor) # variances are not standardized apply(Z2, 2, var)[1:5] ## ----session, echo=FALSE------------------------------------------------------ sessionInfo()