## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  tidy = TRUE,
  eval = TRUE,
  dev = c("png", "pdf"),
  cache = TRUE
)

## ----run.examples, fig.height=2.5, fig.width=9, fig.cap ="**Figure 1: Intuition for data whitening transformation**. Given the singular value decomposition of $Y/\\sqrt{n}$ after centering columns is $UDV^T$, the steps in the transformation are: **A)** Original data, **B)** Data rotated along principal components, **C)** Data rotated and scaled, **D)** Data rotated, scaled and rotated back to original axes. Green arrows indicate principal axes and lengths indicate eigen-values.", echo=FALSE----
library(decorrelate)
library(mvtnorm)
library(ggplot2)
library(cowplot)
library(colorRamps)
library(latex2exp)
set.seed(2)

Sigma <- matrix(c(1, .9, .9, 2), ncol = 2)

Y <- rmvnorm(200, c(0, 0), Sigma)

dcmp <- eigen(cov(Y))
U <- whitening:::makePosDiag(dcmp$vectors)
lambda <- dcmp$values

lim <- range(Y)

value <- (Y %*% U)[, 1]

plot_data <- function(Y, rotation, rank, lim) {
  dcmp <- eigen(cov(Y))
  U <- whitening:::makePosDiag(dcmp$vectors)
  lambda <- dcmp$values

  if (rotation == 1) {
    X <- Y

    H <- with(eigen(cov(Y)), vectors %*% diag(values))
  } else if (rotation == 2) {
    X <- Y %*% U

    H <- with(eigen(cov(X)), vectors %*% diag(values))
  } else if (rotation == 3) {
    X <- Y %*% U %*% diag(1 / sqrt(lambda))

    H <- matrix(c(1, 0, 0, 1), ncol = 2)
  } else if (rotation == 4) {
    X <- Y %*% U %*% diag(1 / sqrt(lambda)) %*% t(U)

    H <- with(eigen(cov(Y)), vectors)
  }

  df <- data.frame(X, rank = scale(rank))

  H <- whitening:::makePosDiag(H)

  ggplot(df, aes(X1, X2, color = rank)) +
    geom_segment(x = 0, y = -lim[2], xend = 0, yend = lim[2], color = "black") +
    geom_segment(x = -lim[2], y = 0, xend = lim[2], yend = 0, color = "black") +
    geom_point(size = .8) +
    theme_void() +
    scale_color_gradient2(low = "blue2", mid = "grey60", high = "red2") +
    theme(aspect.ratio = 1, legend.position = "none", plot.title = element_text(hjust = 0.5)) +
    xlim(lim) +
    ylim(lim) +
    geom_segment(x = 0, y = 0, xend = H[1, 1], yend = H[2, 1], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) +
    geom_segment(x = 0, y = 0, xend = H[1, 2], yend = H[2, 2], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1)
}


fig1 <- plot_data(Y, rotation = 1, rank(value), lim) + ggtitle(TeX("$Y$"))
fig2 <- plot_data(Y, rotation = 2, rank(value), lim) + ggtitle(TeX("$YV$"))
fig3 <- plot_data(Y, rotation = 3, rank(value), lim) + ggtitle(TeX("$YVD^{-1}$"))
fig4 <- plot_data(Y, rotation = 4, rank(value), lim) + ggtitle(TeX("$YVD^{-1}V^T$"))
plot_grid(fig1, fig2, fig3, fig4, ncol = 4, labels = "AUTO")

## ----example.1, fig.height=5, fig.width=5-------------------------------------
library(decorrelate)
library(Rfast)

n <- 500 # number of samples
p <- 200 # number of features

# create correlation matrix
Sigma <- autocorr.mat(p, .9)

# draw data from correlation matrix Sigma
Y <- rmvnorm(n, rep(0, p), sigma = Sigma * 5.1, seed = 1)
rownames(Y) <- paste0("sample_", 1:n)
colnames(Y) <- paste0("gene_", 1:p)

# eclairs decomposition implements GIW-EB method:
# *E*stimate *c*ovariance/correlation with *l*ow *r*ank and *s*hrinkage
ecl <- eclairs(Y)

# decorrelate data using eclairs decomposition
Y_whitened <- decorrelate(Y, ecl)

# the same whitening can be performed with one command where
# the eigen-value shrinkage is performed internally
Y_whitened2 <- whiten(Y)

## ----plots, fig.height=3.2, fig.width=9, cache=TRUE---------------------------
oldpar <- par(mfrow = c(1, 3))

# plot shrinkage of eigen-values
plot(ecl)

# correlation between variables in observed data
image(cor(Y), axes = FALSE, main = "Correlation of observed data")

# decorrelate data using eclairs decomposition
image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data")
par(oldpar)

## ----whitening----------------------------------------------------------------
# compute whitening matrix from eclairs decomposition
W <- getWhiteningMatrix(ecl)

# transform observed data using whitening matrix
Z <- tcrossprod(Y, W)

# evalute difference between whitened computed 2 ways
max(abs(Z - Y_whitened))

## ----cov.cor, eval=FALSE------------------------------------------------------
# # compute correlation matrix from eclairs
# getCor(ecl)
# 
# # compute covariance matrix from eclairs
# getCov(ecl)

## ----multivariate-------------------------------------------------------------
# draw from multivariate normal
n <- 1000
mu <- rep(0, ncol(Y))

# using eclairs decomposition
X.draw1 <- rmvnorm_eclairs(n, mu, ecl)

## ----example.2----------------------------------------------------------------
# use low rank decomposition with 50 components
ecl <- eclairs(Y, k = 60)

# decorrelate data using eclairs decomposition
Y_whitened <- decorrelate(Y, ecl)

## ----plots2, fig.height=3.2, fig.width=9, echo=FALSE--------------------------
oldpar <- par(mfrow = c(1, 3))

# plot shrinkage of eigen-values
plot(ecl)

# decorrelate data using eclairs decomposition
image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data")
par(oldpar)

## ----kappa--------------------------------------------------------------------
kappa(ecl)

## ----examples3----------------------------------------------------------------
library(clusterGeneration)

# generate covariance matrix, where the diagonals (i.e. variances) vary
Sigma <- genPositiveDefMat(p, rangeVar=c(1, 1e6))$Sigma

Y <- rmvnorm(n, rep(0, p), sigma = Sigma)

# examine variances of the first 5 variables
apply(Y, 2, var)[1:5]

# transform removes covariance between columns
# so variance of transformed features are *approximately* equal
ecl_cov <- eclairs(Y, compute = "covariance")
Z1 <- decorrelate(Y, ecl_cov)

# variance are *approximately* equal
apply(Z1, 2, var)[1:5]

# transform removes **correlation** between columns
# but variables are not scaled
ecl_cor <- eclairs(Y, compute = "correlation")
Z2 <- decorrelate(Y, ecl_cor)

# variances are not standardized
apply(Z2, 2, var)[1:5]

## ----session, echo=FALSE------------------------------------------------------
sessionInfo()