---
title: 'Classifying in latent space: k-NN & Random-Forest wrappers'
output:
  rmarkdown::html_vignette:
    css: albers.css
vignette: >
  %\VignetteIndexEntry{Classifying in latent space: k-NN & Random-Forest wrappers}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
params:
  family: red
resource_files:
  - albers.css
  - albers.js
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 4
)
library(multivarious)
library(dplyr)
library(tibble)
library(ggplot2)  # needed for plots
library(knitr)    # needed for kable
```
```{css, echo=FALSE}
h1 {
  font-size: 1.8em;   /* adjust as needed */
  margin-top: 1.5em;
  margin-bottom: 0.8em;
  border-bottom: 1px solid #dee2e6; /* optional: subtle separator */
  padding-bottom: 0.3em;
}
h2 {
  font-size: 1.4em;   /* adjust as needed */
  margin-top: 1.2em;
  margin-bottom: 0.6em;
}
/* Add rules for h3, h4, etc. if they are used and need adjustment */
```
# 1. Why classify after projection?
Once a dimensionality-reduction model (PCA, PLS, CCA, …) is fitted,
every new sample can be projected into the low-dimensional latent
space. Running a classifier there, rather than on thousands of
noisy raw variables, yields

* fewer parameters and smaller models,
* robustness to collinearity,
* freedom to use partial data (ROI, missing sensors),
* a clean separation between unsupervised decomposition and
  supervised prediction.
The `classifier()` S3 family supplied by `multivarious` provides that
glue: hand it any projector (or `multiblock_biprojector`,
`discriminant_projector`, …) together with class labels, and it returns a
ready-to-use predictor object.
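As a teaser, the same glue works for any projector, supervised or not. A
minimal sketch (not evaluated here; it assumes `multivarious::pca()` with
the usual `pca(X, ncomp = )` signature, alongside the `classifier()` generic
demonstrated below):

```{r classifier_sketch, eval = FALSE}
# Wrap an unsupervised PCA with a k-NN classifier: same pattern, different
# projector. (Sketch only; the LDA walk-through below is the full demo.)
X    <- as.matrix(iris[, 1:4])
y    <- iris$Species
pfit <- multivarious::pca(X, ncomp = 3)   # unsupervised decomposition
clf  <- multivarious::classifier(pfit, labels = y, new_data = X, knn = 5)
head(predict(clf, new_data = X)$class)    # resubstitution predictions
```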
# 2. Iris demo – LDA → `discriminant_projector` → k-NN
```{r data_classifier}
data(iris)
X   <- as.matrix(iris[, 1:4])
grp <- iris$Species

# Fit classical linear discriminant analysis and wrap it
if (!requireNamespace("MASS", quietly = TRUE)) {
  stop("MASS package required for LDA example")
}

# 1. Define and fit the pre-processing step (centering) on the training data
preproc_fitted <- fit(center(), X)

# 2. Transform the data with the fitted pre-processor
Xp <- transform(preproc_fitted, X)

# Wrap the LDA fit as a discriminant_projector
# (assumes multivarious's discriminant_projector(), center(), fit(),
#  transform(), and scores() are available)
lda_fit <- MASS::lda(X, grouping = grp)

disc_proj <- multivarious::discriminant_projector(
  v       = lda_fit$scaling,        # loadings (p × d)
  s       = Xp %*% lda_fit$scaling, # scores (n × d), computed from centred data
  sdev    = lda_fit$svd,            # singular values
  labels  = grp,
  preproc = preproc_fitted          # pass the fitted pre-processor
)
print(disc_proj)
```
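As a quick sanity check, the hand-built scores should match MASS's own
discriminant scores. One caveat: `predict.lda()` centres on the average of
the class means, which coincides with the grand mean used by `center()` only
because the iris classes are perfectly balanced.

```{r check_lda_scores}
# Our scores (centred data %*% scaling) vs. MASS's predict()$x.
all.equal(unname(scores(disc_proj)), unname(predict(lda_fit)$x),
          check.attributes = FALSE)
```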
## 2.1 Visualise the latent space
```{r plot_latent_space}
scores_df <- as_tibble(scores(disc_proj)[, 1:2],
                       .name_repair = ~ c("LD1", "LD2")) |>
  mutate(Species = iris$Species)

ggplot(scores_df, aes(LD1, LD2, colour = Species)) +
  geom_point(size = 2, alpha = .7) +
  stat_ellipse(level = .9, linewidth = .3) +
  theme_minimal() +
  ggtitle("Iris – first two LDA components")
```
## 2.2 Build a k-NN classifier on the latent scores
```{r build_knn_classifier}
set.seed(42)
train_id <- sample(seq_len(nrow(X)), size = 0.7 * nrow(X))
test_id  <- setdiff(seq_len(nrow(X)), train_id)

# Build a k-NN classifier in the latent space
# (assumes the classifier() generic is available)
clf_knn <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, ],  # training data provides the reference scores
  knn      = 3
)
print(clf_knn)
```
## 2.3 Predict and evaluate
```{r predict_knn}
pred_knn <- predict(clf_knn, new_data = X[test_id, ],
                    metric = "cosine", prob_type = "knn_proportion")
head(pred_knn$prob, 3)
print(paste("Overall Accuracy:", mean(pred_knn$class == grp[test_id])))

# Ranking-based diagnostics (assumes rank_score() and topk() are available)
rk  <- rank_score(pred_knn$prob, grp[test_id])
tk2 <- topk(pred_knn$prob, grp[test_id], k = 2)
tibble(
  prank_mean = mean(rk$prank),
  top2_acc   = mean(tk2$topk)
)
```
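If you prefer not to rely on the helper, top-k accuracy is easy to
recompute from the probability matrix itself. A minimal base-R sketch,
assuming `pred_knn$prob` has one named column per class:

```{r top2_by_hand}
# Hand-rolled top-2 accuracy: is the true class among the two classes with
# the highest predicted probability for each test sample?
top2_hit <- vapply(seq_len(nrow(pred_knn$prob)), function(i) {
  top2 <- names(sort(pred_knn$prob[i, ], decreasing = TRUE))[1:2]
  as.character(grp[test_id][i]) %in% top2
}, logical(1))
mean(top2_hit)  # should agree with top2_acc above
```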
## 2.4 Confusion-matrix on the test set
```{r plot_confusion_matrix}
cm <- table(
  Truth     = grp[test_id],
  Predicted = pred_knn$class
)

# Heat-map
cm_df <- as.data.frame(cm)
ggplot(cm_df, aes(Truth, Predicted, fill = Freq)) +
  geom_tile(colour = "grey80") +
  geom_text(aes(label = Freq), colour = "white", size = 4) +
  scale_fill_gradient(low = "#4575b4", high = "#d73027", name = "Count",
                      limits = c(0, max(cm_df$Freq))) +  # data-driven upper limit
  scale_y_discrete(limits = rev(levels(cm_df$Predicted))) +
  theme_minimal(base_size = 12) +
  coord_equal() +
  ggtitle("k-NN (k = 3) confusion matrix – test set") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Pretty table as well
knitr::kable(cm, caption = "Confusion matrix (counts)")
```
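Per-class recall falls straight out of the confusion matrix (assuming the
predictions carry the same factor levels as the truth, so `cm` is square):

```{r per_class_recall}
# Diagonal = correct predictions; row sums = true class counts in the test set.
round(diag(cm) / rowSums(cm), 3)
```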
# 3. Random-Forest on the same latent space
```{r build_rf_classifier}
# Check if randomForest is installed
if (requireNamespace("randomForest", quietly = TRUE)) {
  # Assuming an rf_classifier() method for projectors is available
  rf_clf <- rf_classifier(  # using the generic here
    x      = disc_proj,
    labels = grp[train_id],
    # Pass scores directly if the method requires it, or let it call scores(x)
    scores = scores(disc_proj)[train_id, ]
  )
  pred_rf <- predict(rf_clf, new_data = X[test_id, ])
  print(paste("RF Accuracy:", mean(pred_rf$class == grp[test_id])))
} else {
  cat("randomForest package not installed. Skipping RF example.\n")
}
```
With three classes, iris yields only two discriminant components, so the
RF sees exactly two input variables (the LDA components). That keeps the
trees shallow and speeds up training.
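Nothing stops you from driving `randomForest` by hand on the latent scores.
A minimal sketch, assuming multivarious's `project()` generic maps new raw
data into the latent space:

```{r rf_by_hand, eval = requireNamespace("randomForest", quietly = TRUE)}
# Equivalent by-hand version: fit on the training scores, project the test
# rows into the latent space, then predict.
rf_raw <- randomForest::randomForest(
  x = scores(disc_proj)[train_id, ],
  y = grp[train_id]
)
test_scores <- project(disc_proj, X[test_id, ])  # raw data -> latent space
mean(predict(rf_raw, newdata = test_scores) == grp[test_id])
```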
# 4. Partial-feature prediction: sepal block only
Suppose that in deployment we measure only the sepal variables
(columns 1–2). A partial projection lets the classifier work with just
that block:
```{r predict_partial}
sepal_cols <- 1:2

# Create a classifier whose reference scores come from the sepal columns only
clf_knn_sepal <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, sepal_cols],  # training data, sepal block only
  colind   = sepal_cols,               # indicate which columns were used
  knn      = 3
)

# Predict using the dedicated sepal classifier
pred_sepal <- predict(
  clf_knn_sepal,                   # the sepal-specific classifier
  new_data = X[test_id, sepal_cols]
  # no colind needed here: clf_knn_sepal already expects sepal data
)
print(paste("Accuracy (Sepal only):", mean(pred_sepal$class == grp[test_id])))
```
Accuracy drops a bit – as expected when using fewer features.
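Under the hood, the `colind` argument corresponds to a partial projection.
A minimal sketch of the latent scores the sepal-only classifier actually
compares, assuming multivarious's `partial_project()` generic:

```{r partial_by_hand}
# Latent scores reconstructed from columns 1-2 alone.
sc_sepal <- partial_project(disc_proj, X[test_id, sepal_cols],
                            colind = sepal_cols)
head(sc_sepal, 3)
```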
# 5. Which component block matters most?
`feature_importance()` can rank variable groups via a simple
"leave-block-out" score drop.
```{r calc_feature_importance}
blocks <- list(
  Sepal = 1:2,
  Petal = 3:4
)

# Assuming feature_importance() is available
fi <- feature_importance(
  clf_knn,
  new_data      = X[test_id, ],
  true_labels   = grp[test_id],      # the correct test-set labels
  blocks        = blocks,
  fun           = rank_score,        # rank_score as the performance metric
  fun_direction = "lower_is_better",
  approach      = "marginal"         # marginal drop when a block is removed
)
print(fi)
```
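The same idea by hand: score the full model, then re-score classifiers
built on each remaining block, and inspect the gap. A sketch reusing the
section-4 `classifier()` pattern; it assumes `rank_score()` returns a
`prank` column as above:

```{r importance_by_hand}
# Leave-block-out cross-check: mean percentile rank with only one block.
score_with <- function(cols) {
  clf <- multivarious::classifier(disc_proj, labels = grp[train_id],
                                  new_data = X[train_id, cols],
                                  colind = cols, knn = 3)
  pr <- predict(clf, new_data = X[test_id, cols])
  mean(rank_score(pr$prob, grp[test_id])$prank)
}
c(full       = mean(rank_score(pred_knn$prob, grp[test_id])$prank),
  drop_Petal = score_with(blocks$Sepal),  # Petal block left out
  drop_Sepal = score_with(blocks$Petal))  # Sepal block left out
```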