---
title: 'Classifying in latent space: k-NN & Random-Forest wrappers'
output:
  rmarkdown::html_vignette:
    css: albers.css
vignette: >
  %\VignetteIndexEntry{Classifying in latent space: k-NN & Random-Forest wrappers}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
params:
  family: red
resource_files:
  - albers.css
  - albers.js
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 4
)
library(multivarious)
library(dplyr)
library(tibble)
library(ggplot2)  # needed for plots
library(knitr)    # needed for kable
```
```{css, echo=FALSE}
h1 {
  font-size: 1.8em;   /* adjust as needed */
  margin-top: 1.5em;
  margin-bottom: 0.8em;
  border-bottom: 1px solid #dee2e6; /* optional: subtle separator */
  padding-bottom: 0.3em;
}
h2 {
  font-size: 1.4em;   /* adjust as needed */
  margin-top: 1.2em;
  margin-bottom: 0.6em;
}
/* Add rules for h3, h4, etc. if they are used and need adjustment */
```
# 1. Why classify after projection?
Once a dimensionality-reduction model (PCA, PLS, CCA, …) is fitted,
every new sample can be projected into the low-dimensional latent
space. Running a classifier there, rather than on thousands of
noisy raw variables, yields

* fewer parameters and smaller models,
* robustness to collinearity,
* freedom to use partial data (ROI, missing sensors),
* a clean separation between unsupervised decomposition and
  supervised prediction.
The `classifier()` S3 family supplied by `multivarious` provides that
glue: hand it any projector (or `multiblock_biprojector`,
`discriminant_projector`, …) together with class labels, and it returns a
ready-to-use predictor object.
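As a teaser, the same glue works for any projector, supervised or not. A
minimal sketch (not evaluated here; it assumes `multivarious::pca()` with
the usual `pca(X, ncomp = )` signature, alongside the `classifier()` generic
demonstrated below):

```{r classifier_sketch, eval = FALSE}
# Wrap an unsupervised PCA with a k-NN classifier: same pattern, different
# projector. (Sketch only; the LDA walk-through below is the full demo.)
X    <- as.matrix(iris[, 1:4])
y    <- iris$Species
pfit <- multivarious::pca(X, ncomp = 3)   # unsupervised decomposition
clf  <- multivarious::classifier(pfit, labels = y, new_data = X, knn = 5)
head(predict(clf, new_data = X)$class)    # resubstitution predictions
```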
# 2. Iris demo – LDA → `discriminant_projector` → k-NN
```{r data_classifier}
data(iris)
X   <- as.matrix(iris[, 1:4])
grp <- iris$Species

# Fit classical linear discriminant analysis and wrap it
if (!requireNamespace("MASS", quietly = TRUE)) {
  stop("MASS package required for LDA example")
}

# 1. Define and fit the pre-processing step (centering) on the training data
preproc_fitted <- fit(center(), X)

# 2. Transform the data with the fitted pre-processor
Xp <- transform(preproc_fitted, X)

# Wrap the LDA fit as a discriminant_projector
# (assumes multivarious's discriminant_projector(), center(), fit(),
#  transform(), and scores() are available)
lda_fit <- MASS::lda(X, grouping = grp)

disc_proj <- multivarious::discriminant_projector(
  v       = lda_fit$scaling,        # loadings (p × d)
  s       = Xp %*% lda_fit$scaling, # scores (n × d), computed from centred data
  sdev    = lda_fit$svd,            # singular values
  labels  = grp,
  preproc = preproc_fitted          # pass the fitted pre-processor
)
print(disc_proj)
```
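As a quick sanity check, the hand-built scores should match MASS's own
discriminant scores. One caveat: `predict.lda()` centres on the average of
the class means, which coincides with the grand mean used by `center()` only
because the iris classes are perfectly balanced.

```{r check_lda_scores}
# Our scores (centred data %*% scaling) vs. MASS's predict()$x.
all.equal(unname(scores(disc_proj)), unname(predict(lda_fit)$x),
          check.attributes = FALSE)
```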
## 2.1 Visualise the latent space
```{r plot_latent_space}
scores_df <- as_tibble(scores(disc_proj)[, 1:2],
                       .name_repair = ~ c("LD1", "LD2")) |>
  mutate(Species = iris$Species)

ggplot(scores_df, aes(LD1, LD2, colour = Species)) +
  geom_point(size = 2, alpha = .7) +
  stat_ellipse(level = .9, linewidth = .3) +
  theme_minimal() +
  ggtitle("Iris – first two LDA components")
```
## 2.2 Build a k-NN classifier on the latent scores
```{r build_knn_classifier}
set.seed(42)
train_id <- sample(seq_len(nrow(X)), size = 0.7 * nrow(X))
test_id  <- setdiff(seq_len(nrow(X)), train_id)

# Build a k-NN classifier in the latent space
# (assumes the classifier() generic is available)
clf_knn <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, ],  # training data provides the reference scores
  knn      = 3
)
print(clf_knn)
```
## 2.3 Predict and evaluate
```{r predict_knn}
pred_knn <- predict(clf_knn, new_data = X[test_id, ],
                    metric = "cosine", prob_type = "knn_proportion")
head(pred_knn$prob, 3)
print(paste("Overall Accuracy:", mean(pred_knn$class == grp[test_id])))

# Ranking-based diagnostics (assumes rank_score() and topk() are available)
rk  <- rank_score(pred_knn$prob, grp[test_id])
tk2 <- topk(pred_knn$prob, grp[test_id], k = 2)
tibble(
  prank_mean = mean(rk$prank),
  top2_acc   = mean(tk2$topk)
)
```
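If you prefer not to rely on the helper, top-k accuracy is easy to
recompute from the probability matrix itself. A minimal base-R sketch,
assuming `pred_knn$prob` has one named column per class:

```{r top2_by_hand}
# Hand-rolled top-2 accuracy: is the true class among the two classes with
# the highest predicted probability for each test sample?
top2_hit <- vapply(seq_len(nrow(pred_knn$prob)), function(i) {
  top2 <- names(sort(pred_knn$prob[i, ], decreasing = TRUE))[1:2]
  as.character(grp[test_id][i]) %in% top2
}, logical(1))
mean(top2_hit)  # should agree with top2_acc above
```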
## 2.4 Confusion-matrix on the test set
```{r plot_confusion_matrix}
cm <- table(
  Truth     = grp[test_id],
  Predicted = pred_knn$class
)

# Heat-map
cm_df <- as.data.frame(cm)
ggplot(cm_df, aes(Truth, Predicted, fill = Freq)) +
  geom_tile(colour = "grey80") +
  geom_text(aes(label = Freq), colour = "white", size = 4) +
  scale_fill_gradient(low = "#4575b4", high = "#d73027", name = "Count",
                      limits = c(0, max(cm_df$Freq))) +  # data-driven upper limit
  scale_y_discrete(limits = rev(levels(cm_df$Predicted))) +
  theme_minimal(base_size = 12) +
  coord_equal() +
  ggtitle("k-NN (k = 3) confusion matrix – test set") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Pretty table as well
knitr::kable(cm, caption = "Confusion matrix (counts)")
```
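Per-class recall falls straight out of the confusion matrix (assuming the
predictions carry the same factor levels as the truth, so `cm` is square):

```{r per_class_recall}
# Diagonal = correct predictions; row sums = true class counts in the test set.
round(diag(cm) / rowSums(cm), 3)
```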
# 3. Random-Forest on the same latent space
```{r build_rf_classifier}
# Check if randomForest is installed
if (requireNamespace("randomForest", quietly = TRUE)) {
  # Assuming an rf_classifier() method for projectors is available
  rf_clf <- rf_classifier(  # using the generic here
    x      = disc_proj,
    labels = grp[train_id],
    # Pass scores directly if the method requires it, or let it call scores(x)
    scores = scores(disc_proj)[train_id, ]
  )
  pred_rf <- predict(rf_clf, new_data = X[test_id, ])
  print(paste("RF Accuracy:", mean(pred_rf$class == grp[test_id])))
} else {
  cat("randomForest package not installed. Skipping RF example.\n")
}
```
With three classes, iris yields only two discriminant components, so the
RF sees exactly two input variables (the LDA components). That keeps the
trees shallow and speeds up training.
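Nothing stops you from driving `randomForest` by hand on the latent scores.
A minimal sketch, assuming multivarious's `project()` generic maps new raw
data into the latent space:

```{r rf_by_hand, eval = requireNamespace("randomForest", quietly = TRUE)}
# Equivalent by-hand version: fit on the training scores, project the test
# rows into the latent space, then predict.
rf_raw <- randomForest::randomForest(
  x = scores(disc_proj)[train_id, ],
  y = grp[train_id]
)
test_scores <- project(disc_proj, X[test_id, ])  # raw data -> latent space
mean(predict(rf_raw, newdata = test_scores) == grp[test_id])
```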
# 4. Partial-feature prediction: sepal block only
Suppose that in deployment we measure only the sepal variables
(columns 1–2). A partial projection lets the classifier work with just
that block:
```{r predict_partial}
sepal_cols <- 1:2

# Create a classifier whose reference scores come from the sepal columns only
clf_knn_sepal <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, sepal_cols],  # training data, sepal block only
  colind   = sepal_cols,               # indicate which columns were used
  knn      = 3
)

# Predict using the dedicated sepal classifier
pred_sepal <- predict(
  clf_knn_sepal,                   # the sepal-specific classifier
  new_data = X[test_id, sepal_cols]
  # no colind needed here: clf_knn_sepal already expects sepal data
)
print(paste("Accuracy (Sepal only):", mean(pred_sepal$class == grp[test_id])))
```
Accuracy drops a bit – as expected when using fewer features.
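Under the hood, the `colind` argument corresponds to a partial projection.
A minimal sketch of the latent scores the sepal-only classifier actually
compares, assuming multivarious's `partial_project()` generic:

```{r partial_by_hand}
# Latent scores reconstructed from columns 1-2 alone.
sc_sepal <- partial_project(disc_proj, X[test_id, sepal_cols],
                            colind = sepal_cols)
head(sc_sepal, 3)
```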
# 5. Which component block matters most?
`feature_importance()` can rank variable groups via a simple
"leave-block-out" score drop.
```{r calc_feature_importance}
blocks <- list(
  Sepal = 1:2,
  Petal = 3:4
)

# Assuming feature_importance() is available
fi <- feature_importance(
  clf_knn,
  new_data      = X[test_id, ],
  true_labels   = grp[test_id],      # the correct test-set labels
  blocks        = blocks,
  fun           = rank_score,        # rank_score as the performance metric
  fun_direction = "lower_is_better",
  approach      = "marginal"         # marginal drop when a block is removed
)
print(fi)
```
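The same idea by hand: score the full model, then re-score classifiers
built on each remaining block, and inspect the gap. A sketch reusing the
section-4 `classifier()` pattern; it assumes `rank_score()` returns a
`prank` column as above:

```{r importance_by_hand}
# Leave-block-out cross-check: mean percentile rank with only one block.
score_with <- function(cols) {
  clf <- multivarious::classifier(disc_proj, labels = grp[train_id],
                                  new_data = X[train_id, cols],
                                  colind = cols, knn = 3)
  pr <- predict(clf, new_data = X[test_id, cols])
  mean(rank_score(pr$prob, grp[test_id])$prank)
}
c(full       = mean(rank_score(pred_knn$prob, grp[test_id])$prank),
  drop_Petal = score_with(blocks$Sepal),  # Petal block left out
  drop_Sepal = score_with(blocks$Petal))  # Sepal block left out
```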