## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse  = TRUE,
  comment   = "#>",
  fig.width = 7,
  fig.height = 5,
  warning   = FALSE,
  message   = FALSE
)

## ----load-data----------------------------------------------------------------
library(cclustr)
library(mice)

if (!requireNamespace("mlbench", quietly = TRUE)) {
  stop("Package 'mlbench' is required for this vignette. ",
       "Install it with: install.packages('mlbench')")
}

data("PimaIndiansDiabetes2", package = "mlbench")

# Remove diabetes from the clustering dataset
df <- PimaIndiansDiabetes2[, -9]

cat("Dimensions:", nrow(df), "x", ncol(df), "\n")
cat("\nMissing values per variable:\n")
print(colSums(is.na(df)))
cat("\nTotal missing values:", sum(is.na(df)),
    "(", round(mean(is.na(df)) * 100, 1), "%)\n")

## ----imputation---------------------------------------------------------------

imp <- mice(df, m = 20, maxit = 20, method = "pmm", printFlag = FALSE, seed = 202604)

# Convert to cclustr standard format
mi_pima <- as_mild_list(imp)

cat("Number of imputed datasets :", length(mi_pima), "\n")
cat("Dimensions of each dataset :", nrow(mi_pima[[1]]),
    "x", ncol(mi_pima[[1]]), "\n")
cat("Any missing values remaining:", any(sapply(mi_pima, anyNA)), "\n")

## ----clustering---------------------------------------------------------------
set.seed(202604)
partitions <- cluster_imputations(
  imp_list   = mi_pima,
  method     = "kmeans",
  k          = 2:4,
  scale_data = "global"
)

cat("Available k values:", names(partitions), "\n")
cat("Number of partitions per k:", length(partitions$k2), "\n")

## ----consensus----------------------------------------------------------------
consensus_results <- consensus_clustering(
  partitions     = partitions,
  cluster_method = "ward.D2",
  consensus_method = "classic"
)

cat("Consensus solutions available:", names(consensus_results), "\n")

## ----validation---------------------------------------------------------------
val_table <- validate_clustering(
  partitions        = partitions,
  consensus_results = consensus_results
)

print(val_table)

## ----selection----------------------------------------------------------------
best <- choose_best_clustering(
  validation_table  = val_table,
  consensus_results = consensus_results,
  prefer_stability  = TRUE,
  tie_breaker       = "silhouette"
)

cat("Optimal k selected:", best$best_k, "\n")
cat("\nConsensus cluster sizes:\n")
print(table(best$best_consensus))
cat("\nWeighted rank scores:\n")
print(best$scores_table[, c("k", "pac", "silhouette_mean",
                             "ari_mean_between_imputations", "score")])

## ----validation-plot, fig.cap = "Validation metrics across k = 2, 3, 4. The optimal value per metric is highlighted in red."----
plot_validation_metrics(best)

## ----heatmap, fig.cap = "Co-assignment matrix for the optimal k. A clear block-diagonal structure indicates a stable consensus."----
plot_consensus_matrix(
  consensus_result = best,
  reorder          = TRUE,
  viridis_option   = "D",
  show_labels = FALSE
)

## ----dendrogram, fig.cap = "Consensus dendrogram for the optimal k, derived from the dissimilarity matrix 1 - C."----
plot_consensus_dendrogram(
  consensus_result = best,
  rect             = TRUE,
  labels           = FALSE
)

## ----cluster-summary----------------------------------------------------------
pima_clustered          <- mi_pima[[1]]
pima_clustered$cluster  <- factor(best$best_consensus)

# Mean values per cluster
num_vars <- c("pregnant", "glucose", "pressure",
              "triceps", "insulin", "mass", "pedigree", "age")

cat("Mean clinical values by cluster:\n")
print(
  aggregate(
    pima_clustered[, num_vars],
    by  = list(Cluster = pima_clustered$cluster),
    FUN = function(x) round(mean(x, na.rm = TRUE), 2)
  )
)

## ----boxplots, fig.cap = "Distribution of glucose and BMI by consensus cluster."----
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))

boxplot(
  glucose ~ cluster,
  data = pima_clustered,
  main = "Plasma Glucose by Cluster",
  xlab = "Cluster",
  ylab = "Glucose (mg/dl)",
  col  = c("#4E79A7", "#F28E2B", "#E15759")
)

boxplot(
  mass ~ cluster,
  data = pima_clustered,
  main = "Body Mass Index by Cluster",
  xlab = "Cluster",
  ylab = "BMI (kg/m²)",
  col  = c("#4E79A7", "#F28E2B", "#E15759")
)

boxplot(
  insulin ~ cluster,
  data = pima_clustered,
  main = "Insulin",
  xlab = "Cluster",
  ylab = "Insulin (mu U/ml)",
  col  = c("#4E79A7", "#F28E2B", "#E15759")
)
par(mfrow = c(1, 1))

## ----full-pipeline------------------------------------------------------------
set.seed(202604)

result <- run_mi_clustering(
  data             = imp,
  method           = "kmeans",
  k                = 2:4,
  scale_data       = "global",
  consensus_method = "classic",
  prefer_stability = TRUE,
  tie_breaker      = "silhouette"
)

cat("Best k selected:", result$best_k, "\n")
cat("\nCluster sizes:\n")
print(table(result$best_consensus))

## ----weighted-ari, eval = requireNamespace("mclust", quietly = TRUE)----------
set.seed(202604)

result_w <- run_mi_clustering(
  data             = imp,
  method           = "kmeans",
  k                = 2:4,
  scale_data       = "global",
  consensus_method = "weighted_ari",
  prefer_stability = TRUE,
  tie_breaker      = "silhouette"
)

cat("Best k (ARI-weighted consensus):", result_w$best_k, "\n")
print(table(result_w$best_consensus))

