MDS + Procrustes Sensitivity

library(DataFusionGDM)
library(ggplot2)
library(vegan)

# Generate a synthetic full matrix in memory (no file I/O)
full_matrix <- simulate_genetic_distances(n_pops = 40, verbose = FALSE, seed = 42)$distance_matrix

k_values <- c(4, 8, 12)
num_tests <- 3
seed_base <- 42

prepare_matrices_from_matrix <- function(full_matrix, k, bias_mu = 0.1, noise_sd = 0.05, randomize_shared = TRUE, seed = 42) {
  all_pop_names <- rownames(full_matrix)
  np <- length(all_pop_names)
  stopifnot(k >= 1, k <= np)
  if (randomize_shared) {
    ordering <- order(sin(seq_len(np) + seed), decreasing = TRUE)
    shared_pop_names <- all_pop_names[ordering[seq_len(k)]]
  } else {
    shared_pop_names <- all_pop_names[seq_len(k)]
  }
  unique_pop_names <- setdiff(all_pop_names, shared_pop_names)
  A <- full_matrix; B <- full_matrix
  base_index <- seq_len(length(A)) + seed
  A <- abs(A + matrix(sin(base_index) * noise_sd, nrow(A)))
  B <- abs(B + matrix(cos(base_index) * noise_sd + bias_mu, nrow(B)))
  diag(A) <- 0; diag(B) <- 0
  label_A <- label_B <- rep("", np); names(label_A) <- names(label_B) <- all_pop_names
  label_A[shared_pop_names] <- paste0("S_", shared_pop_names)
  label_B[shared_pop_names] <- paste0("S_", shared_pop_names)
  label_A[unique_pop_names] <- paste0("A_", unique_pop_names)
  label_B[unique_pop_names] <- paste0("B_", unique_pop_names)
  rownames(A) <- label_A[rownames(A)]; colnames(A) <- label_A[colnames(A)]
  rownames(B) <- label_B[rownames(B)]; colnames(B) <- label_B[colnames(B)]
  list(A = A, B = B, np = np, k = k)
}

results <- data.frame()
for (k in k_values) {
  for (test_id in seq_len(num_tests)) {
    test_seed <- seed_base + test_id * 100 + k
    matrices <- prepare_matrices_from_matrix(full_matrix, k = k, seed = test_seed)
    A <- matrices$A; B <- matrices$B
    mds <- perform_mds(A, B)
    X <- mds$X; Y <- mds$Y; d_opt <- mds$d_opt
    pop_common <- intersect(rownames(A), rownames(B))
    X_sub <- X[pop_common, 1:d_opt]
    Y_sub <- Y[pop_common, 1:d_opt]
    Yt <- apply_procrustes(X_sub, Y_sub, Y)
    B_cal <- coords_to_distances(Yt)
    the_prior <- mean((A - B)^2)
    the_post  <- mean((A - B_cal)^2)
    improvement <- (the_prior - the_post) / the_prior * 100
    results <- rbind(results, data.frame(k = k, test_id = test_id, the_prior = the_prior,
                                         the_post = the_post, improvement = improvement))
  }
}

agg <- aggregate(cbind(the_prior, the_post, improvement) ~ k, data = results,
                 FUN = function(x) c(mean = mean(x), sd = sd(x)))
agg <- do.call(data.frame, agg)

p <- ggplot(agg, aes(x = k)) +
  geom_line(aes(y = improvement.mean), color = "blue") +
  geom_point(aes(y = improvement.mean), color = "blue") +
  geom_ribbon(aes(ymin = improvement.mean - improvement.sd,
                  ymax = improvement.mean + improvement.sd), fill = "blue", alpha = 0.2) +
  labs(title = "Calibration improvement vs shared k",
       x = "Shared populations (k)", y = "% Improvement") +
  theme_minimal()
print(p)