library(DataFusionGDM)
library(ggplot2)
library(vegan)
# Generate a synthetic full matrix in memory (no file I/O)
full_matrix <- simulate_genetic_distances(n_pops = 40, verbose = FALSE, seed = 42)$distance_matrix
k_values <- c(4, 8, 12)
num_tests <- 3
seed_base <- 42
prepare_matrices_from_matrix <- function(full_matrix, k, bias_mu = 0.1, noise_sd = 0.05, randomize_shared = TRUE, seed = 42) {
all_pop_names <- rownames(full_matrix)
np <- length(all_pop_names)
stopifnot(k >= 1, k <= np)
if (randomize_shared) {
ordering <- order(sin(seq_len(np) + seed), decreasing = TRUE)
shared_pop_names <- all_pop_names[ordering[seq_len(k)]]
} else {
shared_pop_names <- all_pop_names[seq_len(k)]
}
unique_pop_names <- setdiff(all_pop_names, shared_pop_names)
A <- full_matrix; B <- full_matrix
base_index <- seq_len(length(A)) + seed
A <- abs(A + matrix(sin(base_index) * noise_sd, nrow(A)))
B <- abs(B + matrix(cos(base_index) * noise_sd + bias_mu, nrow(B)))
diag(A) <- 0; diag(B) <- 0
label_A <- label_B <- rep("", np); names(label_A) <- names(label_B) <- all_pop_names
label_A[shared_pop_names] <- paste0("S_", shared_pop_names)
label_B[shared_pop_names] <- paste0("S_", shared_pop_names)
label_A[unique_pop_names] <- paste0("A_", unique_pop_names)
label_B[unique_pop_names] <- paste0("B_", unique_pop_names)
rownames(A) <- label_A[rownames(A)]; colnames(A) <- label_A[colnames(A)]
rownames(B) <- label_B[rownames(B)]; colnames(B) <- label_B[colnames(B)]
list(A = A, B = B, np = np, k = k)
}
results <- data.frame()
for (k in k_values) {
for (test_id in seq_len(num_tests)) {
test_seed <- seed_base + test_id * 100 + k
matrices <- prepare_matrices_from_matrix(full_matrix, k = k, seed = test_seed)
A <- matrices$A; B <- matrices$B
mds <- perform_mds(A, B)
X <- mds$X; Y <- mds$Y; d_opt <- mds$d_opt
pop_common <- intersect(rownames(A), rownames(B))
X_sub <- X[pop_common, 1:d_opt]
Y_sub <- Y[pop_common, 1:d_opt]
Yt <- apply_procrustes(X_sub, Y_sub, Y)
B_cal <- coords_to_distances(Yt)
the_prior <- mean((A - B)^2)
the_post <- mean((A - B_cal)^2)
improvement <- (the_prior - the_post) / the_prior * 100
results <- rbind(results, data.frame(k = k, test_id = test_id, the_prior = the_prior,
the_post = the_post, improvement = improvement))
}
}
agg <- aggregate(cbind(the_prior, the_post, improvement) ~ k, data = results,
FUN = function(x) c(mean = mean(x), sd = sd(x)))
agg <- do.call(data.frame, agg)
p <- ggplot(agg, aes(x = k)) +
geom_line(aes(y = improvement.mean), color = "blue") +
geom_point(aes(y = improvement.mean), color = "blue") +
geom_ribbon(aes(ymin = improvement.mean - improvement.sd,
ymax = improvement.mean + improvement.sd), fill = "blue", alpha = 0.2) +
labs(title = "Calibration improvement vs shared k",
x = "Shared populations (k)", y = "% Improvement") +
theme_minimal()
print(p)