Framework Integration with leakr

Cheryl Isabella Lim

2025-10-22

library(leakr)

Introduction

The leakr package integrates seamlessly with popular machine learning frameworks in R, allowing you to incorporate leakage detection directly into existing modelling workflows. This vignette demonstrates how to use leakr with caret, mlr3, and tidymodels frameworks.

Integration functions automatically extract relevant information from trained models and preprocessing pipelines, making it easy to audit your complete machine learning workflow for potential data leakage issues.

Integration with caret

The caret package is widely used for classification and regression training. leakr can analyse caret training objects to detect leakage in the underlying data and preprocessing steps.

Basic caret Integration

# Load required libraries
library(caret)

# Prepare iris data for caret
data(iris)
set.seed(123)

# Create train/test split using caret
train_index <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Train a model using caret
model_fit <- train(
  Species ~ ., 
  data = train_data, 
  method = "rf",
  trControl = trainControl(method = "cv", number = 5)
)

# Use leakr to audit the caret model
caret_audit <- leakr_from_caret(
  train_obj = model_fit,
  original_data = iris,
  target_name = "Species"
)

print(caret_audit)
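
As with audits created directly from data frames, the result can be condensed into a ranked summary using leakr_summarise(), which is shown in more detail later in this vignette:

# Condense the audit into a ranked summary of potential issues
caret_basic_summary <- leakr_summarise(caret_audit)
print(caret_basic_summary)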

Advanced caret Integration with Preprocessing

# Example with preprocessing steps that might introduce leakage
set.seed(456)

# Create a more complex dataset
complex_data <- data.frame(
  feature1 = rnorm(200),
  feature2 = rnorm(200, 50, 10),
  feature3 = sample(c("A", "B", "C"), 200, replace = TRUE),
  target = factor(sample(c("positive", "negative"), 200, replace = TRUE))
)

# Add missing values to demonstrate preprocessing
complex_data$feature1[sample(1:200, 20)] <- NA
complex_data$feature2[sample(1:200, 15)] <- NA

# Create train/test split
train_idx <- createDataPartition(complex_data$target, p = 0.7, list = FALSE)
train_complex <- complex_data[train_idx, ]
test_complex <- complex_data[-train_idx, ]

# Fit preprocessing on the training rows only; this object is reused in the
# leaky-versus-safe comparison shown after the summary below
preprocess_recipe <- preProcess(
  train_complex[, -4],  # Exclude target
  method = c("center", "scale", "medianImpute")
)

# Train model; caret re-estimates the same preprocessing inside each CV fold
model_complex <- train(
  target ~ .,
  data = train_complex,
  method = "glm",
  preProcess = c("center", "scale", "medianImpute"),
  trControl = trainControl(method = "cv", number = 3)
)

# Audit the complex workflow
complex_audit <- leakr_from_caret(
  train_obj = model_complex,
  original_data = complex_data,
  target_name = "target"
)

# Generate detailed summary
caret_summary <- leakr_summarise(complex_audit, show_config = TRUE)
print(caret_summary)
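
To see why preprocessing placement matters, contrast preprocess_recipe above (estimated on the training rows only) with a deliberately leaky variant whose imputation medians and scaling constants are estimated on the full dataset, test rows included. This is a minimal sketch using only caret's preProcess() and predict() methods; it is exactly this kind of information flow from test to train that a preprocessing audit is meant to flag.

# LEAKY: parameters estimated on ALL rows (train and test together)
leaky_prep <- preProcess(
  complex_data[, -4],
  method = c("center", "scale", "medianImpute")
)
leaky_train <- predict(leaky_prep, train_complex[, -4])

# SAFE: parameters estimated on the training split only, then applied to both
safe_train <- predict(preprocess_recipe, train_complex[, -4])
safe_test  <- predict(preprocess_recipe, test_complex[, -4])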

Integration with mlr3

The mlr3 ecosystem provides a modern, object-oriented approach to machine learning in R. leakr can extract information from mlr3 tasks and learners.

Basic mlr3 Integration

# Load mlr3 components
library(mlr3)
library(mlr3learners)

# Create an mlr3 task
iris_task <- TaskClassif$new(
  id = "iris_classification",
  backend = iris,
  target = "Species"
)

# Use leakr to audit the mlr3 task
mlr3_audit <- leakr_from_mlr3(
  task = iris_task,
  include_target = TRUE
)

print(mlr3_audit)

Advanced mlr3 Integration with Pipelines

library(mlr3pipelines)

# Create a more complex dataset for demonstration
titanic_like <- data.frame(
  age = c(rnorm(100, 35, 10), rep(NA, 20)),
  fare = c(rnorm(100, 50, 20), rep(NA, 20)),
  sex = sample(c("male", "female"), 120, replace = TRUE),
  class = sample(c("1st", "2nd", "3rd"), 120, replace = TRUE),
  survived = factor(sample(c("yes", "no"), 120, replace = TRUE)),
  stringsAsFactors = TRUE
)

# Create task
survival_task <- TaskClassif$new(
  id = "survival_prediction",
  backend = titanic_like,
  target = "survived"
)

# Create preprocessing pipeline that might introduce leakage
preprocessing_pipeline <- po("imputehist") %>>%  # Imputation
  po("scale") %>>%                               # Scaling
  po("encode")                                   # Factor encoding

# Create full pipeline with learner
full_pipeline <- preprocessing_pipeline %>>%
  po("learner", lrn("classif.rpart"))

# Convert to learner
pipeline_learner <- as_learner(full_pipeline)

# Audit the task that feeds the pipeline (the pipeline above is shown for context)
pipeline_audit <- leakr_from_mlr3(
  task = survival_task,
  include_target = TRUE
)

# Detailed analysis
mlr3_summary <- leakr_summarise(pipeline_audit, top_n = 8)
print(mlr3_summary)

Integration with tidymodels

The tidymodels framework provides a consistent approach to modelling in R. leakr can analyse workflows and recipes for potential leakage issues.

Basic tidymodels Integration

# Load tidymodels components
library(tidymodels)

# Create initial split
data(iris)
set.seed(789)
iris_split <- initial_split(iris, prop = 0.8, strata = Species)

# Create recipe
iris_recipe <- recipe(Species ~ ., data = training(iris_split)) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Create model specification
iris_model <- rand_forest(mode = "classification") %>%
  set_engine("ranger")

# Create workflow
iris_workflow <- workflow() %>%
  add_recipe(iris_recipe) %>%
  add_model(iris_model)

# Use leakr to audit the tidymodels workflow
tidymodels_audit <- leakr_from_tidymodels(
  workflow = iris_workflow,
  data = iris
)

print(tidymodels_audit)
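
Because initial_split() records which rows fall in the analysis set, the split can also be handed directly to leakr_audit() so that split-aware detectors run. This is a small sketch that assumes the rsplit object's in_id element holds the training row indices:

# Build a train/test label vector from the rsample split and audit with it
split_labels <- ifelse(seq_len(nrow(iris)) %in% iris_split$in_id, "train", "test")

split_aware_audit <- leakr_audit(
  data = iris,
  target = "Species",
  split = split_labels
)

print(split_aware_audit)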

Advanced tidymodels Integration with Feature Engineering

# Create a dataset with potential feature engineering leakage
set.seed(987)
engineering_data <- data.frame(
  customer_id = 1:300,
  purchase_amount = rlnorm(300, 3, 1),
  days_since_last = rpois(300, 30),
  category = sample(c("electronics", "clothing", "books"), 300, replace = TRUE),
  month = sample(1:12, 300, replace = TRUE),
  will_return = factor(sample(c("yes", "no"), 300, replace = TRUE, prob = c(0.3, 0.7)))
)

# Add potential leakage: customer_lifetime_value (future information)
engineering_data$customer_lifetime_value <- 
  ifelse(engineering_data$will_return == "yes", 
         engineering_data$purchase_amount * runif(300, 2, 5),
         engineering_data$purchase_amount * runif(300, 0.5, 1.5))

# Create data split
engineering_split <- initial_split(engineering_data, prop = 0.8, strata = will_return)

# Create comprehensive recipe with potential leakage sources
engineering_recipe <- recipe(will_return ~ ., data = training(engineering_split)) %>%
  update_role(customer_id, new_role = "ID") %>%
  step_log(purchase_amount, customer_lifetime_value) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_interact(terms = ~ purchase_amount:days_since_last) %>%
  step_pca(all_numeric_predictors(), num_comp = 5)

# Create model specification
engineering_model <- logistic_reg() %>%
  set_engine("glm")

# Create workflow
engineering_workflow <- workflow() %>%
  add_recipe(engineering_recipe) %>%
  add_model(engineering_model)

# Audit the complex tidymodels workflow
complex_tidymodels_audit <- leakr_from_tidymodels(
  workflow = engineering_workflow,
  data = engineering_data
)

# Generate detailed summary
tidymodels_summary <- leakr_summarise(
  complex_tidymodels_audit, 
  top_n = 10, 
  show_config = TRUE
)

print(tidymodels_summary)

Data Import and Export Integration

leakr provides seamless integration with data import/export operations, allowing you to audit data as part of your data pipeline:

Import with Automatic Auditing

# Import data with automatic leakage checking
# This would typically be used with real files
example_data <- data.frame(
  id = 1:100,
  feature1 = rnorm(100),
  feature2 = sample(letters[1:5], 100, replace = TRUE),
  target = factor(sample(c("A", "B"), 100, replace = TRUE))
)

# Simulate importing and auditing in one step
imported_audit <- leakr_audit(
  data = example_data,
  target = "target",
  id = "id"
)

# Quick import function (simulated)
leakr_quick_audit <- function(data_path, target, ...) {
  # In practice, this would use leakr_import() followed by leakr_audit()
  # data <- leakr_import(data_path, ...)
  # audit <- leakr_audit(data, target = target)
  # return(list(data = data, audit = audit))
  
  # For demonstration only: the arguments are ignored and the audit
  # created above is returned unchanged
  return(imported_audit)
}
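
With a real import function behind it, a call to this helper might look like the following; the file path and target name are placeholders:

# quick_result <- leakr_quick_audit("data/raw_customers.csv", target = "outcome")
# print(quick_result$audit)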

Export with Audit Reports

# Export data along with audit reports
export_config <- list(
  include_audit_report = TRUE,
  format = "comprehensive",
  generate_summary = TRUE
)

# This would export both data and audit results
# leakr_export_data(
#   data = example_data,
#   file_path = "audited_dataset",
#   audit_report = imported_audit,
#   config = export_config
# )

Snapshot and Version Control Integration

leakr supports data versioning and snapshot functionality for tracking changes in leakage patterns over time:

Creating Data Snapshots

# Create snapshot of current data state
snapshot_info <- leakr_create_snapshot(
  data = example_data,
  name = "baseline_data",
  metadata = list(
    created_by = "data_scientist",
    purpose = "baseline_analysis",
    version = "1.0"
  )
)

# List available snapshots
available_snapshots <- leakr_list_snapshots()
print(available_snapshots)

# Load previous snapshot for comparison
# previous_data <- leakr_load_snapshot("baseline_data")
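
Once a baseline snapshot exists, later versions of the data can be audited and compared against it to see whether new leakage issues have appeared. A minimal sketch, assuming leakr_load_snapshot() returns the stored data frame:

# baseline_data <- leakr_load_snapshot("baseline_data")
# baseline_audit <- leakr_audit(baseline_data, target = "target", id = "id")
# current_audit  <- leakr_audit(example_data, target = "target", id = "id")
# 
# new_issues <- length(current_audit$issues) - length(baseline_audit$issues)
# if (new_issues > 0) {
#   message(new_issues, " new potential leakage issue(s) since the baseline snapshot")
# }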

Workflow Integration Patterns

Pattern 1: Pre-Training Validation

# Complete pre-training validation workflow
validate_before_training <- function(data, target, test_split = 0.2) {
  # Step 1: Basic data validation
  validated_data <- validate_and_preprocess_data(
    data = data,
    target = target,
    split = NULL,
    id = NULL,
    config = list(remove_empty_cols = TRUE)
  )
  
  # Step 2: Create train/test split
  set.seed(42)
  n <- nrow(validated_data)
  train_indices <- sample(seq_len(n), floor((1 - test_split) * n))
  split_vector <- rep("test", n)
  split_vector[train_indices] <- "train"
  
  # Step 3: Comprehensive leakage audit
  audit_report <- leakr_audit(
    data = validated_data,
    target = target,
    split = split_vector
  )
  
  # Step 4: Check for blocking issues
  critical_issues <- sum(vapply(
    audit_report$issues,
    function(x) identical(x$severity, "high"),
    logical(1)
  ))
  
  if (critical_issues > 0) {
    warning(paste("Found", critical_issues, "critical issues. Review before training."))
  }
  
  return(list(
    data = validated_data,
    split = split_vector,
    audit = audit_report,
    safe_to_train = critical_issues == 0
  ))
}

# Example usage
# validation_result <- validate_before_training(your_data, "target_column")
# if (validation_result$safe_to_train) {
#   # Proceed with model training
# }

Pattern 2: Post-Training Audit

# Post-training comprehensive audit
post_training_audit <- function(model_object, framework = "caret", ...) {
  
  audit_result <- switch(framework,
    "caret" = leakr_from_caret(model_object, ...),
    "mlr3" = leakr_from_mlr3(model_object, ...),
    "tidymodels" = leakr_from_tidymodels(model_object, ...),
    stop("Unsupported framework")
  )
  
  # Generate comprehensive summary
  summary_report <- leakr_summarise(
    audit_result,
    top_n = 15,
    show_config = TRUE
  )
  
  return(list(
    audit = audit_result,
    summary = summary_report
  ))
}
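
For example, the caret model trained earlier in this vignette could be audited after the fact, with the extra arguments that leakr_from_caret() expects passed through the dots:

post_fit_report <- post_training_audit(
  model_fit,
  framework = "caret",
  original_data = iris,
  target_name = "Species"
)

print(post_fit_report$summary)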

Pattern 3: Continuous Monitoring

# Set up continuous monitoring for production data
setup_leakage_monitoring <- function(data_source, target, schedule = "daily") {
  
  monitor_config <- list(
    alert_threshold = "medium",  # Alert on medium+ severity issues
    notification_email = "data-team@company.com",
    generate_plots = TRUE,
    archive_reports = TRUE
  )
  
  # This would typically integrate with a scheduler (e.g. cron) at the requested schedule
  monitoring_function <- function() {
    # Load current data
    current_data <- data_source()  # Function to fetch current data
    
    # Run audit
    current_audit <- leakr_audit(
      data = current_data,
      target = target,
      config = monitor_config
    )
    
    # Check for issues requiring attention
    medium_high_issues <- sum(vapply(
      current_audit$issues,
      function(x) x$severity %in% c("medium", "high"),
      logical(1)
    ))
    
    if (medium_high_issues > 0) {
      # Send alert (implementation would depend on your notification system)
      message(paste("Leakage monitoring alert:", medium_high_issues, "issues detected"))
    }
    
    # Archive report with timestamp
    timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
    # save(current_audit, file = paste0("audit_", timestamp, ".RData"))
    
    return(current_audit)
  }
  
  return(monitoring_function)
}

# Example setup
# monitor <- setup_leakage_monitoring(
#   data_source = function() { read.csv("daily_data.csv") },
#   target = "outcome"
# )
# daily_audit <- monitor()

Performance Considerations

Memory-Efficient Processing

# Configuration for large-scale processing
large_scale_config <- list(
  sample_size = 10000,           # Limit memory usage
  chunk_processing = TRUE,        # Process in chunks
  parallel_detectors = FALSE,     # Disable if memory constrained
  save_intermediate = TRUE,       # Save intermediate results
  cleanup_temp = TRUE            # Clean up temporary objects
)

# Process very large dataset efficiently
process_large_dataset <- function(data_path, target, config = large_scale_config) {
  
  # Process in chunks if the file is too large to fit in memory
  if (file.size(data_path) > 1e9) {  # > 1 GB
    message("Large dataset detected, using chunked processing")
    
    # This would implement actual chunked reading and processing, e.g.:
    # chunk_results <- process_in_chunks(data_path, target, config)
    # combined_audit <- combine_audit_results(chunk_results)
    # return(combined_audit)
    return(invisible(NULL))
  } else {
    # Standard processing for manageable datasets
    data <- read.csv(data_path)
    return(leakr_audit(data, target = target, config = config))
  }
}
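
The chunked path above is only sketched in comments. One possible shape for it, reading a fixed number of rows at a time and auditing each block separately, is shown below; the chunk size, the simple comma split for the header, and the list of per-chunk audits returned here are illustrative assumptions, not part of the leakr API, and combining the per-chunk results is left to the caller.

# Illustrative only: audit fixed-size blocks of rows one at a time
process_in_chunks <- function(data_path, target, config, chunk_rows = 50000) {
  con <- file(data_path, open = "r")
  on.exit(close(con))
  
  # Read the header line once so every chunk gets consistent column names
  col_names <- strsplit(readLines(con, n = 1), ",", fixed = TRUE)[[1]]
  
  audits <- list()
  repeat {
    chunk <- tryCatch(
      read.csv(con, nrows = chunk_rows, header = FALSE, col.names = col_names),
      error = function(e) NULL  # no lines left to read
    )
    if (is.null(chunk) || nrow(chunk) == 0) break
    
    audits[[length(audits) + 1]] <- leakr_audit(chunk, target = target, config = config)
  }
  
  audits  # one audit per chunk
}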

Best Practices for Framework Integration

1. Integration Timing

Audit early and often: run leakr on the raw data and the intended split before any model is trained (Pattern 1 above), and audit the fitted object again after training (Pattern 2), so that both the data and the preprocessing embedded in the model are covered.

2. Configuration Management

# Create environment-specific configurations
development_config <- list(
  sample_size = 1000,
  generate_plots = TRUE,
  detailed_logging = TRUE
)

production_config <- list(
  sample_size = 10000,
  generate_plots = FALSE,
  detailed_logging = FALSE,
  alert_on_issues = TRUE
)

testing_config <- list(
  sample_size = 500,
  run_all_detectors = TRUE,
  strict_thresholds = TRUE
)
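
One way to select between these configurations at run time is to key off an environment variable; the LEAKR_ENV name below is an assumption for this sketch, not something leakr reads itself:

# Pick the configuration for the current environment
active_env <- Sys.getenv("LEAKR_ENV", unset = "development")

active_config <- switch(active_env,
  "production" = production_config,
  "testing"    = testing_config,
  development_config  # default
)

# audit <- leakr_audit(example_data, target = "target", config = active_config)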

3. Error Handling and Logging

# Robust error handling for production environments
safe_audit <- function(data, target, ...) {
  tryCatch({
    audit_result <- leakr_audit(data, target = target, ...)
    
    # Log successful audit
    message(paste("Audit completed successfully at", Sys.time()))
    
    return(list(
      success = TRUE,
      audit = audit_result,
      timestamp = Sys.time()
    ))
    
  }, error = function(e) {
    # Log error details
    error_msg <- paste("Audit failed:", e$message)
    warning(error_msg)
    
    return(list(
      success = FALSE,
      error = error_msg,
      timestamp = Sys.time()
    ))
  })
}
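
A wrapper like this slots naturally into a scheduled job; for example:

audit_attempt <- safe_audit(example_data, target = "target")

if (audit_attempt$success) {
  print(leakr_summarise(audit_attempt$audit))
} else {
  message("Audit could not be completed: ", audit_attempt$error)
}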

Summary

This vignette has demonstrated integration patterns for using leakr with the major R machine learning frameworks: caret, mlr3, and tidymodels, along with import/export, snapshot, and monitoring workflows built directly on leakr_audit().

Key integration principles:

  1. Early detection: Integrate leakage detection early in your workflow
  2. Comprehensive coverage: Audit both data and preprocessing steps
  3. Framework consistency: Use framework-specific integration functions when available
  4. Monitoring mindset: Set up continuous monitoring for production systems
  5. Configuration management: Adapt detection sensitivity to your environment

By following these patterns, you can ensure that leakage detection becomes a natural and reliable part of your machine learning workflows, helping maintain model integrity and reproducibility across different frameworks and environments.