# Train-Test Splitting for Symbolic Regression ----

# Load package ----
library(leaf)

# Install the Python backend only when it is missing (it persists across
# sessions, so an unconditional install on every run is wasted work).
if (!backend_available()) {
  message("Backend not found; installing with leaf::install_leaf()...")
  leaf::install_leaf()
}

# Fix the RNG seed so the simulated data and the train/test split below
# are reproducible.
set.seed(42)

# Simulate a small data set ----
N <- 50L  # number of observations

# Two features: x1 on [1, 40], x2 on [0, 2].
# (Draw order matters: keep runif(x1), runif(x2), rnorm(noise) so the
# results under set.seed(42) are unchanged.)
x1 <- runif(N, min = 1, max = 40)
x2 <- runif(N, min = 0, max = 2)

# Target: y = 0.23*log(x1) + 2*x1*x2 plus small Gaussian noise.
y <- 0.23 * log(x1) + 2 * x1 * x2 + rnorm(N, mean = 0, sd = 1e-2)

# Columns are auto-named x1, x2, y from the variable names.
data <- data.frame(x1, x2, y)

# Hold out rows for testing ----
# Sample floor(0.8 * n) row indices for training; the remainder is the
# test set. The sample() call is unchanged, so the RNG draws (and hence
# the split under set.seed(42)) are identical.
train_ratio <- 0.8
train_index <- sample(
  seq_len(nrow(data)),
  size = floor(train_ratio * nrow(data))
)

test_data <- data[-train_index, ]
train_data <- data[train_index, ]

print(head(train_data))
#>           x1        x2         y
#> 24 37.920061 0.9587971 73.541917
#> 34 27.721619 1.2912638 72.354745
#> 25  4.215065 0.3948207  3.653854
#> 28 36.323787 0.7509799 55.387777
#> 43  2.459810 0.4171399  2.265693
#> 14 10.961724 1.1329768 25.403471
print(head(test_data))
#>           x1        x2         y
#> 6  21.244742 1.4975908 64.337530
#> 7  29.726944 1.3545537 81.320448
#> 8   6.251997 0.3425287  4.705442
#> 15 19.029420 1.6993794 65.346714
#> 18  5.582007 1.6563170 18.897027
#> 19 19.524886 1.3864096 54.831677
# Initialise the symbolic regressor ----
# (Style fixes: `<-` for assignment, double quotes, consistent spacing.)
regressor <- leaf::SymbolicRegressor$new(
  engine = "rsrm",
  loss = "MSE",
  num_iterations = 2L,
  threshold = 1e-10,
  # NOTE(review): presumably restricts the skeleton operator set to "+"
  # at the top level — confirm against the leaf documentation.
  operators = c("+"),
  base = list(verbose = FALSE),
  mcts = list(times = 20L),
  gp = list(times = 6L)
)
# Stage 1: Discover equation skeletons
# (Style fixes: `<-` for assignment, double quotes, 2-space indent to
# match the rest of the file. The formula string is unchanged.)
search_results <- regressor$search_equations(
  data = train_data,
  formula = "y ~ f( log(x1), x1*x2 )",
  normalization = "divide_by_gmd"
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 2/2
#> time: 7.88s
#> loss: 8.57761298360967e-05
#> form: F
#> HOF:
#>                  equation  complexity  loss
#> 0              41.2303*X2           2  0.18
#> 1  0.1944*X1 + 40.8084*X2           4  0.00
#> ---
#> 
#> task:dataset_d5a244a5-e53e-4009-bca7-6c3555d75360 expr:0.19442414717846374*X1 + 40.80839593452161*X2 Loss_MSE:0.00 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 7.885 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]:  [[6, 0, 6]]
#> 3. Found 2 raw skeletons. Deduplicating...
# Display the deduplicated equation skeletons found in stage 1.
print("=== Search results ===")
#> [1] "=== Search results ==="
print(search_results)
#>                Equation Complexity
#> 0              β1⋅x1⋅x2          3
#> 1 β1⋅log(x1) + β2⋅x1⋅x2          6
# Stage 2: Fit parameters and compute loss
fit_results <- regressor$fit(
  data = train_data
)
#> Fitting parameters for 2 equations...
#> Parameter fitting complete.
# print("\n...") would show the "\n" escape literally; cat() emits a
# real blank line before the header.
cat("\n=== Fit results ===\n")
#> 
#> === Fit results ===
print(fit_results)
#>                Equation Complexity         Loss
#> 0              β1⋅x1⋅x2          3 1.754847e-01
#> 1 β1⋅log(x1) + β2⋅x1⋅x2          6 8.577613e-05
# Stage 3: Evaluate additional metrics on the held-out test data
eval_table <- regressor$evaluate(
  metrics = c("R2", "Elbow"),
  data = test_data
)

# print("\n...") would show the "\n" escape literally; cat() emits a
# real blank line before the header.
cat("\n=== Additional metrics ===\n")
#> 
#> === Additional metrics ===
print(regressor$get_pareto_front())
#>                Equation Complexity         Loss        R2       Elbow
#> 0              β1⋅x1⋅x2          3 1.754847e-01 0.9999509 0.990636907
#> 1 β1⋅log(x1) + β2⋅x1⋅x2          6 8.577613e-05 0.9999999 0.009363093