Getting Started with leaf

Your Name

Introduction

This vignette demonstrates how to get started with the leaf R package, a high-level interface to the LEAF Symbolic Regression framework.

Note: Before using leaf, you need to install the Python backend by running leaf::install_leaf().

# Install the Python backend (only needs to be done once per machine).
leaf::install_leaf()

Load the package

# Attach the package, then confirm the Python backend is reachable;
# if it is not, tell the user how to install it.
library(leaf)
if (!backend_available()) {
  message("Install backend with leaf::install_leaf()")
}

Initialize a symbolic regression analysis

# Configure the symbolic regression engine.
regressor <- SymbolicRegressor$new(
  engine = "rsrm",                   # backend search engine to use
  num_iterations = 4L,               # number of search episodes (see "Episode: 4/4" in output)
  loss = "MSE",                      # objective minimized during the search
  operators = c("+", "-", "*", "/"), # operators the engine may combine
  threshold = 1e-10,                 # loss tolerance (presumably an early-stop cutoff -- confirm in docs)
  base = list(verbose = FALSE)       # additional engine options
)

Load the data

# Load the example train/test datasets bundled with the package.
train_data <- leaf_data("eg_train")
test_data <- leaf_data("eg_test")
print(head(train_data))
#>    X       x1         x2          y
#> 1 12 37.45222 1.51908854 114.625592
#> 2  4 26.02808 0.07787298   4.804276
#> 3 37  9.09870 0.17996103   3.780866
#> 4  8 26.62270 0.52217593  28.528340
#> 5  3 33.38746 1.56938555 105.608897
#> 6  6 29.72694 1.35455366  81.320448
print(head(test_data))
#>    X        x1        x2         y
#> 1 13 10.961724 1.1329768  25.40347
#> 2 39 24.859367 0.6104367  31.09740
#> 3 30 29.766229 1.1632080  70.04424
#> 4 45 38.345487 1.4681886 113.42693
#> 5 17  5.582007 1.6563170  18.89703
#> 6 48 38.867698 1.4879493 116.50896

Define the formula

The formula specifies the transformed inputs and grouping structure.

Functional transformations are allowed inside the formula.

formula <- "y ~ f(x1, x2, log(x2))"

Search for candidate equations

# Run the equation search on the training data. The engine prints its
# progress and a hall of fame (HOF) of candidate expressions, ordered by
# complexity, before deduplicating the raw skeletons.
search_results <- regressor$search_equations(
  data = train_data,
  formula = formula
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 4/4
#> time: 60.08s
#> loss: 0.00014448195748697755
#> form: X2/X1+1-X1+F
#> HOF:
#>                                                           equation  complexity                                                                                                   loss
#> 0                                                                0           0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1                                                          44.5014           1                                                                                                1292.93
#> 2                                                       33.4330*X2           2                                                                                                 450.49
#> 3                                                    17.8258*X1*X2           3                                                                                                   0.15
#> 4                                           17.6940*X1*X2 + 0.5466           4                                                                                                   0.03
#> 5                                         X1*(17.6544*X2 + 0.3209)           5                                                                                                   0.02
#> 6                               17.6484*X1*X2 + 0.2050*X1 + 0.2815           6                                                                                                   0.01
#> 7                               17.6599*X1*X2 + 0.7257 - 0.0709/X1           7                                                                                                   0.01
#> 8             X1*(17.6404*X2*(X1 + 0.7876) + 1.0622)/(X1 + 0.7876)           8                                                                                                   0.00
#> 9       17.6249*X1*X2 + 0.1456*X1 + 0.0418*X2 + 0.4554 - 0.0461/X1          11                                                                                                   0.00
#> 10  -0.0423*X1**2 + 17.6418*X1*X2 + 0.2866*X1 + 0.3691 - 0.0330/X1          12                                                                                                   0.00
#> 11   17.6423*X1*X2 + 0.0970*X1 + 0.5986 - 0.1043/X1 + 0.0048/X1**2          13                                                                                                   0.00
#> ---
#> 
task:dataset_923a85b0-5900-4aa2-a712-ab798821ae56 expr:17.642277203891876*X1*X2 + 0.09700466267074608*X1 + 0.5985664260163593 + -0.10427594286280444/X1 + 0.004757006682361494/X1**2 Loss_MSE:0.00 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 60.086 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]:  [[3955, 0, 3955]]
#> 3. Found 12 raw skeletons. Deduplicating...

# Deduplicated equation skeletons; the β terms are free parameters that
# are fitted to data in the next step.
head(search_results)
#>                Equation Complexity
#> 0                    β1          1
#> 1                 β1⋅x2          2
#> 2              β1⋅x1⋅x2          3
#> 3       x1⋅(β1⋅x2 + β2)          4
#> 4         β1⋅x1⋅x2 + β2          4
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3          6

Fit equation parameters

# Fit the β parameters of every skeleton on the training data.
# do_cv = FALSE skips cross-validation during fitting.
fit_results <- regressor$fit(
  data = train_data,
  do_cv = FALSE
)
#> Fitting parameters for 11 equations...
#> Parameter fitting complete.

# Each fitted equation now carries its training loss.
head(fit_results)
#>                Equation Complexity         Loss
#> 0                    β1          1 1.292927e+03
#> 1                 β1⋅x2          2 4.504898e+02
#> 2              β1⋅x1⋅x2          3 1.513596e-01
#> 3       x1⋅(β1⋅x2 + β2)          4 2.168116e-02
#> 4         β1⋅x1⋅x2 + β2          4 3.160112e-02
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3          6 6.814638e-03

Evaluate equations

# Score the fitted equations on the held-out test data with the
# requested metrics (RMSE and R2 here).
eval_table <- regressor$evaluate(
  metrics = c("RMSE", "R2"),
  data = test_data
)

head(eval_table)
#>                      Equation Complexity         Loss        RMSE
#> 0                          β1          1 1.292927e+03 38.53714791
#> 1                       β1⋅x2          2 4.504898e+02 29.85585277
#> 2                    β1⋅x1⋅x2          3 1.513596e-01  0.37579394
#> 3             x1⋅(β1⋅x2 + β2)          4 2.168116e-02  0.20061360
#> 5       β1⋅x1⋅x2 + β2⋅x1 + β3          6 6.814638e-03  0.08439777
#> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1          7 6.803462e-03  0.06765065
#>            R2
#> 0 -0.01053178
#> 1  0.39347348
#> 2  0.99990391
#> 3  0.99997262
#> 5  0.99999515
#> 6  0.99999689

Make predictions

# Predict on new data using one specific equation (selected by its ID
# from the results table).
preds <- regressor$predict(
  data = test_data,
  equation_id = 3
)

head(preds)
#> [1]  25.12047  30.96778  70.01509 113.60330  18.63938 116.68777

Inspect equation

# Show the fitted expression for equation ID 3, including the numeric
# parameter values per group.
regressor$show_equation(equation_id = 3)
#> Details for Equation ID: 3
#> ----------------------------------------
#> Equation: x1⋅(β1⋅x2 + β2)
#> ----------------------------------------
#> Full Expression per Group:
#> 
#> Group 'default_group':
#> 2.00163686843539⋅x₁⋅x₂ + 0.0238483758781546⋅x₁

See final results

# The Pareto front: equations that are not dominated in the
# complexity-vs-loss trade-off.
print(regressor$get_pareto_front())
#>                                            Equation Complexity
#> 0                                                β1          1
#> 1                                             β1⋅x2          2
#> 2                                          β1⋅x1⋅x2          3
#> 3                                   x1⋅(β1⋅x2 + β2)          4
#> 5                             β1⋅x1⋅x2 + β2⋅x1 + β3          6
#> 6                       β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1          7
#> 7            x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1          9
#> 8       β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1         11
#> 9  -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1         12
#> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1         13
#>            Loss        RMSE          R2
#> 0  1.292927e+03 38.53714791 -0.01053178
#> 1  4.504898e+02 29.85585277  0.39347348
#> 2  1.513596e-01  0.37579394  0.99990391
#> 3  2.168116e-02  0.20061360  0.99997262
#> 5  6.814638e-03  0.08439777  0.99999515
#> 6  6.803462e-03  0.06765065  0.99999689
#> 7  4.223240e-04  0.01968863  0.99999974
#> 8  3.611868e-04  0.03950045  0.99999894
#> 9  1.582007e-04  0.01388075  0.99999987
#> 10 1.444818e-04  0.01629805  0.99999982

Plot the Pareto front

# Plot complexity vs. RMSE; y_bigger_better = FALSE because lower RMSE
# is better. The return value also includes the underlying tables:
# $pareto_points (non-dominated rows) and $all (every row, flagged via
# the .pareto column).
results <- regressor$get_results()
plot_pareto(results, 'Complexity', 'RMSE', y_bigger_better = FALSE)
#> $plot
plot of chunk plot
plot of chunk plot
#> 
#> $pareto_points
#>                                           Equation Complexity
#> 0                                               β1          1
#> 1                                            β1⋅x2          2
#> 2                                         β1⋅x1⋅x2          3
#> 4                                    β1⋅x1⋅x2 + β2          4
#> 5                            β1⋅x1⋅x2 + β2⋅x1 + β3          6
#> 6                      β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1          7
#> 7           x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1          9
#> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1         12
#>           Loss        RMSE          R2 .pareto
#> 0 1.292927e+03 38.53714791 -0.01053178    TRUE
#> 1 4.504898e+02 29.85585277  0.39347348    TRUE
#> 2 1.513596e-01  0.37579394  0.99990391    TRUE
#> 4 3.160112e-02  0.09294067  0.99999412    TRUE
#> 5 6.814638e-03  0.08439777  0.99999515    TRUE
#> 6 6.803462e-03  0.06765065  0.99999689    TRUE
#> 7 4.223240e-04  0.01968863  0.99999974    TRUE
#> 9 1.582007e-04  0.01388075  0.99999987    TRUE
#> 
#> $all
#>                                            Equation Complexity
#> 0                                                β1          1
#> 1                                             β1⋅x2          2
#> 2                                          β1⋅x1⋅x2          3
#> 3                                   x1⋅(β1⋅x2 + β2)          4
#> 4                                     β1⋅x1⋅x2 + β2          4
#> 5                             β1⋅x1⋅x2 + β2⋅x1 + β3          6
#> 6                       β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1          7
#> 7            x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1          9
#> 8       β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1         11
#> 9  -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1         12
#> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1         13
#>            Loss        RMSE          R2 .pareto
#> 0  1.292927e+03 38.53714791 -0.01053178    TRUE
#> 1  4.504898e+02 29.85585277  0.39347348    TRUE
#> 2  1.513596e-01  0.37579394  0.99990391    TRUE
#> 3  2.168116e-02  0.20061360  0.99997262   FALSE
#> 4  3.160112e-02  0.09294067  0.99999412    TRUE
#> 5  6.814638e-03  0.08439777  0.99999515    TRUE
#> 6  6.803462e-03  0.06765065  0.99999689    TRUE
#> 7  4.223240e-04  0.01968863  0.99999974    TRUE
#> 8  3.611868e-04  0.03950045  0.99999894   FALSE
#> 9  1.582007e-04  0.01388075  0.99999987    TRUE
#> 10 1.444818e-04  0.01629805  0.99999982   FALSE

Compute optimal threshold (only for classification tasks)

Note on Task Types: The find_optimal_threshold() method is specifically designed for binary classification tasks. If your target variable is continuous (Regression), you should use the raw predictions directly. If your target is binary (Classification), this step helps transform continuous scores into class labels (0 or 1).

# This vignette's task is regression, so no threshold is produced:
# the method warns and returns an empty numeric vector.
threshold <- regressor$find_optimal_threshold(
  equation_id = 1,
  data = train_data,
  k_folds = 5
)
#> The model isn't a classification model.

print(threshold)
#> numeric(0)

Save and load analysis

# Save the current analysis to a pickle file.
path <- tempfile(fileext = ".pkl")
regressor$save(path)
#> SymbolicRegressor instance successfully saved to C:\Users\marti\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl

# Restore the analysis later from the same file.
regressor_loaded <- load_leaf_analysis(path)
#> SymbolicRegressor instance loaded successfully from C:\Users\marti\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl

# The loaded model predicts exactly as the original did.
preds_loaded <- regressor_loaded$predict(
  data = test_data,
  equation_id = 3
)

head(preds_loaded)
#> [1]  25.12047  30.96778  70.01509 113.60330  18.63938 116.68777