Stability-Selection for GAMLSS with SelectBoost.gamlss

This vignette demonstrates bootstrap stability-selection for GAMLSS models.

What you’ll learn

How to launch sb_gamlss() with correlated-selectBoost bootstraps for μ and σ.
How to interpret selection_table(), plot_sb_gamlss(), and effect_plot() outputs.
How to mix engines across parameters (engine, engine_sigma, engine_tau) and keep factor/spline terms grouped.
How to automate c0 exploration with sb_gamlss_c0_grid(), autoboost_gamlss(), and fastboost_gamlss().

Conference highlight. SelectBoost for GAMLSS was presented at the Joint Statistics Meetings 2024 (Portland, OR) in the talk “An Improvement for Variable Selection for Generalized Additive Models for Location, Shape and Scale and Quantile Regression” by Frédéric Bertrand and M. Maumy. The presentation stressed how correlation-aware resampling sharpens variable selection for GAMLSS and quantile regression problems even when predictors are numerous and strongly correlated.

set.seed(1)
library(gamlss)
library(SelectBoost.gamlss)

n <- 400
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
mu <- 1 + 1.5*x1 - 1.2*x3
y  <- gamlss.dist::rNO(n, mu = mu, sigma = 1)

dat <- data.frame(y, x1, x2, x3)

fit <- sb_gamlss(
  y ~ 1,
  mu_scope = ~ x1 + x2 + x3,
  sigma_scope = ~ x1 + x2,
  family = gamlss.dist::NO(),
  data = dat,
  B = 50,
  sample_fraction = 0.7,
  pi_thr = 0.6,
  k = 2,
  pre_standardize = TRUE,
  trace = FALSE
)

fit$final_formula
sel <- selection_table(fit)
sel <- sel[order(sel$parameter, -sel$prop), , drop = FALSE]
head(sel, n = min(8L, nrow(sel)))
stable <- sel[sel$prop >= fit$pi_thr, , drop = FALSE]
if (nrow(stable)) {
  stable
} else {
  cat("No terms reached the stability threshold of", fit$pi_thr, "for this run.\n")
}
plot_sb_gamlss(fit, top = 10)
if (requireNamespace("ggplot2", quietly = TRUE)) {
  print(effect_plot(fit, "x1", dat, what = "mu"))
}

selection_table() ranks stable terms for each parameter; the chunk above prints the eight strongest effects (sorted by stability) alongside the terms exceeding the stability threshold. plot_sb_gamlss() overlays selection frequency vs. coefficient magnitude, and effect_plot() visualizes partial effects (using ggplot2 if available, otherwise base R).

Grouped penalties and parameter-specific engines

SelectBoost.gamlss lets you mix-and-match engines across parameters. Grouped penalties (via grpreg and SGL) keep whole factors/spline bases/interactions together, while engine, engine_sigma, engine_nu, and engine_tau control the selection backend individually.

set.seed(2)
f  <- factor(sample(letters[1:3], n, TRUE))
dat2 <- transform(dat, f = f, x4 = rnorm(n))

fit_grouped <- sb_gamlss(
  y ~ 1,
  mu_scope    = ~ f + x1 + pb(x2) + x3 + x1:f,
  sigma_scope = ~ f + x1 + pb(x2),
  family = gamlss.dist::NO(),
  data = dat2,
  engine = "grpreg",                # μ: grouped penalty
  engine_sigma = "sgl",             # σ: sparse group lasso
  engine_tau = "glmnet",            # τ (if present) would fall back automatically
  grpreg_penalty = "grLasso",
  sgl_alpha = 0.7,
  B = 40,
  pi_thr = 0.6,
  pre_standardize = TRUE,
  trace = FALSE
)

sel_grouped <- selection_table(fit_grouped)
sel_grouped <- sel_grouped[order(sel_grouped$parameter, -sel_grouped$prop), , drop = FALSE]
head(sel_grouped, n = min(8L, nrow(sel_grouped)))
stable_grouped <- sel_grouped[sel_grouped$prop >= fit_grouped$pi_thr, , drop = FALSE]
if (nrow(stable_grouped)) {
  stable_grouped
} else {
  cat("No terms reached the stability threshold of", fit_grouped$pi_thr, "for this run.\n")
}

Grouped engines build design matrices internally (safely handling factors/splines) and retain original formulas for the final gamlss refit.

SelectBoost integrations: c0 grids, autoboost, and fastboost

The SelectBoost wrappers expose correlation-aware grouping and automated c0 choice.

grid_obj <- sb_gamlss_c0_grid(
  y ~ 1,
  data = dat,
  family = gamlss.dist::NO(),
  mu_scope = ~ x1 + x2 + x3,
  sigma_scope = ~ x1 + x2,
  c0_grid = seq(0.2, 0.8, by = 0.2),
  B = 30,
  pi_thr = 0.6,
  pre_standardize = TRUE,
  trace = FALSE,
  progress = TRUE
)
plot(grid_obj)
confidence_table(grid_obj)

auto_obj <- autoboost_gamlss(
  y ~ 1,
  data = dat,
  family = gamlss.dist::NO(),
  mu_scope = ~ x1 + x2 + x3,
  sigma_scope = ~ x1 + x2,
  c0_grid = seq(0.2, 0.8, by = 0.2),
  B = 30,
  pi_thr = 0.6,
  pre_standardize = TRUE,
  trace = FALSE
)
attr(auto_obj, "chosen_c0")

fast_obj <- fastboost_gamlss(
  y ~ 1,
  data = dat,
  family = gamlss.dist::NO(),
  mu_scope = ~ x1 + x2 + x3,
  sigma_scope = ~ x1 + x2,
  B = 30,
  sample_fraction = 0.6,
  pi_thr = 0.6,
  pre_standardize = TRUE,
  trace = FALSE
)
plot_sb_gamlss(fast_obj)

Use progress = TRUE on any of these helpers to monitor grid/config iteration.

Confidence summaries and effect diagnostics

confidence_functionals() collapses stability curves across the c0 grid into scalar summaries (AUSC, coverage, quantiles, weighted, conservative). The output ranks terms and integrates seamlessly with plot() and plot_stability_curves().

cf <- confidence_functionals(
  grid_obj,
  pi_thr = 0.6,
  q = c(0.5, 0.8, 0.9),
  conservative = TRUE,
  B = 30
)
head(cf)
plot(cf, top = 8)
plot_stability_curves(grid_obj, terms = c("x1", "x3"), parameter = "mu")

For model interpretation, pair the stability summaries with partial-effect diagnostics via effect_plot() (see the quick-start example above) or use selection_table() to inspect the final refit.

Tuning engines and penalties

tune_sb_gamlss() evaluates different engine/penalty configurations using either stability or deviance metrics (with optional K-fold CV). Supply a list of configurations and a shared baseline of arguments.

configs <- list(
  list(engine = "stepGAIC"),
  list(engine = "glmnet", glmnet_alpha = 1),
  list(engine = "grpreg", grpreg_penalty = "grLasso", engine_sigma = "sgl", sgl_alpha = 0.8)
)
baseline <- list(
  formula = y ~ 1,
  data = dat2,
  family = gamlss.dist::NO(),
  mu_scope = ~ f + x1 + pb(x2) + x3 + x1:f,
  sigma_scope = ~ f + x1 + pb(x2),
  pi_thr = 0.6,
  pre_standardize = TRUE,
  sample_fraction = 0.7,
  trace = FALSE,
  parallel = "auto"
)
tuned <- tune_sb_gamlss(configs, base_args = baseline, B_small = 25, metric = "stability", progress = TRUE)
tuned$best_config

Switch metric = "deviance" and set K to engage fast deviance CV with the optimized density routes.

Fast deviance utilities and diagnostics

Deviance-based tuning reuses loglik_gamlss_newdata_fast() under the hood. Use fast_vs_generic_ll() to benchmark the speedup relative to the generic density evaluator, and check_fast_vs_generic() to verify numerical agreement (with per-family tolerances).

# Example: compare deviance routes on a fitted model
fit_ga <- gamlss::gamlss(y ~ x1 + x2, sigma.formula = ~ x1, data = dat, family = gamlss.dist::NO())
fast_vs_generic_ll(fit_ga, newdata = dat, reps = 30)
check_fast_vs_generic(fit_ga, newdata = dat, tol = 1e-6)

See the dedicated fast-deviance vignettes for expanded results across many gamlss.dist families.

Parallel bootstraps, progress, and long tests

All bootstrap helpers accept parallel = "auto" to leverage future.apply (or any plan you set via future::plan()). Progress bars are enabled via progress = TRUE on sb_gamlss(), grid/autoboost helpers, and tune_sb_gamlss().

future::plan(multisession, workers = 4)
fit_parallel <- sb_gamlss(
  y ~ 1,
  mu_scope = ~ x1 + x2 + x3,
  sigma_scope = ~ x1 + x2,
  family = gamlss.dist::NO(),
  data = dat,
  B = 200,
  pi_thr = 0.6,
  pre_standardize = TRUE,
  parallel = "auto",
  progress = TRUE,
  trace = FALSE
)

To opt into the full suite of equality/benchmark tests locally, set options(SelectBoost.gamlss.run_long_tests = TRUE) or Sys.setenv(RUN_LONG_TESTS = "true") before running devtools::test().