Advanced Real Data Examples: Zero-Inflation, Semicontinuous, and Longitudinal Growth

This vignette demonstrates SelectBoost.gamlss on three additional real datasets: - Zero-inflated counts (pscl::bioChemists): ZIP / ZINB - Semicontinuous (datasets::airquality Ozone): ZAGA - Longitudinal growth with random effects (nlme::Orthodont): random intercepts

What you’ll learn

How to fit sb_gamlss() with grouped penalties on real zero-inflated data sets.
How to adapt SelectBoost scopes to semicontinuous responses (ZAGA) and longitudinal models with random effects.
How to interpret selection_table() outputs when categorical terms are represented by grouped dummy variables.

Conference highlight. These examples build on the communication “An Improvement for Variable Selection for Generalized Additive Models for Location, Shape and Scale and Quantile Regression” (Frédéric Bertrand & Myriam Maumy) delivered at the Joint Statistics Meetings 2024 in Portland, emphasizing SelectBoost-driven variable selection for GAMLSS and quantile regression through resampling-intensive benchmarking.

library(SelectBoost.gamlss)
library(gamlss)
#> Loading required package: splines
#> Loading required package: gamlss.data
#> 
#> Attaching package: 'gamlss.data'
#> The following object is masked from 'package:datasets':
#> 
#>     sleep
#> Loading required package: gamlss.dist
#> Loading required package: nlme
#> Loading required package: parallel
#>  **********   GAMLSS Version 5.5-0  **********
#> For more on GAMLSS look at https://www.gamlss.com/
#> Type gamlssNews() to see new features/changes/bug fixes.

Optional engines: install.packages(c("glmnet","grpreg","SGL","future.apply"))

1. Zero-inflated counts — `pscl::bioChemists` (ZIP/ZINB)

bc <- NULL
if (requireNamespace("pscl", quietly = TRUE)) {
  data("bioChemists", package = "pscl")
  bc <- pscl::bioChemists
  needed <- c("art", "kid5", "phd", "fem", "mar", "ment")
  missing_cols <- setdiff(needed, names(bc))
  if (length(missing_cols)) {
    cat(
      "bioChemists dataset is missing columns:",
      paste(missing_cols, collapse = ", "),
      "— skipping ZIP example.\n"
    )
    bc <- NULL
  } else {
    keep <- stats::complete.cases(bc[, needed])
    bc <- bc[keep, , drop = FALSE]
    bc$fem <- factor(bc$fem)
    bc$mar <- factor(bc$mar)
    bc$kid5 <- factor(bc$kid5)

    # ZIP engine via stepwise (mu) + grouped/elastic features
    set.seed(10)
    fit_zip <- sb_gamlss(
      art ~ 1, data = bc, family = gamlss.dist::ZIP(),
      mu_scope = ~ kid5 + fem + mar + phd + ment,
      engine = "grpreg", grpreg_penalty = "grLasso",
      B = 80, pi_thr = 0.6, pre_standardize = FALSE, parallel = "auto", trace = FALSE
    )
    sel_zip <- selection_table(fit_zip)
    sel_zip <- sel_zip[order(sel_zip$parameter, -sel_zip$prop), , drop = FALSE]
    head(sel_zip, n = min(8L, nrow(sel_zip)))
    stable_zip <- sel_zip[sel_zip$prop >= fit_zip$pi_thr, , drop = FALSE]
    if (nrow(stable_zip)) {
      stable_zip
    } else {
      cat("No terms reached the stability threshold of", fit_zip$pi_thr, "for this run.\n")
    }
  }
} else {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
}

The tables above report the eight most stable terms (sorted within each parameter) together with the subset that exceeds the stability threshold pi_thr.

ZINB often handles overdispersion better than ZIP

if (requireNamespace("pscl", quietly = TRUE) && !is.null(bc)) {
  set.seed(11)
  fit_zinb <- sb_gamlss(
    art ~ 1, data = bc, family = gamlss.dist::ZINBI(),
    mu_scope = ~ kid5 + fem + mar + phd + ment,
    engine = "grpreg", grpreg_penalty = "grLasso",
    B = 80, pi_thr = 0.6, pre_standardize = FALSE, parallel = "auto", trace = FALSE
  )
  sel_zinb <- selection_table(fit_zinb)
  sel_zinb <- sel_zinb[order(sel_zinb$parameter, -sel_zinb$prop), , drop = FALSE]
  head(sel_zinb, n = min(8L, nrow(sel_zinb)))
  stable_zinb <- sel_zinb[sel_zinb$prop >= fit_zinb$pi_thr, , drop = FALSE]
  if (nrow(stable_zinb)) {
    stable_zinb
  } else {
    cat("No terms reached the stability threshold of", fit_zinb$pi_thr, "for this run.\n")
  }
} else if (!requireNamespace("pscl", quietly = TRUE)) {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
} else {
  cat("bioChemists dataset unavailable; skipping ZINBI example.\n")
}

Side-by-side metrics (ZIP vs ZINB)

if (requireNamespace("pscl", quietly = TRUE)) {
  if (exists("fit_zip", inherits = FALSE) && exists("fit_zinb", inherits = FALSE)) {
    zip_g <- fit_zip$final_fit
    zinb_g <- fit_zinb$final_fit
    if (inherits(zip_g, "gamlss") && inherits(zinb_g, "gamlss")) {
      safe_scalar <- function(expr) {
        out <- tryCatch(expr, error = function(e) NA_real_)
        if (length(out) == 0L) NA_real_ else as.numeric(out[1L])
      }
      met <- data.frame(
        model = c("ZIP", "ZINBI"),
        AIC = c(safe_scalar(AIC(zip_g)), safe_scalar(AIC(zinb_g))),
        GAIC_k2 = c(safe_scalar(gamlss::GAIC(zip_g, k = 2)), safe_scalar(gamlss::GAIC(zinb_g, k = 2))),
        deviance = c(safe_scalar(zip_g$deviance), safe_scalar(zinb_g$deviance))
      )
      if (any(!is.na(met$AIC)) || any(!is.na(met$GAIC_k2)) || any(!is.na(met$deviance))) {
        print(met)
      } else {
        cat("Model fit statistics unavailable; skipping metric table.\n")
      }
    } else {
      cat("Final fits not found; metrics skipped.\n")
    }
  } else {
    cat("Models not available; run the fitting chunks first.\n")
  }
} else {
  cat("Package 'pscl' not installed; skipping bioChemists example.\n")
}

Notes. Many art values are zero; ZINB usually outperforms ZIP when overdispersion is present. Group lasso treats factor levels as single grouped terms.

2. Semicontinuous (ZAGA) — `airquality::Ozone`

Ozone is non-negative and has zeros; positive part is continuous — ideal for ZAGA (zero-adjusted Gamma).

aq <- subset(airquality, !is.na(Ozone) & !is.na(Wind) & !is.na(Temp) & !is.na(Solar.R))
aq$Month <- factor(aq$Month)

set.seed(20)
fit_zaga <- sb_gamlss(
  Ozone ~ 1, data = aq, family = gamlss.dist::ZAGA(),
  mu_scope    = ~ pb(Temp) + pb(Wind) + pb(Solar.R) + Month,
  sigma_scope = ~ pb(Temp),
  engine = "grpreg", grpreg_penalty = "grLasso",
  B = 80, pi_thr = 0.6, pre_standardize = TRUE, parallel = "auto", trace = FALSE
)
sel_zaga <- selection_table(fit_zaga)
sel_zaga <- sel_zaga[order(sel_zaga$parameter, -sel_zaga$prop), , drop = FALSE]
head(sel_zaga, n = min(8L, nrow(sel_zaga)))
stable_zaga <- sel_zaga[sel_zaga$prop >= fit_zaga$pi_thr, , drop = FALSE]
if (nrow(stable_zaga)) {
  stable_zaga
} else {
  cat("No terms reached the stability threshold of", fit_zaga$pi_thr, "for this run.\n")
}

As with the count model, we present the eight most persistent effects plus the subset clearing the stability requirement.

Effect plot for Temp on mu (ZAGA)

if (requireNamespace('ggplot2', quietly = TRUE)) {
  print(effect_plot(fit_zaga, 'Temp', aq, what = 'mu'))
}

Notes. ZAGA models a point mass at zero plus a Gamma for the positive part. Group selection keeps whole spline bases together.

3. Longitudinal growth with random effects — `nlme::Orthodont`

Fit random intercepts by subject using gamlss::random(), while selecting fixed effects for μ.

if (requireNamespace("nlme", quietly = TRUE)) {
  data_ok <- tryCatch({
    utils::data("Orthodont", package = "nlme", envir = environment())
    TRUE
  }, error = function(e) FALSE)
  if (!data_ok || !exists("Orthodont", inherits = FALSE)) {
    cat("Dataset 'Orthodont' unavailable; skipping longitudinal example.\n")
  } else {
    od <- get("Orthodont", inherits = FALSE)
    rm(Orthodont)
    od$Subject <- factor(od$Subject)
    od$Sex <- factor(od$Sex)
    set.seed(30)
    fit_long <- sb_gamlss(
      distance ~ gamlss::random(Subject),     # random intercept
      data = od, family = gamlss.dist::NO(),
      mu_scope = ~ pb(age) + Sex + pb(age):Sex,      # select fixed effects
      engine = "grpreg", grpreg_penalty = "grLasso",
      B = 80, pi_thr = 0.6, pre_standardize = TRUE, parallel = "auto", trace = FALSE
    )
  sel_long <- selection_table(fit_long)
  sel_long <- sel_long[order(sel_long$parameter, -sel_long$prop), , drop = FALSE]
  head(sel_long, n = min(8L, nrow(sel_long)))
  stable_long <- sel_long[sel_long$prop >= fit_long$pi_thr, , drop = FALSE]
  if (nrow(stable_long)) {
    stable_long
  } else {
    cat("No terms reached the stability threshold of", fit_long$pi_thr, "for this run.\n")
  }
    if (requireNamespace("ggplot2", quietly = TRUE)) {
      gg <- tryCatch(
        effect_plot(fit_long, "age", od, what = "mu"),
        error = function(e) {
          message("effect_plot() skipped for random-effect example: ", conditionMessage(e))
          NULL
        }
      )
      if (!is.null(gg)) {
        print(gg)
      }
    }
  }
} else {
  cat("Package 'nlme' not installed; skipping longitudinal example.\n")
}

Notes. The random(Subject) term stays in the base formula (not in mu_scope) so it is always included, while SelectBoost focuses on selecting the fixed structure. Some plotting helpers (e.g., effect_plot()) may be skipped when random-effect predictions are unavailable in a given setup.