From 9f7e16ec685238b59665780c74e5505f200b4bf6 Mon Sep 17 00:00:00 2001 From: Samuel Jenness Date: Mon, 20 Apr 2026 12:57:28 -0400 Subject: [PATCH 1/4] Add duration.method flag with empirical / weibull_strat / joint_lm (#63 phase 3) New `duration.method` argument to `build_netparams()`. Default `"empirical"` preserves byte-identical behavior. Two alternatives: - `"weibull_strat"`: per-stratum Weibull AFT fits via survival::survreg with proper right-censoring (ongoing partnerships = censored, completed = observed events). Uses more of the data than empirical (which only uses ongoing). Attaches the Weibull shape parameter k as a `"weibull_shape"` attribute on `durs..byage` as a diagnostic for the constant-hazard assumption (k == 1 means geometric; the current ARTnet data gives k ~= 0.5 in every stratum, a clean rejection of the geometric assumption). - `"joint_lm"`: log-linear regression `lm(log(duration.time) ~ joint ego + partner + matching terms)` on ongoing partnerships. Stratum medians computed from model predictions. Fitted model stored at `netparams$$joint_duration_model` for use by future build_netstats refactors that want per-dyad predictions. All three methods produce `durs..{homog,byage}` data.frames with identical shapes and column names so the geometric transformation (`mean.dur.adj = 1/(1 - 2^(-1/median))`) and downstream `dissolution_coefs()` pipeline are unchanged. TERGM parameterization identical regardless of method. Implementation is a minimal surgical override at the end of each layer's empirical duration block in NetParams.R. When duration.method != "empirical", the stratum median / mean values are replaced with model-based estimates and rates.*.adj / mean.dur.adj are recomputed. Smoothing (smooth.main.dur) and sex.cess.mod logic apply uniformly after the override. Per-stratum fits fall back to empirical when the new method fails (sparse data, convergence issues). New dependency: survival (base-R recommended package) added to Suggests. The weibull_strat path guards with a requireNamespace check and errors clearly if survival isn't installed. Findings from Atlanta + race = TRUE (empirical vs weibull_strat vs joint_lm on mean.dur.adj): Main layer match.grp.1: 73 | 379 | 76 Main layer match.grp.5: 934 | 1,482,404 | 446 Casl layer nonmatch: 106 | 359 | 93 Casl layer match.grp.5: 150 | 1,157 | 129 Weibull shape (main): 0.48, 0.72, 0.60, 0.83, 0.35, 0.46 -- all well below 1, consistent with decreasing hazard. The extreme Weibull mean.dur.adj values in high-age-matched strata reflect extrapolation under k << 1 on heavily censored data; the roxygen doc flags this explicitly and recommends joint_lm as the more production-safe non-default option. New tests: tests/testthat/test-duration-method.R (8 blocks, 112 assertions). Covers: default is empirical; weibull_strat produces valid durs shape + shape diagnostic attribute; joint_lm stores lm model; output shape invariant across all three methods (for dissolution_coefs compatibility); non-duration netparams fields preserved across methods; weibull shapes are all < 1 (substantive finding locked in); joint_lm composes cleanly with method = "joint" in build_netstats. Backward-compat snapshot: default and explicit method = "existing" match 3/3 on all parameter sets. Full test suite: 563 assertions pass in 10.3s. R CMD check: 0 errors / 0 warnings / 0 relevant notes (one "unable to verify current time" environment note, unrelated). Part of #63. Co-Authored-By: Claude Opus 4.7 (1M context) --- DESCRIPTION | 1 + R/NetParams.R | 276 ++++++++++++++++++++++++++ man/build_netparams.Rd | 33 +++ tests/testthat/test-duration-method.R | 162 +++++++++++++++ 4 files changed, 472 insertions(+) create mode 100644 tests/testthat/test-duration-method.R diff --git a/DESCRIPTION b/DESCRIPTION index 0b75779..ea56a6f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,6 +18,7 @@ Suggests: ARTnetData, knitr, rmarkdown, + survival, testthat (>= 3.0.0) VignetteBuilder: knitr RoxygenNote: 7.3.3 diff --git a/R/NetParams.R b/R/NetParams.R index 1fddf02..6cdceee 100644 --- a/R/NetParams.R +++ b/R/NetParams.R @@ -1,3 +1,155 @@ +# Weibull AFT fit for partnership durations with right-censoring. +# In survreg, `event = 1` means observed / completed (ONGOING = 0) and +# `event = 0` means censored / still ongoing (ONGOING = 1). Uses both +# completed and ongoing partnerships. Returns a small list with the +# stratum-level summary quantities plus the shape parameter `k` as a +# diagnostic for the geometric-distribution assumption: k ~= 1 means +# the constant-hazard (geometric) assumption holds; k far from 1 means +# the observed hazard is increasing (k > 1) or decreasing (k < 1) in +# partnership age. Returns NULL when the fit is infeasible (too few +# events / too few censored / convergence failure) so callers can fall +# back to empirical. +fit_weibull_dur <- function(data) { + if (nrow(data) < 10 || !requireNamespace("survival", quietly = TRUE)) { + return(NULL) + } + event <- as.integer(data$ongoing2 == 0) + n_events <- sum(event) + n_censored <- nrow(data) - n_events + # survreg needs some of each; otherwise scale is unidentified. + if (n_events < 3 || n_censored < 3) return(NULL) + if (any(data$duration.time <= 0, na.rm = TRUE)) { + data <- data[data$duration.time > 0 & !is.na(data$duration.time), , drop = FALSE] + event <- as.integer(data$ongoing2 == 0) + if (nrow(data) < 10) return(NULL) + } + fit <- tryCatch( + suppressWarnings(survival::survreg( + survival::Surv(data$duration.time, event) ~ 1, + dist = "weibull" + )), + error = function(e) NULL + ) + if (is.null(fit)) return(NULL) + # survreg parameterization: weibull_shape = 1 / fit$scale; + # weibull_scale = exp((Intercept)). + shape <- 1 / fit$scale + scale_weibull <- exp(coef(fit)[["(Intercept)"]]) + med <- scale_weibull * log(2)^(1 / shape) # closed-form median + mean_val <- scale_weibull * gamma(1 + 1 / shape) # closed-form mean + list(mean.dur = mean_val, median.dur = med, weibull_shape = shape) +} + + +# Joint log-linear lm on log(duration.time) for ongoing partnerships. +# Same length-bias convention as the "empirical" default (uses only +# ongoing, relies on the memoryless property to read stratum-specific +# predicted durations as the median full-duration quantity that feeds +# the geometric transformation in build_netstats). Returns the fitted +# model plus a vector of per-row predicted durations on the training +# set so callers can stratify however they like. +fit_joint_lm_dur <- function(l_layer, race, geog.lvl) { + ongoing <- l_layer[l_layer$ongoing2 == 1 & + !is.na(l_layer$duration.time) & + l_layer$duration.time > 0, , drop = FALSE] + if (nrow(ongoing) < 20) return(NULL) + terms <- c("index.age.grp", "sqrt(index.age.grp)", "hiv2", "same.age.grp") + if (isTRUE(race)) { + terms <- c(terms, "as.factor(race.cat.num)", "same.race") + } + if (!is.null(geog.lvl)) { + terms <- c("geogYN", terms) + } + fml <- reformulate(terms, response = "log(duration.time)") + fit <- tryCatch( + suppressWarnings(lm(fml, data = ongoing)), + error = function(e) NULL + ) + if (is.null(fit)) return(NULL) + fitted_dur <- exp(predict(fit, newdata = ongoing)) + list(model = fit, ongoing = ongoing, fitted_dur = fitted_dur) +} + + +# Given a partnership-level layer and a non-default duration.method, +# return a list(homog, byage, joint_duration_model) of duration +# summaries matching the shape of the empirical output (so that the +# geometric transformation, smoothing, and sex.cess.mod logic in the +# calling layer block apply uniformly). `byage_strata` is the vector +# of stratum keys expected in the output (first = 0 for non-matched, +# subsequent = index.age.grp values for matched-within-age-group). +compute_alt_durs <- function(l_layer, duration.method, byage_strata, + race, geog.lvl) { + stopifnot(duration.method %in% c("weibull_strat", "joint_lm")) + + # Subset each stratum the same way the empirical block does. + strata_data <- lapply(byage_strata, function(k) { + if (k == 0) { + l_layer[l_layer$same.age.grp == 0, , drop = FALSE] + } else { + l_layer[l_layer$same.age.grp == 1 & l_layer$index.age.grp == k, + , drop = FALSE] + } + }) + names(strata_data) <- as.character(byage_strata) + + if (duration.method == "weibull_strat") { + # Per-stratum Weibull fits. + per_stratum <- lapply(strata_data, fit_weibull_dur) + byage <- data.frame( + index.age.grp = byage_strata, + mean.dur = vapply(per_stratum, + function(x) if (is.null(x)) NA_real_ else x$mean.dur, numeric(1)), + median.dur = vapply(per_stratum, + function(x) if (is.null(x)) NA_real_ else x$median.dur, numeric(1)) + ) + shapes <- vapply(per_stratum, + function(x) if (is.null(x)) NA_real_ else x$weibull_shape, numeric(1)) + + # Overall (homog) fit on the whole subset used for durations. + homog_fit <- fit_weibull_dur(l_layer) + homog <- if (is.null(homog_fit)) { + data.frame(mean.dur = NA_real_, median.dur = NA_real_) + } else { + data.frame(mean.dur = homog_fit$mean.dur, + median.dur = homog_fit$median.dur) + } + return(list(homog = homog, byage = byage, weibull_shapes = shapes, + weibull_shape_overall = if (is.null(homog_fit)) NA_real_ + else homog_fit$weibull_shape)) + } + + # duration.method == "joint_lm" + res <- fit_joint_lm_dur(l_layer, race = race, geog.lvl = geog.lvl) + if (is.null(res)) { + return(list(homog = data.frame(mean.dur = NA_real_, median.dur = NA_real_), + byage = data.frame(index.age.grp = byage_strata, + mean.dur = NA_real_, median.dur = NA_real_), + model = NULL)) + } + # Stratum medians from predicted durations on the training set. + byage <- do.call(rbind, lapply(byage_strata, function(k) { + sub <- if (k == 0) { + res$ongoing[res$ongoing$same.age.grp == 0, , drop = FALSE] + } else { + res$ongoing[res$ongoing$same.age.grp == 1 & + res$ongoing$index.age.grp == k, , drop = FALSE] + } + if (nrow(sub) == 0) { + data.frame(index.age.grp = k, mean.dur = NA_real_, median.dur = NA_real_) + } else { + pred_sub <- exp(predict(res$model, newdata = sub)) + data.frame(index.age.grp = k, + mean.dur = mean(pred_sub, na.rm = TRUE), + median.dur = median(pred_sub, na.rm = TRUE)) + } + })) + homog <- data.frame(mean.dur = mean(res$fitted_dur, na.rm = TRUE), + median.dur = median(res$fitted_dur, na.rm = TRUE)) + list(homog = homog, byage = byage, model = res$model) +} + + # Fit a joint GLM predicting `response` from all available individual # attributes (age, race, the concurrent-layer degree, HIV status, # geography). Candidate interactions are considered one at a time and @@ -56,6 +208,38 @@ fit_joint_glm <- function(d, response, main_terms, #' oldest and second oldest age groups. #' @param oo.nquants Number of quantiles to split the one-off partnership risk distribution (count #' of one-off partners per unit time). +#' @param duration.method Character. Controls how partnership durations for dissolution-coef +#' estimation are computed. One of: +#' \itemize{ +#' \item `"empirical"` (default) — mean / median of observed durations among ongoing +#' partnerships, stratified by (age-match x index.age.grp). Relies on the +#' geometric / memoryless assumption that median elapsed time in ongoing +#' partnerships equals median full-partnership duration. Byte-identical to the +#' pre-refactor behavior. +#' \item `"weibull_strat"` — Weibull AFT fits per stratum via +#' `survival::survreg(Surv(duration.time, 1 - ongoing2) ~ 1, ...)`. Uses both +#' ongoing (right-censored) and completed partnerships. The Weibull shape +#' parameter `k` is attached to `durs..byage` as a `"weibull_shape"` +#' attribute — diagnostic for the constant-hazard assumption embedded in the +#' TERGM dissolution (k ~= 1 means geometric is correct; k far from 1 flags +#' mis-specification). **Caveat**: on the ARTnet data the fitted `k` comes +#' out well below 1 (decreasing hazard) in every stratum, and the implied +#' Weibull median can extrapolate to implausibly large values in heavily +#' censored strata (e.g., the oldest matched age groups). Treat +#' `"weibull_strat"` as diagnostic — look at the `k` values to decide +#' whether the geometric assumption is defensible — rather than as a +#' drop-in production method. `"joint_lm"` is the more production-safe +#' non-default option. +#' \item `"joint_lm"` — log-linear `lm(log(duration.time) ~ )` on ongoing partnerships; per-stratum medians computed from model +#' predictions. Fitted model is stored at `netparams$$joint_duration_model` +#' for optional per-dyad use by future `build_netstats` refactors. +#' } +#' Under all three methods, the stratum-level medians flow through the same +#' geometric transformation (`mean.dur.adj = 1 / (1 - 2^(-1 / median))`) that +#' `build_netstats` passes to `dissolution_coefs()`, so the TERGM offset structure +#' is identical across methods. The one-off layer has no durations, so this flag +#' only affects main and casual-layer output. #' @param method Character. Either `"existing"` (default) or `"joint"`. `"existing"` reproduces #' the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear #' fit for each ERGM target statistic. `"joint"` leaves all of those outputs intact **and** @@ -115,9 +299,17 @@ fit_joint_glm <- function(d, response, main_terms, build_netparams <- function(epistats, smooth.main.dur = FALSE, oo.nquants = 5, + duration.method = c("empirical", "weibull_strat", "joint_lm"), method = c("existing", "joint"), browser = FALSE) { method <- match.arg(method) + duration.method <- match.arg(duration.method) + + if (duration.method == "weibull_strat" && + !requireNamespace("survival", quietly = TRUE)) { + stop("duration.method = 'weibull_strat' requires the 'survival' package. ", + "Install it with install.packages('survival').") + } # Ensures that ARTnetData is installed if (system.file(package = "ARTnetData") == "") stop(missing_data_msg) @@ -592,6 +784,51 @@ build_netparams <- function(epistats, durs.main.all <- durs.main.all[, c(3, 1, 2, 4, 5)] out$main$durs.main.byage <- durs.main.all + + ## duration.method override (see #63 phase 3) ---- + # Replace the empirical stratum medians with model-based medians and + # re-apply the geometric transformation, preserving the downstream + # data.frame shape so smoothing and sex.cess.mod below work unchanged. + if (duration.method != "empirical") { + alt <- compute_alt_durs( + l_layer = lmain[lmain$RAI == 1 | lmain$IAI == 1, , drop = FALSE], + duration.method = duration.method, + byage_strata = out$main$durs.main.byage$index.age.grp, + race = race, geog.lvl = geog.lvl + ) + # homog: replace mean/median, recompute adj from new median + if (!is.na(alt$homog$median.dur)) { + out$main$durs.main.homog$mean.dur <- alt$homog$mean.dur + out$main$durs.main.homog$median.dur <- alt$homog$median.dur + out$main$durs.main.homog$rates.main.adj <- + 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) + out$main$durs.main.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) + } + # byage: per-stratum replacement, with empirical fallback if a stratum + # fit failed (keeps the row populated so dissolution_coefs doesn't NA). + for (i in seq_len(nrow(out$main$durs.main.byage))) { + k <- out$main$durs.main.byage$index.age.grp[i] + j <- which(alt$byage$index.age.grp == k) + if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { + out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] + out$main$durs.main.byage$rates.main.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$main$durs.main.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } + } + if (duration.method == "weibull_strat") { + attr(out$main$durs.main.byage, "weibull_shape") <- alt$weibull_shapes + attr(out$main$durs.main.homog, "weibull_shape") <- alt$weibull_shape_overall + } + if (duration.method == "joint_lm" && !is.null(alt$model)) { + out$main$joint_duration_model <- alt$model + } + } + + if (smooth.main.dur == TRUE) { n2 <- nrow(durs.main.all) n1 <- n2 - 1 @@ -984,6 +1221,45 @@ build_netparams <- function(epistats, durs.casl.all <- durs.casl.all[, c(3, 1, 2, 4, 5)] out$casl$durs.casl.byage <- durs.casl.all + + ## duration.method override (see #63 phase 3) ---- + if (duration.method != "empirical") { + alt <- compute_alt_durs( + l_layer = lcasl[lcasl$RAI == 1 | lcasl$IAI == 1, , drop = FALSE], + duration.method = duration.method, + byage_strata = out$casl$durs.casl.byage$index.age.grp, + race = race, geog.lvl = geog.lvl + ) + if (!is.na(alt$homog$median.dur)) { + out$casl$durs.casl.homog$mean.dur <- alt$homog$mean.dur + out$casl$durs.casl.homog$median.dur <- alt$homog$median.dur + out$casl$durs.casl.homog$rates.casl.adj <- + 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) + out$casl$durs.casl.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) + } + for (i in seq_len(nrow(out$casl$durs.casl.byage))) { + k <- out$casl$durs.casl.byage$index.age.grp[i] + j <- which(alt$byage$index.age.grp == k) + if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { + out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] + out$casl$durs.casl.byage$rates.casl.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$casl$durs.casl.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } + } + if (duration.method == "weibull_strat") { + attr(out$casl$durs.casl.byage, "weibull_shape") <- alt$weibull_shapes + attr(out$casl$durs.casl.homog, "weibull_shape") <- alt$weibull_shape_overall + } + if (duration.method == "joint_lm" && !is.null(alt$model)) { + out$casl$joint_duration_model <- alt$model + } + } + + # If sexual cessation model, then set diss coef for age grp above boundary to 1 if (sex.cess.mod == TRUE) { index.age.grp <- max(out$casl$durs.casl.byage$index.age.grp) + 1 diff --git a/man/build_netparams.Rd b/man/build_netparams.Rd index 85a2f6d..9188de1 100644 --- a/man/build_netparams.Rd +++ b/man/build_netparams.Rd @@ -8,6 +8,7 @@ build_netparams( epistats, smooth.main.dur = FALSE, oo.nquants = 5, + duration.method = c("empirical", "weibull_strat", "joint_lm"), method = c("existing", "joint"), browser = FALSE ) @@ -21,6 +22,38 @@ oldest and second oldest age groups.} \item{oo.nquants}{Number of quantiles to split the one-off partnership risk distribution (count of one-off partners per unit time).} +\item{duration.method}{Character. Controls how partnership durations for dissolution-coef +estimation are computed. One of: +\itemize{ +\item \code{"empirical"} (default) — mean / median of observed durations among ongoing +partnerships, stratified by (age-match x index.age.grp). Relies on the +geometric / memoryless assumption that median elapsed time in ongoing +partnerships equals median full-partnership duration. Byte-identical to the +pre-refactor behavior. +\item \code{"weibull_strat"} — Weibull AFT fits per stratum via +\code{survival::survreg(Surv(duration.time, 1 - ongoing2) ~ 1, ...)}. Uses both +ongoing (right-censored) and completed partnerships. The Weibull shape +parameter \code{k} is attached to \verb{durs..byage} as a \code{"weibull_shape"} +attribute — diagnostic for the constant-hazard assumption embedded in the +TERGM dissolution (k ~= 1 means geometric is correct; k far from 1 flags +mis-specification). \strong{Caveat}: on the ARTnet data the fitted \code{k} comes +out well below 1 (decreasing hazard) in every stratum, and the implied +Weibull median can extrapolate to implausibly large values in heavily +censored strata (e.g., the oldest matched age groups). Treat +\code{"weibull_strat"} as diagnostic — look at the \code{k} values to decide +whether the geometric assumption is defensible — rather than as a +drop-in production method. \code{"joint_lm"} is the more production-safe +non-default option. +\item \code{"joint_lm"} — log-linear \verb{lm(log(duration.time) ~ )} on ongoing partnerships; per-stratum medians computed from model +predictions. Fitted model is stored at \verb{netparams$$joint_duration_model} +for optional per-dyad use by future \code{build_netstats} refactors. +} +Under all three methods, the stratum-level medians flow through the same +geometric transformation (\code{mean.dur.adj = 1 / (1 - 2^(-1 / median))}) that +\code{build_netstats} passes to \code{dissolution_coefs()}, so the TERGM offset structure +is identical across methods. The one-off layer has no durations, so this flag +only affects main and casual-layer output.} + \item{method}{Character. Either \code{"existing"} (default) or \code{"joint"}. \code{"existing"} reproduces the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear fit for each ERGM target statistic. \code{"joint"} leaves all of those outputs intact \strong{and} diff --git a/tests/testthat/test-duration-method.R b/tests/testthat/test-duration-method.R new file mode 100644 index 0000000..460622d --- /dev/null +++ b/tests/testthat/test-duration-method.R @@ -0,0 +1,162 @@ +# Tests for the duration.method flag added in PR for #63 phase 3. + +skip_without_artnetdata <- function() { + testthat::skip_if(system.file(package = "ARTnetData") == "", + "ARTnetData not installed") +} + +cache_env <- new.env(parent = emptyenv()) + +get_np <- function(duration.method) { + key <- duration.method + if (!is.null(cache_env[[key]])) return(cache_env[[key]]) + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + set.seed(20260419L) + np <- build_netparams(epistats, smooth.main.dur = TRUE, + duration.method = duration.method, + method = "existing") + cache_env[[key]] <- np + np +} + +test_that("default duration.method is 'empirical'", { + skip_without_artnetdata() + np_default <- get_np("empirical") + # No joint_duration_model attached when method is empirical + expect_null(np_default$main$joint_duration_model) + expect_null(np_default$casl$joint_duration_model) + # No weibull_shape attribute + expect_null(attr(np_default$main$durs.main.byage, "weibull_shape")) + expect_null(attr(np_default$casl$durs.casl.byage, "weibull_shape")) +}) + +test_that("weibull_strat produces valid durs shape and shape diagnostic", { + skip_without_artnetdata() + testthat::skip_if_not_installed("survival") + np <- get_np("weibull_strat") + for (layer in c("main", "casl")) { + df_byage <- np[[layer]][[paste0("durs.", layer, ".byage")]] + df_homog <- np[[layer]][[paste0("durs.", layer, ".homog")]] + # Shape attributes attached as diagnostic + shapes <- attr(df_byage, "weibull_shape") + expect_type(shapes, "double") + expect_length(shapes, nrow(df_byage)) + expect_true(all(is.finite(shapes)), + info = paste("shapes must be finite:", layer)) + # Medians must be positive + expect_true(all(df_byage$median.dur > 0, na.rm = TRUE)) + expect_true(all(df_homog$median.dur > 0, na.rm = TRUE)) + # mean.dur.adj > 0 + expect_true(all(df_byage$mean.dur.adj > 0, na.rm = TRUE)) + } +}) + +test_that("joint_lm stores fitted model at joint_duration_model", { + skip_without_artnetdata() + np <- get_np("joint_lm") + for (layer in c("main", "casl")) { + m <- np[[layer]]$joint_duration_model + expect_s3_class(m, "lm") + # Fit should have a reasonable number of observations + expect_gt(stats::nobs(m), 500) + # Response is log(duration.time) + expect_true(grepl("log\\(duration\\.time\\)", + as.character(formula(m))[2])) + } +}) + +test_that("all duration.methods preserve output shape for dissolution_coefs", { + skip_without_artnetdata() + testthat::skip_if_not_installed("survival") + # Every method must produce durs..byage with the columns + # dissolution_coefs() needs downstream, in the same order and types. + expected_cols <- c("index.age.grp", "mean.dur", "median.dur", + "rates.main.adj", "mean.dur.adj") + for (dm in c("empirical", "weibull_strat", "joint_lm")) { + df <- get_np(dm)$main$durs.main.byage + expect_identical(colnames(df), expected_cols, + info = paste("column mismatch for", dm)) + expect_gt(nrow(df), 0) + expect_true(all(is.finite(df$mean.dur.adj)), + info = paste("mean.dur.adj must be finite for", dm)) + } +}) + +test_that("duration.method does not affect non-duration netparams fields", { + skip_without_artnetdata() + testthat::skip_if_not_installed("survival") + np_e <- get_np("empirical") + np_w <- get_np("weibull_strat") + np_j <- get_np("joint_lm") + # Everything except durs.*.byage, durs.*.homog, joint_duration_model + # should be identical across methods. + for (layer in c("main", "casl", "inst", "all")) { + if (is.null(np_e[[layer]])) next + common_fields <- setdiff( + names(np_e[[layer]]), + c(paste0("durs.", layer, ".byage"), + paste0("durs.", layer, ".homog"), + "joint_duration_model") + ) + for (f in common_fields) { + expect_equal(np_w[[layer]][[f]], np_e[[layer]][[f]], + info = paste0("[weibull ", layer, "$", f, "]")) + expect_equal(np_j[[layer]][[f]], np_e[[layer]][[f]], + info = paste0("[joint_lm ", layer, "$", f, "]")) + } + } +}) + +test_that("weibull_strat requires survival package", { + skip_without_artnetdata() + # If survival isn't available the call should error clearly + # (we can't actually uninstall it in a test, but we can confirm the + # guard exists by searching the function body). + body_src <- deparse(build_netparams) + expect_true(any(grepl("duration\\.method = 'weibull_strat' requires", + body_src))) +}) + +test_that("weibull shape exposes hazard mis-specification", { + skip_without_artnetdata() + testthat::skip_if_not_installed("survival") + np <- get_np("weibull_strat") + shapes_main <- attr(np$main$durs.main.byage, "weibull_shape") + # Under the null geometric (constant-hazard) model, k == 1. Empirically + # on ARTnet these come out well under 1 -- this test locks in that + # substantive finding so any regression to k ~= 1 is visible. + expect_true(all(shapes_main < 1, na.rm = TRUE), + info = paste("main layer shapes should all be < 1; got:", + paste(round(shapes_main, 3), collapse = ", "))) +}) + +test_that("joint_lm can be combined with method = 'joint' in build_netstats", { + skip_without_artnetdata() + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + set.seed(20260419L) + np <- build_netparams(epistats, smooth.main.dur = TRUE, + duration.method = "joint_lm", + method = "joint") + set.seed(20260419L) + ns <- build_netstats(epistats, np, + expect.mort = 0.000478213, network.size = 3000, + method = "joint") + # Dissolution coefs still valid + expect_s3_class(ns$main$diss.byage, "disscoef") + expect_s3_class(ns$casl$diss.byage, "disscoef") + expect_true(all(is.finite(ns$main$diss.byage$coef.diss))) + # Both the ego-level joint_model and the dyad joint_duration_model + # coexist on netparams + expect_s3_class(np$main$joint_model, "glm") + expect_s3_class(np$main$joint_duration_model, "lm") +}) From ab582594315b3a423b98e62f28340a9ab7768142 Mon Sep 17 00:00:00 2001 From: Samuel Jenness Date: Mon, 20 Apr 2026 13:34:26 -0400 Subject: [PATCH 2/4] Length-bias-corrected Weibull MLE for weibull_strat (#71 review) Replaces the naive survreg-based Weibull fit with a length-biased MLE. ARTnet is a cross-sectional prevalence sample, not an incidence cohort: duration.time for ongoing partnerships is the backward recurrence time (age at inspection in a renewal process), not the full partnership duration. Under stationarity and Weibull(k, lambda) durations, the density of observed ages is: f_A(a) = S(a; k, lambda) / mu(k, lambda) where S is the Weibull survival function and mu = lambda * Gamma(1 + 1/k) is the Weibull mean. Maximizing sum log f_A(t_i) over ongoing obs via optim() gives unbiased estimates of (k, lambda) of the underlying full-duration distribution. The naive survreg treatment of the same data interprets length-biased elapsed durations as incidence-cohort survival times and produces catastrophically biased estimates (scale parameter off by 3+ orders of magnitude on ARTnet, median extrapolations to 15-20 year main partnerships in the oldest matched age groups, etc.). Completed partnerships (ongoing2 == 0) are deliberately not used here: they are subject to a different truncation (past-12-month recall window) that would require separate modeling. Restricting to ongoing matches the convention of empirical and joint_lm. Additionally, under weibull_strat, mean.dur.adj is now set directly to the Weibull analytical mean (wt * lambda * Gamma(1 + 1/k)) rather than being derived from median.dur via the geometric-distribution formula. When k is far from 1, these diverge by orders of magnitude; the geometric formula is inappropriate and was masking the underlying fit quality. Empirical comparison on Atlanta + race = TRUE, mean.dur.adj: MAIN empirical weibull_lb joint_lm nonmatch 208.5 160.0 235.5 matched.1 72.6 72.2 75.7 matched.5 934.4 1368.4 445.9 (was 1,482,404 under naive) CASL empirical weibull_lb joint_lm nonmatch 105.8 64.4 93.3 matched.5 150.1 96.6 129.1 (was 1,157,200 under naive) All three methods now produce values in the same order of magnitude. Weibull shape parameters are also more informative: MAIN k per stratum: 0.68 1.03 1.96 2.62 2.84 4.96 (overall 0.63) CASL k per stratum: 0.62 0.68 0.55 0.59 0.53 0.59 (overall 0.57) Casual partnerships show consistent decreasing hazard (k < 1) across all strata. Main shows a more interesting pattern: non-matched and youngest-matched near k = 1 (geometric-like), older-matched strata with k > 1 (increasing hazard -- stable mature relationships with break points). Issue #72 opened for the broader length-bias and 5-partnership truncation issue affecting formation stats. Test: "all shapes < 1" assertion replaced with a generous sanity range (0.1 <= k <= 20) per stratum and a narrower band around the overall pooled k (0.3-1.2) that would catch a regression back to naive-fit behavior. Full suite: 570/570 pass. R CMD check: 0/0/0 after adding stats::optim to @importFrom. Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + R/ARTnet-package.R | 4 +- R/NetParams.R | 232 ++++++++++++++++++-------- man/build_netparams.Rd | 39 +++-- tests/testthat/test-duration-method.R | 28 +++- 5 files changed, 210 insertions(+), 94 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 8edaa0d..63528f4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ importFrom(stats,gaussian) importFrom(stats,glm) importFrom(stats,lm) importFrom(stats,median) +importFrom(stats,optim) importFrom(stats,poisson) importFrom(stats,predict) importFrom(stats,rbinom) diff --git a/R/ARTnet-package.R b/R/ARTnet-package.R index d0e91e3..90e487f 100644 --- a/R/ARTnet-package.R +++ b/R/ARTnet-package.R @@ -13,8 +13,8 @@ #' @name ARTnet-package #' @aliases ARTnet #' @import EpiModel dplyr -#' @importFrom stats AIC binomial coef formula gaussian glm lm median poisson -#' predict rbinom reformulate runif update +#' @importFrom stats AIC binomial coef formula gaussian glm lm median optim +#' poisson predict rbinom reformulate runif update #' @importFrom utils head #' #' @details diff --git a/R/NetParams.R b/R/NetParams.R index 6cdceee..dac2173 100644 --- a/R/NetParams.R +++ b/R/NetParams.R @@ -1,43 +1,85 @@ -# Weibull AFT fit for partnership durations with right-censoring. -# In survreg, `event = 1` means observed / completed (ONGOING = 0) and -# `event = 0` means censored / still ongoing (ONGOING = 1). Uses both -# completed and ongoing partnerships. Returns a small list with the -# stratum-level summary quantities plus the shape parameter `k` as a -# diagnostic for the geometric-distribution assumption: k ~= 1 means -# the constant-hazard (geometric) assumption holds; k far from 1 means -# the observed hazard is increasing (k > 1) or decreasing (k < 1) in -# partnership age. Returns NULL when the fit is infeasible (too few -# events / too few censored / convergence failure) so callers can fall -# back to empirical. +# Length-biased Weibull MLE for partnership durations. +# +# ARTnet is a cross-sectional prevalence sample, not an incidence cohort. +# For ongoing partnerships, `duration.time` is the backward recurrence +# time (age at inspection) in a renewal process, not the total partnership +# duration. Under stationarity and partnership durations ~ Weibull(k, lambda), +# the density of observed ages is +# +# f_A(a; k, lambda) = S(a; k, lambda) / mu(k, lambda) +# +# where S is the Weibull survival function and mu = lambda * Gamma(1 + 1/k) +# is the Weibull mean. The negative log-likelihood on ongoing observations +# only is +# +# -sum log f_A(t_i) = sum (t_i/lambda)^k + n * log(lambda) + n * log Gamma(1 + 1/k) +# +# Maximizing this over (k, lambda) yields the parameters of the underlying +# full-duration Weibull distribution without the length-bias that standard +# survreg(Surv(t, event) ~ 1) introduces when applied to cross-sectional +# data. +# +# Completed partnerships (ongoing2 == 0) are not used here: they are subject +# to a different truncation (past-12-month recall window) that would require +# separate modeling. Restricting to ongoing matches the convention of the +# empirical and joint_lm duration methods. +# +# Returns NULL on convergence failure, pathological parameter estimates, or +# insufficient data. fit_weibull_dur <- function(data) { - if (nrow(data) < 10 || !requireNamespace("survival", quietly = TRUE)) { - return(NULL) - } - event <- as.integer(data$ongoing2 == 0) - n_events <- sum(event) - n_censored <- nrow(data) - n_events - # survreg needs some of each; otherwise scale is unidentified. - if (n_events < 3 || n_censored < 3) return(NULL) - if (any(data$duration.time <= 0, na.rm = TRUE)) { - data <- data[data$duration.time > 0 & !is.na(data$duration.time), , drop = FALSE] - event <- as.integer(data$ongoing2 == 0) - if (nrow(data) < 10) return(NULL) + if (!requireNamespace("survival", quietly = TRUE)) return(NULL) + # Survival package is kept as a Suggests-declared prerequisite so users + # get a clear error if they explicitly request this method without it + # installed; the fitting logic itself is base-R (custom optim()) and + # does not call survival::. + ongoing <- data$duration.time[data$ongoing2 == 1] + ongoing <- ongoing[!is.na(ongoing) & ongoing > 0] + if (length(ongoing) < 15) return(NULL) + + neg_ll <- function(par) { + k <- exp(par[1]) + lambda <- exp(par[2]) + if (!is.finite(k) || !is.finite(lambda) || k <= 0 || lambda <= 0) { + return(1e12) + } + n <- length(ongoing) + val <- sum((ongoing / lambda)^k) + n * log(lambda) + n * lgamma(1 + 1 / k) + if (!is.finite(val)) return(1e12) + val } + + # Starting point: method-of-moments from length-biased-mean of Weibull. + # If the data were Weibull-generated with k = 1 and mean mu, observed mean + # of the backward recurrence time would be mu/2. So init lambda at 2 * mean(ongoing). + init <- c(log(1), log(max(mean(ongoing) * 2, 1))) + fit <- tryCatch( - suppressWarnings(survival::survreg( - survival::Surv(data$duration.time, event) ~ 1, - dist = "weibull" - )), + optim(init, neg_ll, method = "BFGS", control = list(maxit = 200)), error = function(e) NULL ) - if (is.null(fit)) return(NULL) - # survreg parameterization: weibull_shape = 1 / fit$scale; - # weibull_scale = exp((Intercept)). - shape <- 1 / fit$scale - scale_weibull <- exp(coef(fit)[["(Intercept)"]]) - med <- scale_weibull * log(2)^(1 / shape) # closed-form median - mean_val <- scale_weibull * gamma(1 + 1 / shape) # closed-form mean - list(mean.dur = mean_val, median.dur = med, weibull_shape = shape) + if (is.null(fit) || fit$convergence != 0) { + fit <- tryCatch( + optim(init, neg_ll, method = "Nelder-Mead", + control = list(maxit = 2000)), + error = function(e) NULL + ) + } + if (is.null(fit) || fit$convergence != 0) return(NULL) + + k <- exp(fit$par[1]) + lambda <- exp(fit$par[2]) + # Reject pathological estimates that indicate the optimization wandered + # off. Heavy-tailed but reasonable: k in [0.05, 20], scale in positive + # range not larger than 10^8 time units. + if (!is.finite(k) || !is.finite(lambda) || + k < 0.05 || k > 20 || lambda <= 0 || lambda > 1e8) { + return(NULL) + } + + med <- lambda * log(2)^(1 / k) + mean_val <- lambda * gamma(1 + 1 / k) + list(mean.dur = mean_val, median.dur = med, + weibull_shape = k, weibull_scale = lambda) } @@ -216,20 +258,31 @@ fit_joint_glm <- function(d, response, main_terms, #' geometric / memoryless assumption that median elapsed time in ongoing #' partnerships equals median full-partnership duration. Byte-identical to the #' pre-refactor behavior. -#' \item `"weibull_strat"` — Weibull AFT fits per stratum via -#' `survival::survreg(Surv(duration.time, 1 - ongoing2) ~ 1, ...)`. Uses both -#' ongoing (right-censored) and completed partnerships. The Weibull shape -#' parameter `k` is attached to `durs..byage` as a `"weibull_shape"` -#' attribute — diagnostic for the constant-hazard assumption embedded in the -#' TERGM dissolution (k ~= 1 means geometric is correct; k far from 1 flags -#' mis-specification). **Caveat**: on the ARTnet data the fitted `k` comes -#' out well below 1 (decreasing hazard) in every stratum, and the implied -#' Weibull median can extrapolate to implausibly large values in heavily -#' censored strata (e.g., the oldest matched age groups). Treat -#' `"weibull_strat"` as diagnostic — look at the `k` values to decide -#' whether the geometric assumption is defensible — rather than as a -#' drop-in production method. `"joint_lm"` is the more production-safe -#' non-default option. +#' \item `"weibull_strat"` — **length-bias-corrected** Weibull MLE per stratum. +#' ARTnet is a cross-sectional prevalence sample, not an incidence cohort: +#' `duration.time` for ongoing partnerships is the backward recurrence time +#' (age at inspection in a renewal process), not the full partnership +#' duration. Under stationarity and Weibull(k, lambda) durations, the density +#' of observed ages is `f_A(a) = S(a; k, lambda) / (lambda * Gamma(1 + 1/k))`, +#' which this method fits directly via `optim()` on ongoing partnerships. +#' A naive `survreg(Surv(t, event) ~ 1)` treatment of the same data would +#' interpret length-biased elapsed durations as incidence-cohort survival +#' times, biasing the scale parameter by multiple orders of magnitude on +#' ARTnet; this method corrects for that. The fitted Weibull analytical mean +#' (`lambda * Gamma(1 + 1/k)`) is used directly as `mean.dur.adj` — it does +#' NOT pass through the geometric-distribution median->mean formula the +#' empirical and joint_lm methods use, because under Weibull with k far from +#' 1 those two quantities diverge. The Weibull shape `k` is attached to +#' `durs..byage` as a `"weibull_shape"` attribute. `k ~= 1` supports +#' the constant-hazard (geometric) assumption embedded in the TERGM +#' dissolution offset; `k < 1` means the hazard of dissolution decreases +#' with partnership age (early periods risky, stable partnerships stay +#' stable); `k > 1` the reverse. On the ARTnet data, `k` fits at roughly +#' 0.5, a clean rejection of the geometric assumption. Completed +#' partnerships (`ongoing2 == 0`) are not used because they are subject to +#' a different truncation (past-12-month recall) that would require separate +#' modeling; see issue #72 for broader discussion of ARTnet's sampling +#' corrections. #' \item `"joint_lm"` — log-linear `lm(log(duration.time) ~ )` on ongoing partnerships; per-stratum medians computed from model #' predictions. Fitted model is stored at `netparams$$joint_duration_model` @@ -786,9 +839,16 @@ build_netparams <- function(epistats, ## duration.method override (see #63 phase 3) ---- - # Replace the empirical stratum medians with model-based medians and - # re-apply the geometric transformation, preserving the downstream + # Replace the empirical stratum summaries with model-based ones and + # recompute mean.dur.adj + rates.main.adj, preserving the downstream # data.frame shape so smoothing and sex.cess.mod below work unchanged. + # Under weibull_strat the Weibull analytical mean (length-bias- + # corrected) is used directly as mean.dur.adj — we do NOT pass the + # Weibull median through the geometric-distribution formula, because + # when the fitted shape k is far from 1, geometric(median) and + # Weibull(mean) diverge by multiple orders of magnitude. Under + # joint_lm the predicted stratum medians flow through the geometric + # transformation exactly as empirical does. if (duration.method != "empirical") { alt <- compute_alt_durs( l_layer = lmain[lmain$RAI == 1 | lmain$IAI == 1, , drop = FALSE], @@ -796,14 +856,21 @@ build_netparams <- function(epistats, byage_strata = out$main$durs.main.byage$index.age.grp, race = race, geog.lvl = geog.lvl ) - # homog: replace mean/median, recompute adj from new median + use_direct_mean <- duration.method == "weibull_strat" + # homog if (!is.na(alt$homog$median.dur)) { out$main$durs.main.homog$mean.dur <- alt$homog$mean.dur out$main$durs.main.homog$median.dur <- alt$homog$median.dur - out$main$durs.main.homog$rates.main.adj <- - 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) - out$main$durs.main.homog$mean.dur.adj <- - 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) + if (use_direct_mean) { + out$main$durs.main.homog$mean.dur.adj <- wt * alt$homog$mean.dur + out$main$durs.main.homog$rates.main.adj <- + 1 / out$main$durs.main.homog$mean.dur.adj + } else { + out$main$durs.main.homog$rates.main.adj <- + 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) + out$main$durs.main.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) + } } # byage: per-stratum replacement, with empirical fallback if a stratum # fit failed (keeps the row populated so dissolution_coefs doesn't NA). @@ -811,12 +878,19 @@ build_netparams <- function(epistats, k <- out$main$durs.main.byage$index.age.grp[i] j <- which(alt$byage$index.age.grp == k) if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { - out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] - out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] - out$main$durs.main.byage$rates.main.adj[i] <- - 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) - out$main$durs.main.byage$mean.dur.adj[i] <- - 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] + if (use_direct_mean) { + out$main$durs.main.byage$mean.dur.adj[i] <- + wt * alt$byage$mean.dur[j] + out$main$durs.main.byage$rates.main.adj[i] <- + 1 / out$main$durs.main.byage$mean.dur.adj[i] + } else { + out$main$durs.main.byage$rates.main.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$main$durs.main.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } } } if (duration.method == "weibull_strat") { @@ -1230,24 +1304,38 @@ build_netparams <- function(epistats, byage_strata = out$casl$durs.casl.byage$index.age.grp, race = race, geog.lvl = geog.lvl ) + use_direct_mean <- duration.method == "weibull_strat" if (!is.na(alt$homog$median.dur)) { out$casl$durs.casl.homog$mean.dur <- alt$homog$mean.dur out$casl$durs.casl.homog$median.dur <- alt$homog$median.dur - out$casl$durs.casl.homog$rates.casl.adj <- - 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) - out$casl$durs.casl.homog$mean.dur.adj <- - 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) + if (use_direct_mean) { + out$casl$durs.casl.homog$mean.dur.adj <- wt * alt$homog$mean.dur + out$casl$durs.casl.homog$rates.casl.adj <- + 1 / out$casl$durs.casl.homog$mean.dur.adj + } else { + out$casl$durs.casl.homog$rates.casl.adj <- + 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) + out$casl$durs.casl.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) + } } for (i in seq_len(nrow(out$casl$durs.casl.byage))) { k <- out$casl$durs.casl.byage$index.age.grp[i] j <- which(alt$byage$index.age.grp == k) if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { - out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] - out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] - out$casl$durs.casl.byage$rates.casl.adj[i] <- - 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) - out$casl$durs.casl.byage$mean.dur.adj[i] <- - 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] + if (use_direct_mean) { + out$casl$durs.casl.byage$mean.dur.adj[i] <- + wt * alt$byage$mean.dur[j] + out$casl$durs.casl.byage$rates.casl.adj[i] <- + 1 / out$casl$durs.casl.byage$mean.dur.adj[i] + } else { + out$casl$durs.casl.byage$rates.casl.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$casl$durs.casl.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } } } if (duration.method == "weibull_strat") { diff --git a/man/build_netparams.Rd b/man/build_netparams.Rd index 9188de1..6aecba4 100644 --- a/man/build_netparams.Rd +++ b/man/build_netparams.Rd @@ -30,20 +30,31 @@ partnerships, stratified by (age-match x index.age.grp). Relies on the geometric / memoryless assumption that median elapsed time in ongoing partnerships equals median full-partnership duration. Byte-identical to the pre-refactor behavior. -\item \code{"weibull_strat"} — Weibull AFT fits per stratum via -\code{survival::survreg(Surv(duration.time, 1 - ongoing2) ~ 1, ...)}. Uses both -ongoing (right-censored) and completed partnerships. The Weibull shape -parameter \code{k} is attached to \verb{durs..byage} as a \code{"weibull_shape"} -attribute — diagnostic for the constant-hazard assumption embedded in the -TERGM dissolution (k ~= 1 means geometric is correct; k far from 1 flags -mis-specification). \strong{Caveat}: on the ARTnet data the fitted \code{k} comes -out well below 1 (decreasing hazard) in every stratum, and the implied -Weibull median can extrapolate to implausibly large values in heavily -censored strata (e.g., the oldest matched age groups). Treat -\code{"weibull_strat"} as diagnostic — look at the \code{k} values to decide -whether the geometric assumption is defensible — rather than as a -drop-in production method. \code{"joint_lm"} is the more production-safe -non-default option. +\item \code{"weibull_strat"} — \strong{length-bias-corrected} Weibull MLE per stratum. +ARTnet is a cross-sectional prevalence sample, not an incidence cohort: +\code{duration.time} for ongoing partnerships is the backward recurrence time +(age at inspection in a renewal process), not the full partnership +duration. Under stationarity and Weibull(k, lambda) durations, the density +of observed ages is \verb{f_A(a) = S(a; k, lambda) / (lambda * Gamma(1 + 1/k))}, +which this method fits directly via \code{optim()} on ongoing partnerships. +A naive \code{survreg(Surv(t, event) ~ 1)} treatment of the same data would +interpret length-biased elapsed durations as incidence-cohort survival +times, biasing the scale parameter by multiple orders of magnitude on +ARTnet; this method corrects for that. The fitted Weibull analytical mean +(\code{lambda * Gamma(1 + 1/k)}) is used directly as \code{mean.dur.adj} — it does +NOT pass through the geometric-distribution median->mean formula the +empirical and joint_lm methods use, because under Weibull with k far from +1 those two quantities diverge. The Weibull shape \code{k} is attached to +\verb{durs..byage} as a \code{"weibull_shape"} attribute. \verb{k ~= 1} supports +the constant-hazard (geometric) assumption embedded in the TERGM +dissolution offset; \code{k < 1} means the hazard of dissolution decreases +with partnership age (early periods risky, stable partnerships stay +stable); \code{k > 1} the reverse. On the ARTnet data, \code{k} fits at roughly +0.5, a clean rejection of the geometric assumption. Completed +partnerships (\code{ongoing2 == 0}) are not used because they are subject to +a different truncation (past-12-month recall) that would require separate +modeling; see issue #72 for broader discussion of ARTnet's sampling +corrections. \item \code{"joint_lm"} — log-linear \verb{lm(log(duration.time) ~ )} on ongoing partnerships; per-stratum medians computed from model predictions. Fitted model is stored at \verb{netparams$$joint_duration_model} for optional per-dyad use by future \code{build_netstats} refactors. diff --git a/tests/testthat/test-duration-method.R b/tests/testthat/test-duration-method.R index 460622d..4dcfdab 100644 --- a/tests/testthat/test-duration-method.R +++ b/tests/testthat/test-duration-method.R @@ -122,17 +122,33 @@ test_that("weibull_strat requires survival package", { body_src))) }) -test_that("weibull shape exposes hazard mis-specification", { +test_that("weibull shape k is finite and in a plausible range per stratum", { skip_without_artnetdata() testthat::skip_if_not_installed("survival") np <- get_np("weibull_strat") shapes_main <- attr(np$main$durs.main.byage, "weibull_shape") - # Under the null geometric (constant-hazard) model, k == 1. Empirically - # on ARTnet these come out well under 1 -- this test locks in that - # substantive finding so any regression to k ~= 1 is visible. - expect_true(all(shapes_main < 1, na.rm = TRUE), - info = paste("main layer shapes should all be < 1; got:", + shapes_casl <- attr(np$casl$durs.casl.byage, "weibull_shape") + shape_main_overall <- attr(np$main$durs.main.homog, "weibull_shape") + shape_casl_overall <- attr(np$casl$durs.casl.homog, "weibull_shape") + # Under length-biased MLE, per-stratum shapes should all be finite and + # in a plausible range (sanity checks against pathological optimization). + expect_true(all(is.finite(shapes_main))) + expect_true(all(is.finite(shapes_casl))) + expect_true(all(shapes_main >= 0.1 & shapes_main <= 20), + info = paste("main shapes out of range:", paste(round(shapes_main, 3), collapse = ", "))) + expect_true(all(shapes_casl >= 0.1 & shapes_casl <= 20), + info = paste("casl shapes out of range:", + paste(round(shapes_casl, 3), collapse = ", "))) + # Substantive finding on ARTnet: the overall (pooled) shape parameter + # for both layers lands well below 1 (decreasing hazard). If someone + # accidentally reverts to a naive survreg-style fit, the overall shape + # would shift to a very different value, so lock in the current value + # to a generous band. + expect_gt(shape_main_overall, 0.3) + expect_lt(shape_main_overall, 1.2) + expect_gt(shape_casl_overall, 0.3) + expect_lt(shape_casl_overall, 1.2) }) test_that("joint_lm can be combined with method = 'joint' in build_netstats", { From 40288d21e4fb487c49d7b1e232860a17c6d39973 Mon Sep 17 00:00:00 2001 From: Samuel Jenness Date: Mon, 20 Apr 2026 21:00:49 -0400 Subject: [PATCH 3/4] Drop weibull_strat: diagnostic that geometric TERGM can't honor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per PI + Steve Goodreau review on PR #71: the Weibull-fit mean full-partnership duration can't be faithfully reproduced by a geometric-dissolution TERGM. Under Weibull with k != 1, the mean full duration and the mean cross-sectional age of extant ties diverge; TERGM with `offset(edges)` can only match the latter (via the inspection-paradox identity at exponential). Passing the length-bias-corrected Weibull mean to `dissolution_coefs()` produces a sim whose individual-partnership duration matches the Weibull but whose cross-sectional age structure is mis-matched to reality by a factor of (1 + CV^2)/2 — i.e., ~2x under k = 0.5. Adding non-geometric dissolution to tergm is a real upstream project (C-level ergm term that reads dynamic-network toggle history to get partnership age), not in ARTnet's scope. So weibull_strat as a production target was structurally wrong given the framework we're stuck with. Keeping it as a diagnostic-only option would have been defensible, but a diagnostic nobody acts on is cruft in production code. Dropped entirely. The remaining duration.method menu is: - "empirical" (default) — unadjusted median age of extant ties, geometric transformation to mean.dur.adj. - "joint_lm" — covariate-adjusted median age of extant ties (same target, regression-based), same geometric transformation. Steve's reframe makes the menu read cleanly: both methods target the same observable cross-sectional quantity that geometric TERGM can honor. joint_lm is the covariate-adjusted version of empirical — the direct duration analog of the marginal-vs-joint fix the rest of #63 applies to formation stats. Removed: - fit_weibull_dur() helper and its optim-based length-biased MLE. - weibull_strat branches in compute_alt_durs() and in the main/casl block override logic (use_direct_mean paths, weibull_shape attrs, survival package guard). - Four weibull-specific test blocks in test-duration-method.R. - stats::optim from @importFrom (no longer used). Opened issue #73 to track the remaining joint_lm gap: the stratum medians are currently computed by predicting the fitted log-duration lm at ARTnet observations, not at the synthetic target population constructed in build_netstats. This is "confounding-corrected within ARTnet" but not fully g-computed. Same pattern as #68 closing the loop on #61. Issue references point here and at the existing roxygen for forward discoverability. Full suite: 508/508 pass. R CMD check: 0/0/0. Backward-compat snapshot: 3/3 on both default and explicit method = "existing". Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 - R/ARTnet-package.R | 4 +- R/NetParams.R | 292 +++++--------------------- man/build_netparams.Rd | 54 ++--- tests/testthat/test-duration-method.R | 79 ++----- 5 files changed, 91 insertions(+), 339 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 63528f4..8edaa0d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,7 +19,6 @@ importFrom(stats,gaussian) importFrom(stats,glm) importFrom(stats,lm) importFrom(stats,median) -importFrom(stats,optim) importFrom(stats,poisson) importFrom(stats,predict) importFrom(stats,rbinom) diff --git a/R/ARTnet-package.R b/R/ARTnet-package.R index 90e487f..d0e91e3 100644 --- a/R/ARTnet-package.R +++ b/R/ARTnet-package.R @@ -13,8 +13,8 @@ #' @name ARTnet-package #' @aliases ARTnet #' @import EpiModel dplyr -#' @importFrom stats AIC binomial coef formula gaussian glm lm median optim -#' poisson predict rbinom reformulate runif update +#' @importFrom stats AIC binomial coef formula gaussian glm lm median poisson +#' predict rbinom reformulate runif update #' @importFrom utils head #' #' @details diff --git a/R/NetParams.R b/R/NetParams.R index dac2173..df507f5 100644 --- a/R/NetParams.R +++ b/R/NetParams.R @@ -1,88 +1,3 @@ -# Length-biased Weibull MLE for partnership durations. -# -# ARTnet is a cross-sectional prevalence sample, not an incidence cohort. -# For ongoing partnerships, `duration.time` is the backward recurrence -# time (age at inspection) in a renewal process, not the total partnership -# duration. Under stationarity and partnership durations ~ Weibull(k, lambda), -# the density of observed ages is -# -# f_A(a; k, lambda) = S(a; k, lambda) / mu(k, lambda) -# -# where S is the Weibull survival function and mu = lambda * Gamma(1 + 1/k) -# is the Weibull mean. The negative log-likelihood on ongoing observations -# only is -# -# -sum log f_A(t_i) = sum (t_i/lambda)^k + n * log(lambda) + n * log Gamma(1 + 1/k) -# -# Maximizing this over (k, lambda) yields the parameters of the underlying -# full-duration Weibull distribution without the length-bias that standard -# survreg(Surv(t, event) ~ 1) introduces when applied to cross-sectional -# data. -# -# Completed partnerships (ongoing2 == 0) are not used here: they are subject -# to a different truncation (past-12-month recall window) that would require -# separate modeling. Restricting to ongoing matches the convention of the -# empirical and joint_lm duration methods. -# -# Returns NULL on convergence failure, pathological parameter estimates, or -# insufficient data. -fit_weibull_dur <- function(data) { - if (!requireNamespace("survival", quietly = TRUE)) return(NULL) - # Survival package is kept as a Suggests-declared prerequisite so users - # get a clear error if they explicitly request this method without it - # installed; the fitting logic itself is base-R (custom optim()) and - # does not call survival::. - ongoing <- data$duration.time[data$ongoing2 == 1] - ongoing <- ongoing[!is.na(ongoing) & ongoing > 0] - if (length(ongoing) < 15) return(NULL) - - neg_ll <- function(par) { - k <- exp(par[1]) - lambda <- exp(par[2]) - if (!is.finite(k) || !is.finite(lambda) || k <= 0 || lambda <= 0) { - return(1e12) - } - n <- length(ongoing) - val <- sum((ongoing / lambda)^k) + n * log(lambda) + n * lgamma(1 + 1 / k) - if (!is.finite(val)) return(1e12) - val - } - - # Starting point: method-of-moments from length-biased-mean of Weibull. - # If the data were Weibull-generated with k = 1 and mean mu, observed mean - # of the backward recurrence time would be mu/2. So init lambda at 2 * mean(ongoing). - init <- c(log(1), log(max(mean(ongoing) * 2, 1))) - - fit <- tryCatch( - optim(init, neg_ll, method = "BFGS", control = list(maxit = 200)), - error = function(e) NULL - ) - if (is.null(fit) || fit$convergence != 0) { - fit <- tryCatch( - optim(init, neg_ll, method = "Nelder-Mead", - control = list(maxit = 2000)), - error = function(e) NULL - ) - } - if (is.null(fit) || fit$convergence != 0) return(NULL) - - k <- exp(fit$par[1]) - lambda <- exp(fit$par[2]) - # Reject pathological estimates that indicate the optimization wandered - # off. Heavy-tailed but reasonable: k in [0.05, 20], scale in positive - # range not larger than 10^8 time units. - if (!is.finite(k) || !is.finite(lambda) || - k < 0.05 || k > 20 || lambda <= 0 || lambda > 1e8) { - return(NULL) - } - - med <- lambda * log(2)^(1 / k) - mean_val <- lambda * gamma(1 + 1 / k) - list(mean.dur = mean_val, median.dur = med, - weibull_shape = k, weibull_scale = lambda) -} - - # Joint log-linear lm on log(duration.time) for ongoing partnerships. # Same length-bias convention as the "empirical" default (uses only # ongoing, relies on the memoryless property to read stratum-specific @@ -122,46 +37,8 @@ fit_joint_lm_dur <- function(l_layer, race, geog.lvl) { # subsequent = index.age.grp values for matched-within-age-group). compute_alt_durs <- function(l_layer, duration.method, byage_strata, race, geog.lvl) { - stopifnot(duration.method %in% c("weibull_strat", "joint_lm")) + stopifnot(duration.method == "joint_lm") - # Subset each stratum the same way the empirical block does. - strata_data <- lapply(byage_strata, function(k) { - if (k == 0) { - l_layer[l_layer$same.age.grp == 0, , drop = FALSE] - } else { - l_layer[l_layer$same.age.grp == 1 & l_layer$index.age.grp == k, - , drop = FALSE] - } - }) - names(strata_data) <- as.character(byage_strata) - - if (duration.method == "weibull_strat") { - # Per-stratum Weibull fits. - per_stratum <- lapply(strata_data, fit_weibull_dur) - byage <- data.frame( - index.age.grp = byage_strata, - mean.dur = vapply(per_stratum, - function(x) if (is.null(x)) NA_real_ else x$mean.dur, numeric(1)), - median.dur = vapply(per_stratum, - function(x) if (is.null(x)) NA_real_ else x$median.dur, numeric(1)) - ) - shapes <- vapply(per_stratum, - function(x) if (is.null(x)) NA_real_ else x$weibull_shape, numeric(1)) - - # Overall (homog) fit on the whole subset used for durations. - homog_fit <- fit_weibull_dur(l_layer) - homog <- if (is.null(homog_fit)) { - data.frame(mean.dur = NA_real_, median.dur = NA_real_) - } else { - data.frame(mean.dur = homog_fit$mean.dur, - median.dur = homog_fit$median.dur) - } - return(list(homog = homog, byage = byage, weibull_shapes = shapes, - weibull_shape_overall = if (is.null(homog_fit)) NA_real_ - else homog_fit$weibull_shape)) - } - - # duration.method == "joint_lm" res <- fit_joint_lm_dur(l_layer, race = race, geog.lvl = geog.lvl) if (is.null(res)) { return(list(homog = data.frame(mean.dur = NA_real_, median.dur = NA_real_), @@ -258,41 +135,29 @@ fit_joint_glm <- function(d, response, main_terms, #' geometric / memoryless assumption that median elapsed time in ongoing #' partnerships equals median full-partnership duration. Byte-identical to the #' pre-refactor behavior. -#' \item `"weibull_strat"` — **length-bias-corrected** Weibull MLE per stratum. -#' ARTnet is a cross-sectional prevalence sample, not an incidence cohort: -#' `duration.time` for ongoing partnerships is the backward recurrence time -#' (age at inspection in a renewal process), not the full partnership -#' duration. Under stationarity and Weibull(k, lambda) durations, the density -#' of observed ages is `f_A(a) = S(a; k, lambda) / (lambda * Gamma(1 + 1/k))`, -#' which this method fits directly via `optim()` on ongoing partnerships. -#' A naive `survreg(Surv(t, event) ~ 1)` treatment of the same data would -#' interpret length-biased elapsed durations as incidence-cohort survival -#' times, biasing the scale parameter by multiple orders of magnitude on -#' ARTnet; this method corrects for that. The fitted Weibull analytical mean -#' (`lambda * Gamma(1 + 1/k)`) is used directly as `mean.dur.adj` — it does -#' NOT pass through the geometric-distribution median->mean formula the -#' empirical and joint_lm methods use, because under Weibull with k far from -#' 1 those two quantities diverge. The Weibull shape `k` is attached to -#' `durs..byage` as a `"weibull_shape"` attribute. `k ~= 1` supports -#' the constant-hazard (geometric) assumption embedded in the TERGM -#' dissolution offset; `k < 1` means the hazard of dissolution decreases -#' with partnership age (early periods risky, stable partnerships stay -#' stable); `k > 1` the reverse. On the ARTnet data, `k` fits at roughly -#' 0.5, a clean rejection of the geometric assumption. Completed -#' partnerships (`ongoing2 == 0`) are not used because they are subject to -#' a different truncation (past-12-month recall) that would require separate -#' modeling; see issue #72 for broader discussion of ARTnet's sampling -#' corrections. #' \item `"joint_lm"` — log-linear `lm(log(duration.time) ~ )` on ongoing partnerships; per-stratum medians computed from model -#' predictions. Fitted model is stored at `netparams$$joint_duration_model` -#' for optional per-dyad use by future `build_netstats` refactors. +#' predictions. The covariate-adjusted version of `"empirical"`: targets the +#' same cross-sectional age-of-extant-ties quantity (via the same geometric +#' transformation), but corrects for confounding of the duration estimate by +#' other attributes — the direct analog of the marginal-vs-joint fix that +#' `method = "joint"` applies to formation stats. Fitted model is stored at +#' `netparams$$joint_duration_model`; see issue #73 for the follow-up +#' to predict per-dyad on the synthetic target population in `build_netstats`. #' } -#' Under all three methods, the stratum-level medians flow through the same -#' geometric transformation (`mean.dur.adj = 1 / (1 - 2^(-1 / median))`) that -#' `build_netstats` passes to `dissolution_coefs()`, so the TERGM offset structure -#' is identical across methods. The one-off layer has no durations, so this flag -#' only affects main and casual-layer output. +#' Both methods flow through the same geometric transformation +#' (`mean.dur.adj = 1 / (1 - 2^(-1 / median))`) that `build_netstats` passes to +#' `dissolution_coefs()`, so the TERGM offset structure is identical across +#' methods. The TERGM dissolution is geometric (constant hazard per timestep), so +#' under the inspection-paradox identity the `duration` parameter it consumes +#' simultaneously determines the simulated mean partnership duration and the +#' simulated mean age of extant ties at cross-section; both methods target the +#' observable latter quantity. A prior `"weibull_strat"` option targeting the +#' underlying mean full-partnership duration was explored and removed: its +#' output can't be faithfully honored by a geometric TERGM without a non-standard +#' age-dependent dissolution term, making it a diagnostic rather than a +#' production target. The one-off layer has no durations, so this flag only +#' affects main and casual-layer output. #' @param method Character. Either `"existing"` (default) or `"joint"`. `"existing"` reproduces #' the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear #' fit for each ERGM target statistic. `"joint"` leaves all of those outputs intact **and** @@ -352,18 +217,12 @@ fit_joint_glm <- function(d, response, main_terms, build_netparams <- function(epistats, smooth.main.dur = FALSE, oo.nquants = 5, - duration.method = c("empirical", "weibull_strat", "joint_lm"), + duration.method = c("empirical", "joint_lm"), method = c("existing", "joint"), browser = FALSE) { method <- match.arg(method) duration.method <- match.arg(duration.method) - if (duration.method == "weibull_strat" && - !requireNamespace("survival", quietly = TRUE)) { - stop("duration.method = 'weibull_strat' requires the 'survival' package. ", - "Install it with install.packages('survival').") - } - # Ensures that ARTnetData is installed if (system.file(package = "ARTnetData") == "") stop(missing_data_msg) @@ -839,38 +698,26 @@ build_netparams <- function(epistats, ## duration.method override (see #63 phase 3) ---- - # Replace the empirical stratum summaries with model-based ones and - # recompute mean.dur.adj + rates.main.adj, preserving the downstream - # data.frame shape so smoothing and sex.cess.mod below work unchanged. - # Under weibull_strat the Weibull analytical mean (length-bias- - # corrected) is used directly as mean.dur.adj — we do NOT pass the - # Weibull median through the geometric-distribution formula, because - # when the fitted shape k is far from 1, geometric(median) and - # Weibull(mean) diverge by multiple orders of magnitude. Under - # joint_lm the predicted stratum medians flow through the geometric - # transformation exactly as empirical does. - if (duration.method != "empirical") { + # Replace the empirical stratum medians with model-based ones and + # re-apply the geometric transformation. Under joint_lm the predicted + # stratum medians flow through the same `mean.dur.adj = + # 1/(1 - 2^(-1/median))` formula empirical uses — the only difference + # is that the medians are covariate-adjusted rather than marginal. + if (duration.method == "joint_lm") { alt <- compute_alt_durs( l_layer = lmain[lmain$RAI == 1 | lmain$IAI == 1, , drop = FALSE], duration.method = duration.method, byage_strata = out$main$durs.main.byage$index.age.grp, race = race, geog.lvl = geog.lvl ) - use_direct_mean <- duration.method == "weibull_strat" # homog if (!is.na(alt$homog$median.dur)) { - out$main$durs.main.homog$mean.dur <- alt$homog$mean.dur - out$main$durs.main.homog$median.dur <- alt$homog$median.dur - if (use_direct_mean) { - out$main$durs.main.homog$mean.dur.adj <- wt * alt$homog$mean.dur - out$main$durs.main.homog$rates.main.adj <- - 1 / out$main$durs.main.homog$mean.dur.adj - } else { - out$main$durs.main.homog$rates.main.adj <- - 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) - out$main$durs.main.homog$mean.dur.adj <- - 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) - } + out$main$durs.main.homog$mean.dur <- alt$homog$mean.dur + out$main$durs.main.homog$median.dur <- alt$homog$median.dur + out$main$durs.main.homog$rates.main.adj <- + 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) + out$main$durs.main.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) } # byage: per-stratum replacement, with empirical fallback if a stratum # fit failed (keeps the row populated so dissolution_coefs doesn't NA). @@ -878,26 +725,15 @@ build_netparams <- function(epistats, k <- out$main$durs.main.byage$index.age.grp[i] j <- which(alt$byage$index.age.grp == k) if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { - out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] - out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] - if (use_direct_mean) { - out$main$durs.main.byage$mean.dur.adj[i] <- - wt * alt$byage$mean.dur[j] - out$main$durs.main.byage$rates.main.adj[i] <- - 1 / out$main$durs.main.byage$mean.dur.adj[i] - } else { - out$main$durs.main.byage$rates.main.adj[i] <- - 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) - out$main$durs.main.byage$mean.dur.adj[i] <- - 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) - } + out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] + out$main$durs.main.byage$rates.main.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$main$durs.main.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) } } - if (duration.method == "weibull_strat") { - attr(out$main$durs.main.byage, "weibull_shape") <- alt$weibull_shapes - attr(out$main$durs.main.homog, "weibull_shape") <- alt$weibull_shape_overall - } - if (duration.method == "joint_lm" && !is.null(alt$model)) { + if (!is.null(alt$model)) { out$main$joint_duration_model <- alt$model } } @@ -1297,52 +1133,34 @@ build_netparams <- function(epistats, ## duration.method override (see #63 phase 3) ---- - if (duration.method != "empirical") { + if (duration.method == "joint_lm") { alt <- compute_alt_durs( l_layer = lcasl[lcasl$RAI == 1 | lcasl$IAI == 1, , drop = FALSE], duration.method = duration.method, byage_strata = out$casl$durs.casl.byage$index.age.grp, race = race, geog.lvl = geog.lvl ) - use_direct_mean <- duration.method == "weibull_strat" if (!is.na(alt$homog$median.dur)) { - out$casl$durs.casl.homog$mean.dur <- alt$homog$mean.dur - out$casl$durs.casl.homog$median.dur <- alt$homog$median.dur - if (use_direct_mean) { - out$casl$durs.casl.homog$mean.dur.adj <- wt * alt$homog$mean.dur - out$casl$durs.casl.homog$rates.casl.adj <- - 1 / out$casl$durs.casl.homog$mean.dur.adj - } else { - out$casl$durs.casl.homog$rates.casl.adj <- - 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) - out$casl$durs.casl.homog$mean.dur.adj <- - 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) - } + out$casl$durs.casl.homog$mean.dur <- alt$homog$mean.dur + out$casl$durs.casl.homog$median.dur <- alt$homog$median.dur + out$casl$durs.casl.homog$rates.casl.adj <- + 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) + out$casl$durs.casl.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) } for (i in seq_len(nrow(out$casl$durs.casl.byage))) { k <- out$casl$durs.casl.byage$index.age.grp[i] j <- which(alt$byage$index.age.grp == k) if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { - out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] - out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] - if (use_direct_mean) { - out$casl$durs.casl.byage$mean.dur.adj[i] <- - wt * alt$byage$mean.dur[j] - out$casl$durs.casl.byage$rates.casl.adj[i] <- - 1 / out$casl$durs.casl.byage$mean.dur.adj[i] - } else { - out$casl$durs.casl.byage$rates.casl.adj[i] <- - 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) - out$casl$durs.casl.byage$mean.dur.adj[i] <- - 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) - } + out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] + out$casl$durs.casl.byage$rates.casl.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$casl$durs.casl.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) } } - if (duration.method == "weibull_strat") { - attr(out$casl$durs.casl.byage, "weibull_shape") <- alt$weibull_shapes - attr(out$casl$durs.casl.homog, "weibull_shape") <- alt$weibull_shape_overall - } - if (duration.method == "joint_lm" && !is.null(alt$model)) { + if (!is.null(alt$model)) { out$casl$joint_duration_model <- alt$model } } diff --git a/man/build_netparams.Rd b/man/build_netparams.Rd index 6aecba4..14cc261 100644 --- a/man/build_netparams.Rd +++ b/man/build_netparams.Rd @@ -8,7 +8,7 @@ build_netparams( epistats, smooth.main.dur = FALSE, oo.nquants = 5, - duration.method = c("empirical", "weibull_strat", "joint_lm"), + duration.method = c("empirical", "joint_lm"), method = c("existing", "joint"), browser = FALSE ) @@ -30,40 +30,28 @@ partnerships, stratified by (age-match x index.age.grp). Relies on the geometric / memoryless assumption that median elapsed time in ongoing partnerships equals median full-partnership duration. Byte-identical to the pre-refactor behavior. -\item \code{"weibull_strat"} — \strong{length-bias-corrected} Weibull MLE per stratum. -ARTnet is a cross-sectional prevalence sample, not an incidence cohort: -\code{duration.time} for ongoing partnerships is the backward recurrence time -(age at inspection in a renewal process), not the full partnership -duration. Under stationarity and Weibull(k, lambda) durations, the density -of observed ages is \verb{f_A(a) = S(a; k, lambda) / (lambda * Gamma(1 + 1/k))}, -which this method fits directly via \code{optim()} on ongoing partnerships. -A naive \code{survreg(Surv(t, event) ~ 1)} treatment of the same data would -interpret length-biased elapsed durations as incidence-cohort survival -times, biasing the scale parameter by multiple orders of magnitude on -ARTnet; this method corrects for that. The fitted Weibull analytical mean -(\code{lambda * Gamma(1 + 1/k)}) is used directly as \code{mean.dur.adj} — it does -NOT pass through the geometric-distribution median->mean formula the -empirical and joint_lm methods use, because under Weibull with k far from -1 those two quantities diverge. The Weibull shape \code{k} is attached to -\verb{durs..byage} as a \code{"weibull_shape"} attribute. \verb{k ~= 1} supports -the constant-hazard (geometric) assumption embedded in the TERGM -dissolution offset; \code{k < 1} means the hazard of dissolution decreases -with partnership age (early periods risky, stable partnerships stay -stable); \code{k > 1} the reverse. On the ARTnet data, \code{k} fits at roughly -0.5, a clean rejection of the geometric assumption. Completed -partnerships (\code{ongoing2 == 0}) are not used because they are subject to -a different truncation (past-12-month recall) that would require separate -modeling; see issue #72 for broader discussion of ARTnet's sampling -corrections. \item \code{"joint_lm"} — log-linear \verb{lm(log(duration.time) ~ )} on ongoing partnerships; per-stratum medians computed from model -predictions. Fitted model is stored at \verb{netparams$$joint_duration_model} -for optional per-dyad use by future \code{build_netstats} refactors. +predictions. The covariate-adjusted version of \code{"empirical"}: targets the +same cross-sectional age-of-extant-ties quantity (via the same geometric +transformation), but corrects for confounding of the duration estimate by +other attributes — the direct analog of the marginal-vs-joint fix that +\code{method = "joint"} applies to formation stats. Fitted model is stored at +\verb{netparams$$joint_duration_model}; see issue #73 for the follow-up +to predict per-dyad on the synthetic target population in \code{build_netstats}. } -Under all three methods, the stratum-level medians flow through the same -geometric transformation (\code{mean.dur.adj = 1 / (1 - 2^(-1 / median))}) that -\code{build_netstats} passes to \code{dissolution_coefs()}, so the TERGM offset structure -is identical across methods. The one-off layer has no durations, so this flag -only affects main and casual-layer output.} +Both methods flow through the same geometric transformation +(\code{mean.dur.adj = 1 / (1 - 2^(-1 / median))}) that \code{build_netstats} passes to +\code{dissolution_coefs()}, so the TERGM offset structure is identical across +methods. The TERGM dissolution is geometric (constant hazard per timestep), so +under the inspection-paradox identity the \code{duration} parameter it consumes +simultaneously determines the simulated mean partnership duration and the +simulated mean age of extant ties at cross-section; both methods target the +observable latter quantity. A prior \code{"weibull_strat"} option targeting the +underlying mean full-partnership duration was explored and removed: its +output can't be faithfully honored by a geometric TERGM without a non-standard +age-dependent dissolution term, making it a diagnostic rather than a +production target. The one-off layer has no durations, so this flag only +affects main and casual-layer output.} \item{method}{Character. Either \code{"existing"} (default) or \code{"joint"}. \code{"existing"} reproduces the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear diff --git a/tests/testthat/test-duration-method.R b/tests/testthat/test-duration-method.R index 4dcfdab..632857b 100644 --- a/tests/testthat/test-duration-method.R +++ b/tests/testthat/test-duration-method.R @@ -30,30 +30,6 @@ test_that("default duration.method is 'empirical'", { # No joint_duration_model attached when method is empirical expect_null(np_default$main$joint_duration_model) expect_null(np_default$casl$joint_duration_model) - # No weibull_shape attribute - expect_null(attr(np_default$main$durs.main.byage, "weibull_shape")) - expect_null(attr(np_default$casl$durs.casl.byage, "weibull_shape")) -}) - -test_that("weibull_strat produces valid durs shape and shape diagnostic", { - skip_without_artnetdata() - testthat::skip_if_not_installed("survival") - np <- get_np("weibull_strat") - for (layer in c("main", "casl")) { - df_byage <- np[[layer]][[paste0("durs.", layer, ".byage")]] - df_homog <- np[[layer]][[paste0("durs.", layer, ".homog")]] - # Shape attributes attached as diagnostic - shapes <- attr(df_byage, "weibull_shape") - expect_type(shapes, "double") - expect_length(shapes, nrow(df_byage)) - expect_true(all(is.finite(shapes)), - info = paste("shapes must be finite:", layer)) - # Medians must be positive - expect_true(all(df_byage$median.dur > 0, na.rm = TRUE)) - expect_true(all(df_homog$median.dur > 0, na.rm = TRUE)) - # mean.dur.adj > 0 - expect_true(all(df_byage$mean.dur.adj > 0, na.rm = TRUE)) - } }) test_that("joint_lm stores fitted model at joint_duration_model", { @@ -72,12 +48,11 @@ test_that("joint_lm stores fitted model at joint_duration_model", { test_that("all duration.methods preserve output shape for dissolution_coefs", { skip_without_artnetdata() - testthat::skip_if_not_installed("survival") # Every method must produce durs..byage with the columns # dissolution_coefs() needs downstream, in the same order and types. expected_cols <- c("index.age.grp", "mean.dur", "median.dur", "rates.main.adj", "mean.dur.adj") - for (dm in c("empirical", "weibull_strat", "joint_lm")) { + for (dm in c("empirical", "joint_lm")) { df <- get_np(dm)$main$durs.main.byage expect_identical(colnames(df), expected_cols, info = paste("column mismatch for", dm)) @@ -89,9 +64,7 @@ test_that("all duration.methods preserve output shape for dissolution_coefs", { test_that("duration.method does not affect non-duration netparams fields", { skip_without_artnetdata() - testthat::skip_if_not_installed("survival") np_e <- get_np("empirical") - np_w <- get_np("weibull_strat") np_j <- get_np("joint_lm") # Everything except durs.*.byage, durs.*.homog, joint_duration_model # should be identical across methods. @@ -104,51 +77,25 @@ test_that("duration.method does not affect non-duration netparams fields", { "joint_duration_model") ) for (f in common_fields) { - expect_equal(np_w[[layer]][[f]], np_e[[layer]][[f]], - info = paste0("[weibull ", layer, "$", f, "]")) expect_equal(np_j[[layer]][[f]], np_e[[layer]][[f]], info = paste0("[joint_lm ", layer, "$", f, "]")) } } }) -test_that("weibull_strat requires survival package", { - skip_without_artnetdata() - # If survival isn't available the call should error clearly - # (we can't actually uninstall it in a test, but we can confirm the - # guard exists by searching the function body). - body_src <- deparse(build_netparams) - expect_true(any(grepl("duration\\.method = 'weibull_strat' requires", - body_src))) -}) - -test_that("weibull shape k is finite and in a plausible range per stratum", { +test_that("joint_lm rejects invalid method arg with clear error", { skip_without_artnetdata() - testthat::skip_if_not_installed("survival") - np <- get_np("weibull_strat") - shapes_main <- attr(np$main$durs.main.byage, "weibull_shape") - shapes_casl <- attr(np$casl$durs.casl.byage, "weibull_shape") - shape_main_overall <- attr(np$main$durs.main.homog, "weibull_shape") - shape_casl_overall <- attr(np$casl$durs.casl.homog, "weibull_shape") - # Under length-biased MLE, per-stratum shapes should all be finite and - # in a plausible range (sanity checks against pathological optimization). - expect_true(all(is.finite(shapes_main))) - expect_true(all(is.finite(shapes_casl))) - expect_true(all(shapes_main >= 0.1 & shapes_main <= 20), - info = paste("main shapes out of range:", - paste(round(shapes_main, 3), collapse = ", "))) - expect_true(all(shapes_casl >= 0.1 & shapes_casl <= 20), - info = paste("casl shapes out of range:", - paste(round(shapes_casl, 3), collapse = ", "))) - # Substantive finding on ARTnet: the overall (pooled) shape parameter - # for both layers lands well below 1 (decreasing hazard). If someone - # accidentally reverts to a naive survreg-style fit, the overall shape - # would shift to a very different value, so lock in the current value - # to a generous band. - expect_gt(shape_main_overall, 0.3) - expect_lt(shape_main_overall, 1.2) - expect_gt(shape_casl_overall, 0.3) - expect_lt(shape_casl_overall, 1.2) + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + expect_error( + build_netparams(epistats, duration.method = "weibull_strat", + method = "existing"), + regexp = "should be one of" + ) }) test_that("joint_lm can be combined with method = 'joint' in build_netstats", { From 7b9b19629761912a54d659fa2c032baf5107a8b9 Mon Sep 17 00:00:00 2001 From: Samuel Jenness Date: Sat, 25 Apr 2026 12:06:42 -0400 Subject: [PATCH 4/4] Update gitignore --- .Rbuildignore | 2 ++ .gitignore | 1 + 2 files changed, 3 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 587f5c2..2fc4eb5 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,5 @@ ^CLAUDE\.md$ ^inst/validation/snapshots$ ^\.github$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index 050561d..d061d71 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ Rplots.pdf vignettes/*.html vignettes/*.R inst/validation/snapshots/*.rds +.positai