10 Case Study: NHANES

This case study follows the tutorial laid out in section 7.5 of Heeringa (2017). It uses the National Health and Nutrition Examination Survey (NHANES) to build a predictive model of diastolic blood pressure (DBP). The NHANES sample is a stratified cluster sample, weighted for unequal probabilities of selection, non-response, and post-stratification.

library(tidyverse)
library(scales)
library(janitor)
library(glue)
library(survey)
library(srvyr)
library(gtsummary)
library(broom)
library(ggeffects)
library(svydiags)
library(NHANES)

10.1 Data Preparation

data(NHANESraw)

Heeringa identifies a data cleaning step, converting 0s to NA.

nhanes <-
  NHANESraw |> 
  filter(SurveyYr == "2011_12") |> 
  mutate(
    BPDia1 = if_else(BPDia1 == 0, NA_integer_, BPDia1),
    Race1 = fct_relevel(Race1, "Mexican", after = 0),
    Gender = fct_relevel(Gender, "male", after = 0)
  )

Create a survey design object for analysis. Use it to create a new variable, centered age, Age.

nhanes_des <- as_survey_design(
  nhanes,
  ids = SDMVPSU,  # primary sampling unit (cluster)
  strata = SDMVSTRA,  # sampling stratum
  nest = TRUE,  # strata ids are repeated across clusters
  weights = WTMEC2YR
)

weighted_mean_age <-
  nhanes_des |>
  filter(Age >= 18) |>
  summarize(M = survey_mean(Age)) |>
  pull(M)

nhanes_des <- 
  nhanes_des |> 
  mutate(
    Age18p = Age >= 18,
    Age = Age - weighted_mean_age
  )

summary(nhanes_des)

Stratified 1 - level Cluster Sampling design (with replacement)
With (31) clusters.
Called via srvyr
Probabilities:
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
4.493e-06 2.874e-05 5.528e-05       Inf 8.949e-05       Inf 
Stratum Sizes: 
            90  91  92  93  94  95  96  97  98  99 100 101 102 103
obs        862 998 875 602 688 722 676 608 708 682 700 715 624 296
design.PSU   3   3   3   2   2   2   2   2   2   2   2   2   2   2
actual.PSU   3   3   3   2   2   2   2   2   2   2   2   2   2   2
Data variables:
 [1] "ID"               "SurveyYr"         "Gender"           "Age"             
 [5] "AgeMonths"        "Race1"            "Race3"            "Education"       
 [9] "MaritalStatus"    "HHIncome"         "HHIncomeMid"      "Poverty"         
[13] "HomeRooms"        "HomeOwn"          "Work"             "Weight"          
[17] "Length"           "HeadCirc"         "Height"           "BMI"             
[21] "BMICatUnder20yrs" "BMI_WHO"          "Pulse"            "BPSysAve"        
[25] "BPDiaAve"         "BPSys1"           "BPDia1"           "BPSys2"          
[29] "BPDia2"           "BPSys3"           "BPDia3"           "Testosterone"    
[33] "DirectChol"       "TotChol"          "UrineVol1"        "UrineFlow1"      
[37] "UrineVol2"        "UrineFlow2"       "Diabetes"         "DiabetesAge"     
[41] "HealthGen"        "DaysPhysHlthBad"  "DaysMentHlthBad"  "LittleInterest"  
[45] "Depressed"        "nPregnancies"     "nBabies"          "Age1stBaby"      
[49] "SleepHrsNight"    "SleepTrouble"     "PhysActive"       "PhysActiveDays"  
[53] "TVHrsDay"         "CompHrsDay"       "TVHrsDayChild"    "CompHrsDayChild" 
[57] "Alcohol12PlusYr"  "AlcoholDay"       "AlcoholYear"      "SmokeNow"        
[61] "Smoke100"         "SmokeAge"         "Marijuana"        "AgeFirstMarij"   
[65] "RegularMarij"     "AgeRegMarij"      "HardDrugs"        "SexEver"         
[69] "SexAge"           "SexNumPartnLife"  "SexNumPartYear"   "SameSex"         
[73] "SexOrientation"   "WTINT2YR"         "WTMEC2YR"         "SDMVPSU"         
[77] "SDMVSTRA"         "PregnantNow"      "Age18p"

10.2 Descriptive Analysis

Regress the continuous response variable, diastolic blood pressure (BPDia1), on four predictors of interest: race/ethnicity (Race1), centered-age (Age), gender (Gender), and sexual orientation (SexOrientation).

Show the code

nhanes_des |>
  tbl_uvregression(
    y = BPDia1,
    method = survey::svyglm,
    method.args = list(subset = (Age18p == 1)),
    include = c(Race1, Age, Gender, SexOrientation),
    estimate_fun = label_style_number(digits = 2),
    pvalue_fun = label_style_pvalue(digits = 2),
    label = list(
      Race1 ~ "Race/ethnicity", 
      Age ~ "Age (cent.)", 
      Gender ~ "Gender"
      # MaritalStatus ~ "Marital status"
    )
  ) |>
  # add_global_p(anova_fun = tidy_wald_test) |>
  as_gt() |>
  gt::tab_caption(".") |>
  gt::tab_header(
    title = "Initial Design-Based Bivariate Regression Analysis Results ",
    subtitle = glue("Assessing Potential Predictors of Diastolic Blood Pressure ",
                    "for the 2011–2012 NHANES Adult Sample")
  ) |>
  gt::tab_options(heading.align = "left")

.
Initial Design-Based Bivariate Regression Analysis Results
Assessing Potential Predictors of Diastolic Blood Pressure for the 2011–2012 NHANES Adult Sample
Characteristic	N	Beta	95% CI ¹	p-value
Race/ethnicity	212,747,914
Mexican		—	—
Black		2.29	0.77, 3.81	0.006
Hispanic		-0.15	-3.30, 2.99	0.92
White		2.18	0.58, 3.79	0.011
Other		1.31	-0.22, 2.83	0.087
Age (cent.)	212,747,914	0.04	0.00, 0.08	0.053
Gender	212,747,914
male		—	—
female		-2.20	-3.40, -1.00	0.001
SexOrientation	140,128,079
Bisexual		—	—
Heterosexual		2.42	-1.74, 6.59	0.23
Homosexual		1.61	-2.04, 5.26	0.36
¹ CI = Confidence Interval

The design-based t-tests suggest that race/ethnicity, age, and gender have potentially significant relationships with DBP, while sexual orientation does not appear to be related. Non-Hispanic whites, non-Hispanic blacks, older adults, and males appear to have the highest DBPs at first glance. Include the first three predictors in an initial model.

10.3 Analysis

10.3.1 Model Fitting

Fit an initial regression model with the variables of interest. Notice how the dataset is filtered inside the model with subset.

fit_1 <- svyglm(
  BPDia1 ~ Race1 + Age + Gender,
  design = nhanes_des,
  subset = (Age18p == 1)
)

Show the code

fit_1 |>
  gtsummary::tbl_regression(
    label = list(
      Race1 ~ "Race/ethnicity", 
      Age ~ "Age (cent.)", 
      Gender ~ "Gender"
    ),
    intercept = TRUE,
    estimate_fun = label_style_number(digits = 3)
  ) |>
  gtsummary::add_glance_source_note() |>
  gtsummary::add_n(location = "level") |>
  gtsummary::bold_p() |>
  as_gt() |>
  gt::tab_caption(".") |>
  gt::tab_header(
    title = "Initial Model for Diastolic Blood Pressure",
    subtitle = glue("Design-Adjusted Test Statistics and Confidence Intervals ",
                    "for the Parameters, and Design Effects for the Parameter ",
                    "Estimates.")
  ) |>
  gt::tab_options(heading.align = "left")

.
Characteristic	N	Beta	95% CI ¹	p-value
Initial Model for Diastolic Blood Pressure
Design-Adjusted Test Statistics and Confidence Intervals for the Parameters, and Design Effects for the Parameter Estimates.
(Intercept)	212,747,914	71.149	70.008, 72.289	<0.001
Race/ethnicity
Mexican	16,644,312	—	—
Black	24,960,921	2.302	0.839, 3.765	0.005
Hispanic	13,924,721	-0.141	-3.167, 2.884	>0.9
White	140,677,592	1.904	0.123, 3.685	0.038
Other	16,540,366	1.262	-0.294, 2.817	0.10
Age (cent.)	212,747,914	0.037	-0.009, 0.083	0.10
Gender
male	104,775,375	—	—
female	107,972,538	-2.291	-3.498, -1.084	0.002
Null deviance = 703,891; Null df = 5,111; AIC = 41,417; BIC = 691,687; Deviance = 691,627; Residual df = 11; No. Obs. = 5,112
¹ CI = Confidence Interval

The t statistics degrees of freedom equal the number of ultimate clusters (31) minus the number of strata (14).

Plot residuals against the predictor variables. The augment() function has not been specifically written for svyglm objects, so you need to make a couple adjustments. Start with the centered age, Age. The plot has a curvilinear pattern suggesting a quadratic relationship. Add a squared term and re-plot the residuals.

preds_1 <-
  augment(fit_1) |>
  mutate(
    .se.fit = sqrt(attr(.fitted, "var")),
    .fitted = as.numeric(.fitted)
  )

fit_2 <- svyglm(
  BPDia1 ~ Race1 + Age + I(Age^2) + Gender,
  design = nhanes_des,
  subset = (Age18p == 1)
)

preds_2 <- 
  augment(fit_2) |>
  mutate(
    .se.fit = sqrt(attr(.fitted, "var")),
    .fitted = as.numeric(.fitted)
  )

Show the code

bind_rows(
  Original = preds_1,
  `With squared term` = preds_2,
  .id = "mdl"
) |>
  ggplot(aes(y = .resid, x = Age)) +
  geom_point(color = "goldenrod", alpha = .2) +
  facet_wrap(vars(mdl)) +
  labs(
    x = "Centered Age",
    title = glue("Residuals vs centered age before and after addition of ",
                    "squared age variable.")
  )

It does appear better. The AIC and BIC also decreased, meaning the second model is more efficient.

Show the code

bind_rows(
  fit_1 = glance(fit_1),
  fit_2 = glance(fit_2),
  .id = "Model Fit"
) |>
  gt::gt()

Model Fit	null.deviance	df.null	AIC	BIC	deviance	df.residual	nobs
fit_1	703891.3	5111	41416.82	691686.9	691627.1	11	5112
fit_2	703891.3	5111	40859.00	623672.1	623603.8	10	5112

Add interactions between Age and each of the demographic factor variables, one at a time, to test whether they moderate the relationship between age and DBP. Start with race/ethnicity, and perform a Wald test of the null hypothesis that all eight parameters are simultaneously equal to zero.

fit_3 <- svyglm(
  BPDia1 ~ Race1*Age + Race1*I(Age^2) + Gender,
  design = nhanes_des,
  subset = (Age18p == 1)
)

regTermTest(fit_3, ~ Race1:Age + Race1:I(Age^2))

Wald test for Race1:Age Race1:I(Age^2)
 in svyglm(formula = BPDia1 ~ Race1 * Age + Race1 * I(Age^2) + Gender, 
    design = nhanes_des, subset = (Age18p == 1))
F =  11.8955  on  8  and  2  df: p= 0.079828

None of the interaction estimators have a p-value < .05, and the Wald test fails to reject the null hypothesis. How about interacting with gender? This time the interactions are significant and the Wald test rejects the null hypothesis.

fit_4 <- svyglm(
  BPDia1 ~ Race1 + Gender*Age + Gender*I(Age^2),
  design = nhanes_des,
  subset = (Age18p == 1)
)

summary(fit_4, df.resid = degf(nhanes_des))


Call:
svyglm(formula = BPDia1 ~ Race1 + Gender * Age + Gender * I(Age^2), 
    design = nhanes_des, subset = (Age18p == 1))

Survey design:
Called via srvyr

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           74.9844260  0.6456943 116.130  < 2e-16 ***
Race1Black             2.5401774  0.7328015   3.466 0.002951 ** 
Race1Hispanic          0.2056088  1.2087808   0.170 0.866942    
Race1White             2.0990068  0.8453388   2.483 0.023758 *  
Race1Other             1.4274416  0.6919988   2.063 0.054754 .  
Genderfemale          -3.1709391  0.7571871  -4.188 0.000618 ***
Age                    0.0482851  0.0163167   2.959 0.008785 ** 
I(Age^2)              -0.0135697  0.0008394 -16.165 9.39e-12 ***
Genderfemale:Age       0.0475724  0.0229597   2.072 0.053791 .  
Genderfemale:I(Age^2)  0.0033007  0.0016223   2.035 0.057796 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 121.4838)

Number of Fisher Scoring iterations: 2

regTermTest(fit_4, ~ Gender:Age + Gender:I(Age^2))

Wald test for Gender:Age Gender:I(Age^2)
 in svyglm(formula = BPDia1 ~ Race1 + Gender * Age + Gender * I(Age^2), 
    design = nhanes_des, subset = (Age18p == 1))
F =  5.356873  on  2  and  8  df: p= 0.033398

Plot marginal predicted values from DBP to assess variability in the relationship between age and DBP depending on race/ethnicity and gender. African Americans have a higher acceleration in DBP until middle age, then the gap decreases.

Show the code

bind_rows(
  `Race/Ethnicity` = fit_4 |>
    predict_response(terms = c("Age [-30:30 by=5]", "Race1")) |>
    as_tibble(),
  Gender = fit_4 |>
    predict_response(terms = c("Age [-30:30 by=5]", "Gender")) |>
    as_tibble(),
  .id = "Covar"
) |>
  mutate(group = fct_relevel(group, c("female", "male"), after = 0)) |>
  ggplot(aes(x = x, y = predicted, color = group)) + 
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.8) +
  geom_point() +
  geom_line() +
  labs(
    x = "Centered Age", y = "Marginal predicted DBP", color = NULL,
    title = "Marginal predicted values from regression model.",
    subtitle = "Interaction of age and race/ethnicity"
  ) +
  facet_wrap(vars(Covar)) +
  theme(legend.position = "top", legend.justification = "left")

10.3.2 Model Diagnostics

The residuals vs fitted plot varies equally around 0 in a straight horizontal line (linearity and homoscedasticity). The Q-Q residuals plot is bowed at the high end, so prediction intervals maybe be inaccurate. The equal spread along the horizontal of the scale-location plot supports homoscedasticity. The residuals vs leverage plot identifies a few influential observations.

par(mfrow = c(2, 2))
plot(fit_4, labels.id = NULL)

Check for influential points and outliers.

library(MASS)
cooks_d4 <- svyCooksD(fit_4, doplot = TRUE)