Exercise 14. Non-collapsibility of proportional hazards models
We simulate for time-to-event data assuming constant hazards and then investigate whether we can estimate the underlying parameters. Note that the binary variable \(X\) is essentially a coin toss and we have used a large variance for the normally distributed \(U\).
You may have to install the required packages the first time you use them. You can install a package by install.packages("package_of_interest") for each package you require.
The assumed causal diagram is reproduced below:
set.seed(12345)
d <- local({
    n <- 1e4
    x <- rbinom(n, 1, 0.5)
    u <- rnorm(n, 0, 3)
    t <- rexp(n, exp(-5+x+u))
    c <- runif(n, 0, 10)
    y <- pmin(t, c)
    delta <- (t < c)
    data.frame(y,x,u,delta)
})
head(d)(a) Fitting models with both \(X\) and \(U\)
For constant hazards, we can fit (i) Poisson regression, (ii) Cox regression and (iii) flexible parametric survival models.
summary(fit1 <- glm(delta~x+u+offset(log(y)),data=d,family=poisson))
summary(fit2 <- coxph(Surv(y,delta)~x+u,data=d))
summary(fit3 <- stpm2(Surv(y,delta)~x+u,data=d,df=4))
## summary table for the coefficients for X
rbind(Poisson=coef(summary(fit1))["x",c("Estimate","Std. Error")],
      Cox=coef(summary(fit2))["x",c("coef","se(coef)")],
      Stpm2=coef(summary(fit3))["x",c("Estimate","Std. Error")])It may be useful to investigate whether the hazard ratio for \(X\) is time-varying hazard ratio and the form for survival.
fit <- stpm2(Surv(y,delta)~x+u,data=d,df=4, tvc=list(x=2))
plot(fit, type="hr", newdata=data.frame(x=0,u=0), var="x", ylim=c(1,4))s <- predict(fit, type="surv", newdata=data.frame(x=0:1,u=3), grid=TRUE, full=TRUE,
             se.fit=TRUE)
ggplot(s, aes(x=y,y=Estimate,fill=factor(x),ymin=lower,ymax=upper)) +
    ylab("Survival") +
    geom_ribbon(alpha=0.6) +
    geom_line()(b) Fitting models with only \(X\)
We now model by excluding the variable \(U\). This variable could be excluded when it is not measured or perhaps when the variable is not considered to be a confounding variable – from the causal diagram, the two variables \(X\) and \(U\) are not correlated and are only connected through the time variable \(T\).
summary(fit1 <- glm(delta~x+offset(log(y)),data=d,family=poisson))
summary(fit2 <- coxph(Surv(y,delta)~x,data=d))
summary(fit3 <- stpm2(Surv(y,delta)~x,data=d,df=4))## summary table for the coefficients for X
rbind(Poisson=coef(summary(fit1))["x",c("Estimate","Std. Error")],
      Cox=coef(summary(fit2))["x",c("coef","se(coef)")],
      Stpm2=coef(summary(fit3))["x",c("Estimate","Std. Error")])Again, we suggest investigating whether the hazard ratio for \(X\) is time-varying.
fit <- stpm2(Surv(y,delta)~x,data=d,df=4, tvc=list(x=2))
plot(fit, type="hr", newdata=data.frame(x=0), var="x", ylim=c(1,4))What do you see from the time-varing hazard ratio? Is \(U\) a potential confounder for \(X\)?
(c) Rarer outcomes
We now simulate for rarer outcomes by changing the censoring distribution:
set.seed(12345)
d <- local({
    n <- 1e4*10 # CHANGED N
    x <- rbinom(n, 1, 0.5)
    u <- rnorm(n, 0, 3)
    t <- rexp(n, exp(-5+x+u))
    c <- runif(n, 0, 10/1000) # CHANGED FROM 10 TO 0.01
    y <- pmin(t, c)
    delta <- (t < c)
    data.frame(y,x,u,delta)
})summary(fit1 <- glm(delta~x+offset(log(y)),data=d,family=poisson))
summary(fit2 <- coxph(Surv(y,delta)~x,data=d))
summary(fit3 <- stpm2(Surv(y,delta)~x,data=d,df=4))## summary table for the coefficients for X
rbind(Poisson=coef(summary(fit1))["x",c("Estimate","Std. Error")],
      Cox=coef(summary(fit2))["x",c("coef","se(coef)")],
      Stpm2=coef(summary(fit3))["x",c("Estimate","Std. Error")])##         Estimate Std. Error
## Poisson 1.081150  0.1212037
## Cox     1.081513  0.1212136
## Stpm2   1.081399  0.1212136What do you observe?
(d) Less heterogeneity
We now simulate for less heterogeneity by changing the reducing the standard deviation for the random effect \(U\) from 3 to 1.
set.seed(12345)
d <- local( {
    n <- 1e4
    x <- rbinom(n, 1, 0.5)
    u <- rnorm(n, 0, 1) # CHANGED SD FROM 3 TO 1
    t <- rexp(n, exp(-5+x+u))
    c <- runif(n, 0, 10) 
    y <- pmin(t, c)
    delta <- (t < c)
    data.frame(y,x,u,delta)
})summary(fit1 <- glm(delta~x+offset(log(y)),data=d,family=poisson))
summary(fit2 <- coxph(Surv(y,delta)~x,data=d))
summary(fit3 <- stpm2(Surv(y,delta)~x,data=d,df=4))## summary table for the coefficients for X
rbind(Poisson=coef(summary(fit1))["x",c("Estimate","Std. Error")],
      Cox=coef(summary(fit2))["x",c("coef","se(coef)")],
      Stpm2=coef(summary(fit3))["x",c("Estimate","Std. Error")])##          Estimate Std. Error
## Poisson 0.9498227 0.07583196
## Cox     0.9444216 0.07584759
## Stpm2   0.9441747 0.07584672What do you observe?
(e) Accelerated failure time models
As an alternative model class, we can fit accelerated failure time models with a smooth baseline survival function. We can use the rstpm2::aft function, which uses splines to model baseline survival. Using the baseline simulation, fit and interpret smooth accelerated failure time models:
## Maximum likelihood estimation
## 
## Call:
## bbmle::mle2(minuslogl = negll, start = coef, eval.only = TRUE, 
##     vecpar = TRUE, gr = gradient, control = control)
## 
## Coefficients:
##                                        Estimate Std. Error  z value  Pr(z)    
## x                                     -1.036381   0.087589 -11.8324 <2e-16 ***
## u                                     -0.993865   0.048413 -20.5289 <2e-16 ***
## nsx(logtstar, df, intercept = TRUE)1  -3.424935   0.362341  -9.4523 <2e-16 ***
## nsx(logtstar, df, intercept = TRUE)2   0.285719   0.263517   1.0843 0.2783    
## nsx(logtstar, df, intercept = TRUE)3 -14.160032   0.770291 -18.3827 <2e-16 ***
## nsx(logtstar, df, intercept = TRUE)4   4.529814   0.488246   9.2777 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## -2 log L: 7681.266## Maximum likelihood estimation
## 
## Call:
## bbmle::mle2(minuslogl = negll, start = coef, eval.only = TRUE, 
##     vecpar = TRUE, gr = gradient, control = control)
## 
## Coefficients:
##                                        Estimate Std. Error  z value   Pr(z)    
## x                                     -1.094700   0.094337 -11.6042 < 2e-16 ***
## nsx(logtstar, df, intercept = TRUE)1  -3.076847   0.303330 -10.1436 < 2e-16 ***
## nsx(logtstar, df, intercept = TRUE)2   0.577282   0.226405   2.5498 0.01078 *  
## nsx(logtstar, df, intercept = TRUE)3 -13.490435   0.660353 -20.4291 < 2e-16 ***
## nsx(logtstar, df, intercept = TRUE)4   4.657197   0.419495  11.1019 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## -2 log L: 8334.838