--- title: "Fitting generalized linear models using the mcglm package" author: "Prof. Wagner Hugo Bonat" date: "`r paste('mcglm', packageVersion('mcglm'), Sys.Date())`" vignette: > %\VignetteIndexEntry{Fitting generalized linear models using the mcglm package} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} #----------------------------------------------------------------------- library(knitr) #----------------------------------------------------------------------- ## show.settings() ``` **** To install the stable version of [`mcglm`][], use `devtools::install_git()`. For more information, visit [mcglm/README]. ```{r, eval=FALSE} #library(devtools) #install_git("bonatwagner/mcglm") ``` ```{r, eval=FALSE, error=FALSE, message=FALSE, warning=FALSE} library(mcglm) packageVersion("mcglm") ``` ##### Abstract The `mcglm` package implements the multivariate covariance generalized linear models (McGLMs) proposed by Bonat and J$\o$rgensen (2016). In this introductory vignette we employed the `mcglm` package for fitting a set of generalized linear models and compare our results with the ones obtained by ordinary `R` functions like `lm` and `glm`. **** ## Example 1 - Count data Consider the count data obtained in Dobson (1990). ```{r, warning = FALSE, message = FALSE} ## Dobson (1990) Page 93: Randomized Controlled Trial : counts <- c(18,17,15,20,10,20,25,13,12) outcome <- gl(3,1,9) treatment <- gl(3,3) print(d.AD <- data.frame(treatment, outcome, counts)) ``` Ordinary analysis using quasi-Poisson model. ```{r, warning = FALSE, message = FALSE} fit.glm <- glm(counts ~ outcome + treatment, family = quasipoisson) ``` The orthodox quasi-Poisson model is obtained by specifying the variance function as `tweedie` and fix the power parameter at $1$. Since, we are dealing with independent data, the matrix linear predictor is composed of a diagonal matrix. ```{r, warning = FALSE, message = FALSE} require(mcglm) require(Matrix) # Matrix linear predictor Z0 <- mc_id(d.AD) fit.qglm <- mcglm(linear_pred = c(counts ~ outcome + treatment), matrix_pred = list("resp1" = Z0), link = "log", variance = "tweedie", data = d.AD, control_algorithm = list(verbose = FALSE, method = "chaser", tuning = 0.8)) ``` Comparing regression parameter estimates and standard errors. ```{r, warning = FALSE, message = FALSE} cbind("GLM" = coef(fit.glm), "McGLM" = coef(fit.qglm, type = "beta")$Estimates) cbind("GLM" = sqrt(diag(vcov(fit.glm))), "McGLM" = coef(fit.qglm, type = "beta", std.error = TRUE)$Std.error) ``` **** ## Example 2 - Continuous data with offset Consider the example from Venables & Ripley (2002, p.189). The response variable is continuous for which we can assume the Gaussian distribution. In this example, we exemplify the use of the `offset` argument. ```{r, warning = FALSE, message = FALSE} # Loading the data set utils::data(anorexia, package = "MASS") # GLM fit anorex.1 <- glm(Postwt ~ Prewt + Treat + offset(Prewt), family = gaussian, data = anorexia) # McGLM fit Z0 <- mc_id(anorexia) fit.anorexia <- mcglm(linear_pred = c(Postwt ~ Prewt + Treat), matrix_pred = list(Z0), offset = list(anorexia$Prewt), power_fixed = TRUE, data = anorexia, control_algorithm = list("correct" = TRUE)) ``` Comparing parameter estimates and standard errors. ```{r, warning = FALSE, message = FALSE} # Estimates cbind("McGLM" = round(coef(fit.anorexia, type = "beta")$Estimates,5), "GLM" = round(coef(anorex.1),5)) # Standard errors cbind("McGLM" = sqrt(diag(vcov(fit.anorexia))), "GLM" = c(sqrt(diag(vcov(anorex.1))),NA)) ``` **** ## Example 3 - Continuous positive data Consider the following data set from McCullagh & Nelder (1989, pp.300-2). It is an example of Gamma regression model. ```{r, warning = FALSE, message = FALSE} clotting <- data.frame( u = c(5,10,15,20,30,40,60,80,100), lot1 = c(118,58,42,35,27,25,21,19,18), lot2 = c(69,35,26,21,18,16,13,12,12)) ``` Analysis using generalized linear models `glm` function in `R`. ```{r, warning = FALSE, message = FALSE} fit.lot1 <- glm(lot1 ~ log(u), data = clotting, family = Gamma(link = "inverse")) fit.lot2 <- glm(lot2 ~ log(u), data = clotting, family = Gamma(link = "inverse")) ``` The code below exemplify how to use the `control_initial` argument for fixing the power parameter at $p = 2$. ```{r, warning = FALSE, message = FALSE} list_initial = list() list_initial$regression <- list(coef(fit.lot1)) list_initial$power <- list(c(2)) list_initial$tau <- list(summary(fit.lot1)$dispersion) list_initial$rho = 0 ``` The `control_initial` argument should be a named list with initial values for all parameters involved in the model. Note that, in this example we used the parameter estimates from the `glm` fit as initial values for the regression and dispersion parameters. The power parameter was fixed at $p = 2$, since we want to fit Gamma regression models. In that case, we have only one response variable, but the initial value for correlation parameter $\rho$ should be specified. ```{r, warning = FALSE, message = FALSE} Z0 <- mc_id(clotting) fit.lot1.mcglm <- mcglm(linear_pred = c(lot1 ~ log(u)), matrix_pred = list(Z0), link = "inverse", variance = "tweedie", data = clotting, control_initial = list_initial) ``` Comparing parameter estimates and standard errors. ```{r, warning = FALSE, message = FALSE} # Estimates cbind("mcglm" = round(coef(fit.lot1.mcglm, type = "beta")$Estimates,5), "glm" = round(coef(fit.lot1),5)) # Standard errors cbind("mcglm" = sqrt(diag(vcov(fit.lot1.mcglm))), "glm" = c(sqrt(diag(vcov(fit.lot1))),NA)) ``` Initial values for the response variable `lot2` ```{r, warning = FALSE, message = FALSE} list_initial$regression <- list("resp1" = coef(fit.lot2)) list_initial$tau <- list("resp1" = c(var(1/clotting$lot2))) ``` Note that, since the `list_initial` object already have all components required, we just modify the `entries` regression and tau. ```{r, warning = FALSE, message = FALSE} fit.lot2.mcglm <- mcglm(linear_pred = c(lot2 ~ log(u)), matrix_pred = list(Z0), link = "inverse", variance = "tweedie", data = clotting, control_initial = list_initial) ``` Comparing parameter estimates and standard errors. ```{r, warning = FALSE, message = FALSE} # Estimates cbind("mcglm" = round(coef(fit.lot2.mcglm, type = "beta")$Estimates,5), "glm" = round(coef(fit.lot2),5)) # Standard errors cbind("mcglm" = sqrt(diag(vcov(fit.lot2.mcglm))), "glm" = c(sqrt(diag(vcov(fit.lot2))),NA)) ``` The main contribution of the `mcglm`package is that it easily fits multivariate regression models. For example, for the `clotting` data a bivariate Gamma model is a suitable choice. ```{r, warning = FALSE, message = FALSE} # Initial values list_initial = list() list_initial$regression <- list(coef(fit.lot1), coef(fit.lot2)) list_initial$power <- list(c(2),c(2)) list_initial$tau <- list(c(0.00149), c(0.001276)) list_initial$rho = 0.80 # Matrix linear predictor Z0 <- mc_id(clotting) # Fit bivariate Gamma model fit.joint.mcglm <- mcglm(linear_pred = c(lot1 ~ log(u), lot2 ~ log(u)), matrix_pred = list(Z0, Z0), link = c("inverse", "inverse"), variance = c("tweedie", "tweedie"), data = clotting, control_initial = list_initial, control_algorithm = list("correct" = TRUE, "method" = "chaser", "tuning" = 0.1, "max_iter" = 1000)) summary(fit.joint.mcglm) ``` We also can easily change the link function. The code below fit a bivariate Gamma model using the `log` link function. ```{r, warning = FALSE, message = FALSE} # Initial values list_initial = list() list_initial$regression <- list(c(log(mean(clotting$lot1)),0), c(log(mean(clotting$lot2)),0)) list_initial$power <- list(c(2), c(2)) list_initial$tau <- list(c(0.023), c(0.024)) list_initial$rho = 0 # Fit bivariate Gamma model fit.joint.log <- mcglm(linear_pred = c(lot1 ~ log(u), lot2 ~ log(u)), matrix_pred = list(Z0,Z0), link = c("log", "log"), variance = c("tweedie", "tweedie"), data = clotting, control_initial = list_initial) summary(fit.joint.log) ``` **** ## Example 4 - Binomial data Consider the example `menarche` from the MASS `R` package. ```{r, warning = FALSE, message = FALSE} require(MASS) data(menarche) data <- data.frame("resp" = menarche$Menarche/menarche$Total, "Ntrial" = menarche$Total, "Age" = menarche$Age) ``` Logistic regression model. ```{r, warning = FALSE, message = FALSE} glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age, family=binomial(logit), data=menarche) ``` The same fitted by `mcglm` function. ```{r, warning = FALSE, message = FALSE} # Matrix linear predictor Z0 <- mc_id(data) fit.logit <- mcglm(linear_pred = c(resp ~ Age), matrix_pred = list(Z0), link = "logit", variance = "binomialP", Ntrial = list(data$Ntrial), data = data) ``` Comparing parameter estimates and standard errors. ```{r, warning = FALSE, message = FALSE} # Estimates cbind("GLM" = coef(glm.out), "McGLM" = coef(fit.logit, type = "beta")$Estimates) # Standard error cbind("GLM" = c(sqrt(diag(vcov(glm.out))),NA), "McGLM" = sqrt(diag(vcov(fit.logit)))) ``` We can estimate a more flexible model using the extended binomial variance function. ```{r, warning = FALSE, message = FALSE} fit.logit.power <- mcglm(linear_pred = c(resp ~ Age), matrix_pred = list(Z0), link = "logit", variance = "binomialP", Ntrial = list(data$Ntrial), power_fixed = FALSE, data = data) summary(fit.logit.power) ``` [`mcglm`]: https://github.com/bonatwagner/mcglm [mcglm/README]: https://github.com/bonatwagner/mcglm/blob/main/README.md