##### preamble ###########################################################

# context of the data set:

# A representative sample of n = 768 women who were at least 21 years old,
# of Pima Indian heritage and living near Phoenix, Arizona,
# was tested in the late 1980s for diabetes according to
# World Health Organization criteria.

# The data were collected by the US National Institute of Diabetes
# and Digestive and Kidney Diseases.

# unbuffer the output

# change directory to the location of the data file
# 'pima-indians-diabetes.txt'; on my desktop at home this is

# setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Pima-Indians-Diabetes' )

# run the next command the first time you want to work with 'reglogit',
# and don't run it thereafter

# install.packages( 'reglogit' )

library( reglogit )

##### read in the data set and examine its structure #####################

pima <- read.csv( 'pima-indians-diabetes.txt' )

str( pima )

# 'data.frame': 768 obs. of 9 variables:
#  $ pregnancies: int  6 1 8 1 0 5 3 10 2 8 ...
#  $ glucose    : int  148 85 183 89 137 116 78 115 197 125 ...
#  $ dbp        : int  72 66 64 66 40 74 50 0 70 96 ...
#  $ skinfold   : int  35 29 0 23 35 0 32 0 45 0 ...
#  $ insulin    : int  0 0 0 94 168 0 88 0 543 0 ...
#  $ bmi        : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
#  $ pedigree   : num  0.627 0.351 0.672 0.167 2.288 ...
#  $ age        : int  50 31 32 21 33 30 26 29 53 54 ...
#  $ diabetes   : int  1 0 1 0 1 0 1 0 1 1 ...

# here if i had more time i would study the predictor variables
# in detail, but i don't (my bad); we're just going to look below
# at the nature of bayesian regularization of logistic regression
# coefficients (typically this involves shrinking them toward a prior mean
# of 0, to achieve a reduction in mean squared error of prediction)

##### prepare the inputs to 'reglogit' ###################################

X <- as.matrix( pima[ , - 9 ] )
y <- as.numeric( pima[ , 9 ] )

# BG: pre-normalize X matrix to match the comparison in the paper
#   Gramacy RB, Polson NG (2012). Simulation-based regularized
#   logistic regression. Bayesian Analysis, 7(3), pp. 567-590;
#   arXiv:1005.3430; http://arxiv.org/abs/1005.3430

one <- rep( 1, nrow( X ) )
normx <- sqrt( drop( one %*% ( X^2 ) ) )
X <- scale( X, F, normx )
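
# (editor's aside, not part of the original analysis: a quick sanity check
#  on the pre-normalization above -- 'normx' holds the Euclidean norm of each
#  column of the raw X, and scale( X, F, normx ) divides each column by that
#  norm, so every column of the rescaled X should now have unit length;
#  'normx.alt' is a name introduced only for this check)

normx.alt <- apply( as.matrix( pima[ , - 9 ] ), 2,
  function( x.j ) sqrt( sum( x.j^2 ) ) )
all.equal( as.numeric( normx ), as.numeric( normx.alt ) )   # should be TRUE

round( sqrt( colSums( X^2 ) ), 6 )   # should all be (numerically) 1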
## compare to the GLM fit

summary( fit.logit <- glm( y ~ X, family = binomial( link = 'logit' ) ) )

# Call:
# glm(formula = y ~ X, family = binomial(link = "logit"))

# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -2.5566  -0.7274  -0.4159   0.7267   2.9297

# Coefficients:
#               Estimate Std. Error z value Pr(>|z|)
# (Intercept)    -8.4047     0.7166 -11.728  < 2e-16 ***
# Xpregnancies   17.4480     4.5436   3.840 0.000123 ***
# Xglucose      121.8551    12.8520   9.481  < 2e-16 ***
# Xdbp          -26.4411    10.4082  -2.540 0.011072 *
# Xskinfold       0.4459     4.9708   0.090 0.928515
# Xinsulin       -4.6273     3.4994  -1.322 0.186065
# Xbmi           81.9056    13.7764   5.945 2.76e-09 ***
# Xpedigree      15.0995     4.7790   3.160 0.001580 **
# Xage           14.5282     9.1208   1.593 0.111192
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 993.48  on 767  degrees of freedom
# Residual deviance: 723.45  on 759  degrees of freedom
# AIC: 741.45

# Number of Fisher Scoring iterations: 5

# note that some of the predictors have coefficients with
# large standard errors (glucose, dbp, bmi); shrinking the
# coefficients toward 0 may well serve to stabilize these
# noisy maximum-likelihood estimates

beta.initial.values <- fit.logit$coefficients

## do the MCMC (Gibbs) sampling with nu = 6

T <- 101000
seed <- 5459224
set.seed( seed )

system.time( results.nu.6 <- reglogit( T, y, X, nu = 6, nup = NULL,
  bstart = beta.initial.values, normalize = F ) )

# round 100
# round 200
# .
# .
# .
# round 101000

#    user  system elapsed
# 1596.98   32.59  304.10

## plot the marginal posterior distributions of the coefficients

n.burnin <- 1000
burnin <- ( 1:n.burnin )

boxplot( results.nu.6$beta[ - burnin, ], main = 'nu = 6',
  ylab = 'posterior', xlab = 'coefficients', bty = 'n',
  names = c( 'mu', paste( 'b', 1:8, sep = '' ) ) )
abline( h = 0, lty = 2 )

## add in the GLM fit and the MAP estimate, with a legend

points( beta.initial.values, col = 2, pch = 17 )
points( results.nu.6$map$beta, pch = 19, col = 3 )
legend( 'topright', c( 'MLE', 'MAP' ), col = 2:3, pch = c( 17, 19 ) )

## simple prediction

predictions.nu.6 <- predict( results.nu.6, XX = X )

str( predictions.nu.6 )

# List of 4
#  $ p  : num [1:768, 1:90900] 0.6915 0.1004 0.6854 0.0905 0.7876 ...
#  $ mp : num [1:768] 0.639 0.077 0.7467 0.0734 0.7699 ...
#  $ c  : num [1:768] 1 0 1 0 1 0 0 0 1 0 ...
#  $ ent: num [1:768] 0.654 0.271 0.566 0.262 0.539 ...

## hit rate

mean( predictions.nu.6$c == y )

# [1] 0.7786458

## now do the MCMC (Gibbs) sampling with nu = 1

T <- 101000
seed <- 5459224
set.seed( seed )

system.time( results.nu.1 <- reglogit( T, y, X, nu = 1, nup = NULL,
  bstart = beta.initial.values, normalize = F ) )

# round 100
# round 200
# .
# .
# .
# round 101000

#    user  system elapsed
# 1602.16   34.17  307.42

## plot the marginal posterior distributions of the coefficients

n.burnin <- 1000
burnin <- ( 1:n.burnin )

boxplot( results.nu.1$beta[ - burnin, ], main = 'nu = 1',
  ylab = 'posterior', xlab = 'coefficients', bty = 'n',
  names = c( 'mu', paste( 'b', 1:8, sep = '' ) ) )
abline( h = 0, lty = 2 )

## add in the GLM fit and the MAP estimate, with a legend

points( beta.initial.values, col = 2, pch = 17 )
points( results.nu.1$map$beta, pch = 19, col = 3 )
legend( 'topright', c( 'MLE', 'MAP' ), col = 2:3, pch = c( 17, 19 ) )

## simple prediction

predictions.nu.1 <- predict( results.nu.1, XX = X )

str( predictions.nu.1 )

# List of 4
#  $ p  : num [1:768, 1:90900] 0.494 0.242 0.604 0.24 0.521 ...
#  $ mp : num [1:768] 0.447 0.23 0.534 0.231 0.444 ...
#  $ c  : num [1:768] 0 0 1 0 0 0 0 0 1 0 ...
#  $ ent: num [1:768] 0.688 0.539 0.691 0.541 0.687 ...

## hit rate

mean( predictions.nu.1$c == y )

# [1] 0.7083333
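
# (editor's aside, not part of the original analysis: for comparison with the
#  two regularized hit rates above, here is the in-sample hit rate of the
#  plain maximum-likelihood fit, classifying at a probability cutoff of 0.5 --
#  the same cutoff the 'reglogit' predictions appear to use, judging from the
#  'mp' and 'c' values printed above)

p.hat.mle <- predict( fit.logit, type = 'response' )
mean( as.numeric( p.hat.mle > 0.5 ) == y )

# with this the three in-sample hit rates (MLE, nu = 6, nu = 1) can be put
# side by side; note that the nu = 1 run pulls the fitted probabilities much
# closer together (compare the 'mp' values above) and classifies noticeably
# worse in sample (0.708 versus 0.779 for nu = 6)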