##### preamble ###########################################################

# context of the data set:

# A representative sample of n = 768 women who were at least 21 years old,
# of Pima Indian heritage and living near Phoenix, Arizona,
# was tested in the late 1980s for diabetes according to
# World Health Organization criteria.

# The data were collected by the US National Institute of Diabetes
# and Digestive and Kidney Diseases.

# unbuffer the output

# change directory to the location of the data file
# 'pima-indians-diabetes.txt'; on my desktop at home this is

# setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Pima-Indians-Diabetes' )

# run the next command the first time you want to work with 'reglogit',
# and don't run it thereafter

# install.packages( 'reglogit' )

library( reglogit )

##### read in the data set and examine its structure #####################

pima <- read.csv( 'pima-indians-diabetes.txt' )

str( pima )

# 'data.frame': 768 obs. of 9 variables:
#  $ pregnancies: int  6 1 8 1 0 5 3 10 2 8 ...
#  $ glucose    : int  148 85 183 89 137 116 78 115 197 125 ...
#  $ dbp        : int  72 66 64 66 40 74 50 0 70 96 ...
#  $ skinfold   : int  35 29 0 23 35 0 32 0 45 0 ...
#  $ insulin    : int  0 0 0 94 168 0 88 0 543 0 ...
#  $ bmi        : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
#  $ pedigree   : num  0.627 0.351 0.672 0.167 2.288 ...
#  $ age        : int  50 31 32 21 33 30 26 29 53 54 ...
#  $ diabetes   : int  1 0 1 0 1 0 1 0 1 1 ...

# here if i had more time i would study the predictor variables
# in detail, but i don't (my bad); we're just going to look below
# at the nature of bayesian regularization of logistic regression
# coefficients (typically this involves shrinking them toward a prior mean
# of 0, to achieve a reduction in mean squared error of prediction)

##### prepare the inputs to 'reglogit' ###################################

X <- as.matrix( pima[ , - 9 ] )
y <- as.numeric( pima[ , 9 ] )

# BG: pre-normalize X matrix to match the comparison in the paper
#   Gramacy RB, Polson NG (2012). Simulation-based regularized
#   logistic regression. Bayesian Analysis, 7(3), pp. 567-590;
#   arXiv:1005.3430; http://arxiv.org/abs/1005.3430

one <- rep( 1, nrow( X ) )
normx <- sqrt( drop( one %*% ( X^2 ) ) )
X <- scale( X, F, normx )
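
# (editor's aside, not part of the original analysis: a quick sanity check
#  on the pre-normalization above -- 'normx' holds the Euclidean norm of each
#  column of the raw X, and scale( X, F, normx ) divides each column by that
#  norm, so every column of the rescaled X should now have unit length;
#  'normx.alt' is a name introduced only for this check)

normx.alt <- apply( as.matrix( pima[ , - 9 ] ), 2,
  function( x.j ) sqrt( sum( x.j^2 ) ) )
all.equal( as.numeric( normx ), as.numeric( normx.alt ) )   # should be TRUE

round( sqrt( colSums( X^2 ) ), 6 )   # should all be (numerically) 1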
## compare to the GLM fit

summary( fit.logit <- glm( y ~ X, family = binomial( link = 'logit' ) ) )

# Call:
# glm(formula = y ~ X, family = binomial(link = "logit"))

# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -2.5566  -0.7274  -0.4159   0.7267   2.9297

# Coefficients:
#               Estimate Std. Error z value Pr(>|z|)
# (Intercept)    -8.4047     0.7166 -11.728  < 2e-16 ***
# Xpregnancies   17.4480     4.5436   3.840 0.000123 ***
# Xglucose      121.8551    12.8520   9.481  < 2e-16 ***
# Xdbp          -26.4411    10.4082  -2.540 0.011072 *
# Xskinfold       0.4459     4.9708   0.090 0.928515
# Xinsulin       -4.6273     3.4994  -1.322 0.186065
# Xbmi           81.9056    13.7764   5.945 2.76e-09 ***
# Xpedigree      15.0995     4.7790   3.160 0.001580 **
# Xage           14.5282     9.1208   1.593 0.111192
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 993.48  on 767  degrees of freedom
# Residual deviance: 723.45  on 759  degrees of freedom
# AIC: 741.45

# Number of Fisher Scoring iterations: 5

# note that some of the predictors have coefficients with
# large standard errors (glucose, dbp, bmi); shrinking the
# coefficients toward 0 may well serve to stabilize these
# noisy maximum-likelihood estimates

beta.initial.values <- fit.logit$coefficients

## do the MCMC (Gibbs) sampling with nu = 6

T <- 101000
seed <- 5459224
set.seed( seed )

system.time( results.nu.6 <- reglogit( T, y, X, nu = 6, nup = NULL,
  bstart = beta.initial.values, normalize = F ) )

# round 100
# round 200
# .
# .
# .
# round 101000

#    user  system elapsed
# 1596.98   32.59  304.10

## plot the marginal posterior distributions of the coefficients

n.burnin <- 1000
burnin <- ( 1:n.burnin )

boxplot( results.nu.6$beta[ - burnin, ], main = 'nu = 6',
  ylab = 'posterior', xlab = 'coefficients', bty = 'n',
  names = c( 'mu', paste( 'b', 1:8, sep = '' ) ) )
abline( h = 0, lty = 2 )

## add in the GLM fit and the MAP estimate, with a legend

points( beta.initial.values, col = 2, pch = 17 )
points( results.nu.6$map$beta, pch = 19, col = 3 )
legend( 'topright', c( 'MLE', 'MAP' ), col = 2:3, pch = c( 17, 19 ) )

## simple prediction

predictions.nu.6 <- predict( results.nu.6, XX = X )

str( predictions.nu.6 )

# List of 4
#  $ p  : num [1:768, 1:90900] 0.6915 0.1004 0.6854 0.0905 0.7876 ...
#  $ mp : num [1:768] 0.639 0.077 0.7467 0.0734 0.7699 ...
#  $ c  : num [1:768] 1 0 1 0 1 0 0 0 1 0 ...
#  $ ent: num [1:768] 0.654 0.271 0.566 0.262 0.539 ...

## hit rate

mean( predictions.nu.6$c == y )

# [1] 0.7786458

## now do the MCMC (Gibbs) sampling with nu = 1

T <- 101000
seed <- 5459224
set.seed( seed )

system.time( results.nu.1 <- reglogit( T, y, X, nu = 1, nup = NULL,
  bstart = beta.initial.values, normalize = F ) )

# round 100
# round 200
# .
# .
# .
# round 101000

#    user  system elapsed
# 1602.16   34.17  307.42

## plot the marginal posterior distributions of the coefficients

n.burnin <- 1000
burnin <- ( 1:n.burnin )

boxplot( results.nu.1$beta[ - burnin, ], main = 'nu = 1',
  ylab = 'posterior', xlab = 'coefficients', bty = 'n',
  names = c( 'mu', paste( 'b', 1:8, sep = '' ) ) )
abline( h = 0, lty = 2 )

## add in the GLM fit and the MAP estimate, with a legend

points( beta.initial.values, col = 2, pch = 17 )
points( results.nu.1$map$beta, pch = 19, col = 3 )
legend( 'topright', c( 'MLE', 'MAP' ), col = 2:3, pch = c( 17, 19 ) )

## simple prediction

predictions.nu.1 <- predict( results.nu.1, XX = X )

str( predictions.nu.1 )

# List of 4
#  $ p  : num [1:768, 1:90900] 0.494 0.242 0.604 0.24 0.521 ...
#  $ mp : num [1:768] 0.447 0.23 0.534 0.231 0.444 ...
#  $ c  : num [1:768] 0 0 1 0 0 0 0 0 1 0 ...
#  $ ent: num [1:768] 0.688 0.539 0.691 0.541 0.687 ...

## hit rate

mean( predictions.nu.1$c == y )

# [1] 0.7083333
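
# (editor's aside, not part of the original analysis: for comparison with the
#  two regularized hit rates above, here is the in-sample hit rate of the
#  plain maximum-likelihood fit, classifying at a probability cutoff of 0.5 --
#  the same cutoff the 'reglogit' predictions appear to use, judging from the
#  'mp' and 'c' values printed above)

p.hat.mle <- predict( fit.logit, type = 'response' )
mean( as.numeric( p.hat.mle > 0.5 ) == y )

# with this the three in-sample hit rates (MLE, nu = 6, nu = 1) can be put
# side by side; note that the nu = 1 run pulls the fitted probabilities much
# closer together (compare the 'mp' values above) and classifies noticeably
# worse in sample (0.708 versus 0.779 for nu = 6)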