# R code for a partial analysis of the spam data set

################## the first block of code starts here ########################

##### data set context and description ########################################

# see the file

#   'spam-data-context.txt'

# on the course web page for context and data description

# in this case study
# we have 4601 email messages, each marked 'spam' or non-spam ('ham')
# in a gold-standard manner, and 58 other variables:

# 57 of them are potential predictors of the dichotomous (1/0) outcome
# 'spam', and 1 is a dichotomy that records whether each email message
# belongs to the randomly-partitioned modeling subset or to the
# validation subset, for cross-validation purposes

# note: in machine learning the usual terminology is 'training' for
# what i call 'modeling' and 'test' for what i call 'validation',
# and where statisticians would say 'predictor variable',
# machine learners would usually say 'feature' or 'attribute'

##### preamble ################################################################

# if relevant on your machine (e.g., Windows people using R instead
# of RStudio), unbuffer the output

# set your working directory to the place where you downloaded
# the data file 'spam.txt' and the file 'spam-analysis-functions-R.txt'

# on my desktop at home this is

# setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Spam-Data-Analysis' )

# the next command reads in code for some functions we'll use below

source( 'spam-analysis-functions-R.txt' )

# the next command calls a function defined in
# 'spam-analysis-functions-R.txt' that automatically
# installs a package (if it has not previously been installed)
# and brings it into the current R session with 'library'; if it
# *has* already been installed, the function just invokes 'library'
# (please ignore its incorrect warning message if it issues one)

dynamic.require( 'gtools' )
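# aside: 'spam-analysis-functions-R.txt' is not reproduced in this file;
# as a rough guide only, here is a minimal hypothetical sketch of how a
# function with behavior like 'dynamic.require' could be written (the
# name 'dynamic.require.sketch' is used so as not to clobber the real
# definition, which lives in the sourced file):

dynamic.require.sketch <- function( package.name ) {

  # install the package only if it's not already available,
  # then attach it to the search path with 'library'

  if ( ! requireNamespace( package.name, quietly = TRUE ) ) {

    install.packages( package.name )

  }

  library( package.name, character.only = TRUE )

}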
##### read in the data set and examine its structure ##########################

raw.data <- read.table( 'spam.txt', header = T )

str( raw.data )

# 'data.frame':   4601 obs. of  59 variables:
#  $ spam      : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
#  $ testid    : logi  TRUE FALSE TRUE FALSE FALSE FALSE ...
#  $ make      : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
#  $ address   : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
#  $ all       : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
#  $ X3d       : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ our       : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
#  $ over      : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
#  $ remove    : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
#  $ internet  : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
#  $ order     : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
#  $ mail      : num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
#  $ receive   : num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
#  $ will      : num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
#  $ people    : num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
#  $ report    : num  0 0.21 0 0 0 0 0 0 0 0 ...
#  $ addresses : num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
#  $ free      : num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
#  $ business  : num  0 0.07 0.06 0 0 0 0 0 0 0 ...
#  $ email     : num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
#  $ you       : num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
#  $ credit    : num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
#  $ your      : num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
#  $ font      : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ X000      : num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
#  $ money     : num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
#  $ hp        : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ hpl       : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ george    : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ X650      : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ lab       : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ labs      : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ telnet    : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ X857      : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ data      : num  0 0 0 0 0 0 0 0 0.15 0 ...
#  $ X415      : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ X85       : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ technology: num  0 0 0 0 0 0 0 0 0 0 ...
#  $ X1999     : num  0 0.07 0 0 0 0 0 0 0 0 ...
#  $ parts     : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ pm        : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ direct    : num  0 0 0.06 0 0 0 0 0 0 0 ...
#  $ cs        : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ meeting   : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ original  : num  0 0 0.12 0 0 0 0 0 0.3 0 ...
#  $ project   : num  0 0 0 0 0 0 0 0 0 0.06 ...
#  $ re        : num  0 0 0.06 0 0 0 0 0 0 0 ...
#  $ edu       : num  0 0 0.06 0 0 0 0 0 0 0 ...
#  $ table     : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ conference: num  0 0 0 0 0 0 0 0 0 0 ...
#  $ ch.       : num  0 0 0.01 0 0 0 0 0 0 0.04 ...
#  $ ch..1     : num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
#  $ ch..2     : num  0 0 0 0 0 0 0 0 0 0 ...
#  $ ch..3     : num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
#  $ ch..4     : num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
#  $ ch..5     : num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
#  $ crl.ave   : num  3.76 5.11 9.82 3.54 3.54 ...
#  $ crl.long  : int  61 101 485 40 40 15 4 11 445 43 ...
#  $ crl.tot   : int  278 1028 2259 191 191 54 112 49 1257 749 ...

# the 54 predictors from 'make' to 'ch..5' are expressed as percentages
# from 0 to 100

# in 'crl.ave', 'crl.long', and 'crl.tot', the character string
# 'crl' is short for 'capital run length'; in other words, 'crl.ave'
# is the average length in each email message of consecutive runs
# of capital letters, 'crl.long' is the longest run of consecutive
# capital letters in each message, and 'crl.tot' is the total
# number of capital letters appearing in such runs in each message

# note: ch.   = ;
#       ch..1 = (
#       ch..2 = [
#       ch..3 = !
#       ch..4 = $
#       ch..5 = #

##### data curation preceding the modeling, part (1) ##########################

sum( is.na( raw.data ) )

# [1] 0

# no missingness anywhere: excellent

print( n <- nrow( raw.data ) )

# [1] 4601

# number of documents (email messages)

# the next command recodes the outcome variable to numeric from logical

raw.data$spam <- ifelse( raw.data$spam == T, 1, 0 )

# the next command computes the relative frequency distribution of 'spam'

with( raw.data, table( spam ) / n )

# spam
#         0         1
# 0.6059552 0.3940448

# so we have about 39% 1s and 61% 0s; that should be enough of each
# category to have a decent chance of predicting the 1s

# the next command computes the relative frequency distribution of 'testid'

with( raw.data, table( testid ) / n )

# testid
#     FALSE      TRUE
# 0.6661595 0.3338405

# the people who created this data set put almost exactly 2/3
# of the data in the modeling subset ('testid' = F) and 1/3
# in the validation subset ('testid' = T); this is a reasonable choice

# another reasonable choice would be 50/50; the research literature
# is still unsettled on the question of the optimal modeling/validation
# split

# the next four commands extract the modeling and validation subsets;
# our plan is to build predictive models on the 'modeling' data
# and see how well they perform on the 'validation' data
# ('nrow' counts the rows of a data frame; 'length' would count
# its columns)

raw.data.modeling <- raw.data[ ! raw.data$testid, ]

n.modeling <- nrow( raw.data.modeling )

raw.data.validation <- raw.data[ raw.data$testid, ]

n.validation <- nrow( raw.data.validation )
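# aside: the split is provided with this data set, but if you had to
# create such an indicator yourself, one simple way would be the sketch
# below ('testid.hypothetical' is illustrative only and is not used
# anywhere in what follows):

set.seed( 4601 )   # arbitrary seed, so that the split is reproducible

testid.hypothetical <- ( runif( n ) < 1 / 3 )

mean( testid.hypothetical )   # this should be close to 1/3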
##### data curation preceding the modeling, part (2) ##########################

##### initial model-building on the 'modeling' subset #########################

# at this point in careful statistical data science work, we would
# examine each predictor variable one by one, to see if
# *transformed* versions of any of the variables might have
# more predictive signal in them than the raw versions do

# since this is quite time-consuming here with 57 predictors,
# i'm only going to ask you to examine several of them, to give you
# a sense of how this process goes

# we want to make numerical and graphical descriptive summaries
# (1) of all of the predictors univariately and (2) of the
# bivariate relationships between each of the predictors and the outcome

# the built-in function 'summary' is a good place to start

# please examine the file

#   'spam-data-analysis-variable-summary.txt'

# on the course web page, which includes a summary of all 59 variables
# obtained with the command

#   summary( raw.data.modeling )

################## the first block of code ends here ##########################

################## the second block of code begins here #######################

# to see all 57 histograms of the predictor variables below,
# you may need to adjust the choice of 'n.histogram.matrix'
# and fiddle with the rest of the histogram code
# if your plotting window is too small, or you could try
# resizing your plotting window to make it bigger

# now run only plotting code block 1 and examine the plot

##### plotting code block 1 begins on this line ###############################

n.histogram.matrix <- 5

par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) )

for ( i in 1:n.histogram.matrix^2 ) {

  hist( raw.data.modeling[ , i + 2 ], main = '', prob = T,
    xlab = names( raw.data )[ i + 2 ] )

}

par( mfrow = c( 1, 1 ) )

##### plotting code block 1 ends on this line #################################

# after examining the first set of plots, run plotting code block 2

##### plotting code block 2 begins on this line ###############################

par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) )

for ( i in ( n.histogram.matrix^2 + 1 ):( 2 * n.histogram.matrix^2 ) ) {

  hist( raw.data.modeling[ , i + 2 ], main = '', prob = T,
    xlab = names( raw.data )[ i + 2 ] )

}

par( mfrow = c( 1, 1 ) )

##### plotting code block 2 ends on this line #################################

# and after examining the second set of plots, run plotting code block 3

##### plotting code block 3 begins on this line ###############################

par( mfrow = c( 3, 3 ) )

for ( i in ( 2 * n.histogram.matrix^2 + 1 ):( ncol( raw.data ) - 2 ) ) {

  hist( raw.data.modeling[ , i + 2 ], main = '', prob = T,
    xlab = names( raw.data )[ i + 2 ] )

}

par( mfrow = c( 1, 1 ) )

##### plotting code block 3 ends on this line #################################

################## the second block of code ends here #########################

################## the third block of code begins here ########################

# the next loop computes the correlation between the outcome 'spam'
# and each of the k = 57 predictors in the modeling subset; the
# commands after it then print the results, sorted on absolute
# correlation (note that the vector needs k slots, one per predictor)

k <- 57

correlation.vector <- rep( NA, k )

for ( i in 1:k ) {

  correlation.vector[ i ] <- cor( raw.data.modeling$spam,
    raw.data.modeling[ , i + 2 ] )

}

correlation.table <- cbind( names( raw.data.modeling[ , 3:( k + 2 ) ] ),
  signif( correlation.vector, 4 ) )

correlation.table[ order( abs( correlation.vector ), decreasing = T ), ]

#       [,1]         [,2]
#  [1,] "your"       "0.4012"
#  [2,] "remove"     "0.3421"
#  [3,] "free"       "0.3381"
#  [4,] "X000"       "0.3369"
#  [5,] "ch..4"      "0.3271"
#  [6,] "you"        "0.2778"
#  [7,] "business"   "0.2724"
#  [8,] "hp"         "-0.2646"
#  [9,] "receive"    "0.2467"
# [10,] "over"       "0.2427"

# .
# .
# .

# [48,] "ch..2"      "-0.06692"
# [49,] "ch..5"      "0.06583"
# [50,] "report"     "0.0648"
# [51,] "direct"     "-0.06262"
# [52,] "X3d"        "0.05758"
# [53,] "ch."        "-0.05623"
# [54,] "table"      "-0.04498"
# [55,] "address"    "-0.04102"
# [56,] "parts"      "-0.03357"
# [57,] "will"       "0.007804"
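# aside: the same correlations can be computed without an explicit
# loop, because 'cor' accepts a matrix or data frame as its second
# argument and returns one correlation per column; a one-line
# alternative sketch ('correlation.vector.alt' is not used below):

correlation.vector.alt <- as.numeric( cor( raw.data.modeling$spam,
  raw.data.modeling[ , 3:( k + 2 ) ] ) )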
"0.3381" # [4,] "X000" "0.3369" # [5,] "ch..4" "0.3271" # [6,] "you" "0.2778" # [7,] "business" "0.2724" # [8,] "hp" "-0.2646" # [9,] "receive" "0.2467" # [10,] "over" "0.2427" # # . # . # . # # [48,] "ch..2" "-0.06692" # [49,] "ch..5" "0.06583" # [50,] "report" "0.0648" # [51,] "direct" "-0.06262" # [52,] "X3d" "0.05758" # [53,] "ch." "-0.05623" # [54,] "table" "-0.04498" # [55,] "address" "-0.04102" # [56,] "parts" "-0.03357" # [57,] "will" "0.007804" ################## the third block of code ends here ###################### ################## the fourth block of code begins here ################### ##### run this first sub-block, stop and examine the output ############### # long-right-hand-tailed variables that (a) take on only positive values # and (b) have a mode to the right of 0 can sometimes be transformed # to Normality by taking logs: if the mode of such a variable is positive, # the log transform may well work; if not, the transformed variable # will have a spike in the left tail, with the rest of the transformed # values perhaps looking Normal # this is an instance of method (I) in the problem statement # for finding extra predictive signal # let's look at the variable 'make' with( raw.data.modeling, hist( make, breaks = 100, prob = T, main = '' ) ) with( raw.data.modeling, sum( make == 0 ) / n ) # [1] 0.5090198 # this shows that the mode of 'make' is 0 # if we were to try to compute log( make ), 50.9% of the values # would be ( - infinity ) with( raw.data.modeling, min( make[ make > 0 ] ) ) # [1] 0.01 # a standard thing to try with a variable such as 'make' # is the transformation # x -> log( x + C ), # where C is the smallest positive value x takes on log.make.modeling <- log( raw.data.modeling$make + 0.01 ) hist( log.make.modeling, breaks = 100, prob = T, main = '', xlab = 'log( make + 0.01 )' ) # we see the anticipated shape # a good bit of the predictive signal from such a variable x # can be extracted by (a) making an indicator variable that's 1 # when x is 0 and (b) predicting y from both log( x + C ) and # the indicator make.is.0.modeling <- ifelse( raw.data.modeling$make == 0, 1, 0 ) tab.sum( make.is.0.modeling, raw.data.modeling$spam ) # make.is.0.modeling n mean sd # [1,] 0 723 0.593361 0.4915464 # [2,] 1 2342 0.3368915 0.4727484 # [3,] "Total" 3065 0.3973899 0.4894378 # you can see that the indicator variable for ( make = 0 ) # by itself has a lot of predictive signal # here are the results of a logistic regression of 'spam' # just on 'make' (note that it's always good form to store # the results of the regression fitting in an object): logistic.regression.of.spam.on.make <- glm( raw.data.modeling$spam ~ raw.data.modeling$make, family = binomial( link = 'logit' ) ) summary( logistic.regression.of.spam.on.make ) # Call: # glm(formula = raw.data.modeling$spam ~ raw.data.modeling$make, # family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -2.4152 -0.9729 -0.9729 1.3967 1.3967 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -0.50213 0.03967 -12.66 < 2e-16 *** # raw.data.modeling$make 0.77487 0.13201 5.87 4.36e-09 *** # --- # Signif. 
# here are the results of a logistic regression of 'spam'
# just on 'make' (note that it's always good form to store
# the results of the regression fitting in an object):

logistic.regression.of.spam.on.make <- glm( raw.data.modeling$spam ~
  raw.data.modeling$make, family = binomial( link = 'logit' ) )

summary( logistic.regression.of.spam.on.make )

# Call:
# glm(formula = raw.data.modeling$spam ~ raw.data.modeling$make,
#     family = binomial(link = 'logit'))

# Deviance Residuals:
#     Min      1Q  Median      3Q     Max
# -2.4152 -0.9729 -0.9729  1.3967  1.3967

# Coefficients:
#                         Estimate Std. Error z value Pr(>|z|)
# (Intercept)             -0.50213    0.03967  -12.66  < 2e-16 ***
# raw.data.modeling$make   0.77487    0.13201    5.87 4.36e-09 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 4119  on 3064  degrees of freedom
# Residual deviance: 4078  on 3063  degrees of freedom
# AIC: 4082

# Number of Fisher Scoring iterations: 4

# there are logistic regression situations in which maximum-likelihood (ML)
# fitting fails, because the log-likelihood function goes to + infinity
# when one or more of the estimated beta coefficients tends to
# infinity in absolute value; this may occur when some of the p.hat
# values from the ML fit are identically 1 or 0 (or close enough to 1 or 0
# to be essentially 1 or 0 in practice), because

#   logit( 1 ) = + infinity and logit( 0 ) = - infinity

# glm tries to cope with this by enforcing a condition of the form

#   - boundary < beta.hat < + boundary,

# where 'boundary' is a value that represents 'essentially infinite'

# this means that we always need to follow a call to glm by
# asking it two questions: do you think that the Fisher scoring algorithm
# converged? and did it converge to a vector of beta.hat values
# in which one or more of the beta.hats touched the boundary?

# having previously stored the results of the glm fitting in an object
# (here 'logistic.regression.of.spam.on.make'), this can be done
# with the following command:

print( paste( 'converged?', logistic.regression.of.spam.on.make$converged,
  'boundary?', logistic.regression.of.spam.on.make$boundary ) )

# [1] "converged? TRUE boundary? FALSE"

# this means that everything is ok: convergence occurred,
# and not to the boundary

############# the first sub-block ends here ###################################

# when you've absorbed the output of the first sub-block,

##### run this second sub-block, stop and examine the output ##################

# and here are the results of a logistic regression of 'spam'
# on log( 'make' + 0.01 ) and 'make.is.0':

logistic.regression.of.spam.on.log.make.and.make.is.0 <-
  glm( raw.data.modeling$spam ~ log.make.modeling + make.is.0.modeling,
    family = binomial( link = 'logit' ) )

summary( logistic.regression.of.spam.on.log.make.and.make.is.0 )

# Call:
# glm(formula = raw.data.modeling$spam ~ log.make.modeling +
#     make.is.0.modeling, family = binomial(link = 'logit'))

# Deviance Residuals:
#     Min      1Q  Median      3Q     Max
# -1.5085 -0.9064 -0.9064  1.4751  1.4751

# Coefficients:
#                    Estimate Std. Error z value Pr(>|z|)
# (Intercept)         0.54374    0.12340   4.406 1.05e-05 ***
# log.make.modeling   0.14127    0.08209   1.721   0.0853 .
# make.is.0.modeling -0.57034    0.29408  -1.939   0.0524 .
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 4119.0  on 3064  degrees of freedom
# Residual deviance: 3966.8  on 3062  degrees of freedom
# AIC: 3972.8

# Number of Fisher Scoring iterations: 4

print( paste( 'converged?',
  logistic.regression.of.spam.on.log.make.and.make.is.0$converged,
  'boundary?',
  logistic.regression.of.spam.on.log.make.and.make.is.0$boundary ) )

# [1] "converged? TRUE boundary? FALSE"

################## the second sub-block ends here #############################

################## the fourth block of code ends here #########################

################## the fifth block of code begins here ########################

# a different method for extracting more predictive signal
# that often works is to transform a continuous variable
# such as 'make' into a set of categorical indicators,
# by cutting it at (for example) its deciles (10th percentile,
# 20th percentile, ..., 100th percentile)

# this permits non-linear relationships between the predictor
# and outcome to be explored without having to specify
# the nature of the non-linearity

# this is an instance of method (II) in the problem statement
# for finding extra predictive signal

# i've written a function called

#   'logistic.regression.univariate.exploration'

# that does this; i included it in the file 'spam-analysis-functions-R.txt'
# that you 'source'd at the beginning of this analysis

# the syntax of this function is as follows:

#   logistic.regression.univariate.exploration( x, y,
#     n.bootstrap.replicates, n.cutpoints )

# 'x' is the predictor; 'y' is the binary outcome; 'n.bootstrap.replicates'
# is the desired number of lowess bootstrap curves in the plot; and
# 'n.cutpoints' is the desired number of categories defined by the
# quantiles of 'x' (if a single value of 'x' occurs with high frequency,
# you will get fewer categories; for example, 'make' is exactly 0
# about 76% of the time, so when i ask for 10 groups cut at the deciles
# of 'make', only 3 groups are defined: (0th to 80th percentile),
# (80th to 90th), and (90th to 100th))

# a sketch of the core idea behind this function appears next
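# as with the other sourced helpers, the real definition lives in
# 'spam-analysis-functions-R.txt' and is more elaborate than this;
# the following minimal hypothetical sketch shows the two main
# ingredients: bootstrap lowess curves of y against x, and a logistic
# regression of y on a quantile-grouped version of x

logistic.regression.univariate.exploration.sketch <- function( x, y,
  n.bootstrap.replicates, n.cutpoints ) {

  # (1) plot y against x with lowess smooths of bootstrap resamples,
  #     to visualize the (possibly non-linear) relationship and the
  #     uncertainty about it

  plot( x, y, pch = '.' )

  n <- length( x )

  for ( i in 1:n.bootstrap.replicates ) {

    b <- sample( n, n, replace = TRUE )

    lines( lowess( x[ b ], y[ b ] ), col = 'grey' )

  }

  lines( lowess( x, y ), lwd = 2 )

  # (2) cut x at its quantiles; 'unique' collapses duplicate
  #     quantiles, which is why a predictor with many tied values
  #     yields fewer groups than requested

  cutpoints <- unique( quantile( x, probs = ( 0:n.cutpoints ) / n.cutpoints ) )

  x.grouped <- cut( x, breaks = cutpoints, include.lowest = TRUE )

  # (3) fit and summarize a logistic regression of y on the grouped
  #     version of x, so that non-linearity can show up freely

  summary( glm( y ~ x.grouped, family = binomial( link = 'logit' ) ) )

}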
# now run only exploration code block 1 and examine the output

##### exploration code block 1 begins on this line ############################

# the next command examines the predictor 'make' with 10 requested
# quantile cuts

logistic.regression.univariate.exploration( raw.data.modeling$make,
  raw.data.modeling$spam, 100, 10 )

##### exploration code block 1 ends on this line ##############################

# after studying the first set of outputs, run exploration code block 2

##### exploration code block 2 begins on this line ############################

# the next command examines 'make' with 20 requested quantile cuts

logistic.regression.univariate.exploration( raw.data.modeling$make,
  raw.data.modeling$spam, 100, 20 )

##### exploration code block 2 ends on this line ##############################

# after studying the second set of outputs, run exploration code block 3

##### exploration code block 3 begins on this line ############################

# the next command examines the predictor 'your' with 10 requested
# quantile cuts

logistic.regression.univariate.exploration( raw.data.modeling$your,
  raw.data.modeling$spam, 100, 10 )

##### exploration code block 3 ends on this line ##############################

################## the fifth block of code ends here ##########################

# at this point, if we had enough time, we would transform all
# of the predictors to extract maximum predictive signal
# before fitting a version of the full model with all predictors included

# but i've run out of time (my bad), so we're now just going to
# fit the full model without taking advantage of the extra signal
# from transformations, because something happens that's worth knowing about
# when fitting GLMs to binary outcomes

################## the sixth block of code begins here ########################

# i'm now going to transform all of the predictors to have mean 0
# and sd 1 (this is a form of *standardization*); this puts all
# of the beta coefficients (except the intercept, which doesn't need
# standardizing) on the same scale, so that they become directly
# comparable -- after this standardization, it's meaningful
# to sort the predictors on the absolute value of their beta.hats
# (a small sketch of such a sorting appears after the model fit below)
# the next 3 commands create a new data frame in which
# all of the predictors have been standardized

raw.X.modeling <- raw.data.modeling[ , 3:( k + 2 ) ]

scaled.X.modeling.data.frame <- as.data.frame( cbind( raw.data.modeling$spam,
  scale( raw.X.modeling ) ) )

names( scaled.X.modeling.data.frame )[ 1 ] <- 'spam'

summary( full.model.all.x.scaled.modeling <- glm( spam ~ .,
  data = scaled.X.modeling.data.frame, family = binomial( link = 'logit' ) ) )

# Call:
# glm(formula = spam ~ ., family = binomial(link = "logit"),
#     data = scaled.X.modeling.data.frame)

# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -4.4162  -0.1754   0.0000   0.0959   3.6230

# Coefficients:
#              Estimate Std. Error z value Pr(>|z|)
# (Intercept) -19.15048    2.91169  -6.577 4.80e-11 ***
# make         -0.05740    0.08437  -0.680 0.496312
# address      -0.19613    0.11235  -1.746 0.080860 .
# all           0.11289    0.07528   1.500 0.133705
# X3d           3.23705    2.52415   1.282 0.199690
# our           0.26636    0.07285   3.656 0.000256 ***

# .
# .
# .

# ch..4         1.74436    0.25784   6.765 1.33e-11 ***
# ch..5         1.13535    0.77745   1.460 0.144193
# crl.ave       1.00481    0.80115   1.254 0.209766
# crl.long      2.72019    0.79835   3.407 0.000656 ***
# crl.tot       0.35138    0.17477   2.011 0.044367 *
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 4119  on 3064  degrees of freedom
# Residual deviance: 1125  on 3007  degrees of freedom
# AIC: 1241

# Number of Fisher Scoring iterations: 13

# Warning message:
# glm.fit: fitted probabilities numerically 0 or 1 occurred

# what this warning may be telling us is that the maximum-likelihood
# fitting process has bumped up against the boundary for one or more
# of the beta.hats; let's find out

print( paste( 'converged?', full.model.all.x.scaled.modeling$converged,
  'boundary?', full.model.all.x.scaled.modeling$boundary ) )

# [1] "converged? TRUE boundary? FALSE"

# so we don't have a boundary problem; we *do* have several predictors
# with massive beta.hats after standardization, but some of these
# predictors also have massive beta.hat standard errors;
# bayesian logistic regression with a regularizing prior will help
# to stabilize maximum-likelihood estimation here

# i've run out of time to demonstrate that to you here (my bad)
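# as promised above, here is one quick illustrative way to rank the
# standardized predictors on the absolute values of their estimated
# coefficients (dropping the intercept):

beta.hat <- coef( full.model.all.x.scaled.modeling )[ -1 ]

round( sort( abs( beta.hat ), decreasing = TRUE )[ 1:10 ], 3 )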
################## the sixth block of code ends here ##########################

################## the seventh block of code begins here ######################

##### run this first sub-block, stop and examine the output ###################

# instead let's finish by (a) seeing how good the p.hats
# *appear to be* internal to the modeling data set,
# and (b) seeing how well the full model fit to the modeling data set
# validates on the validation data set, by examining the p.hats
# obtained by using the modeling-data beta.hats *on the validation data*

p.hat.full.model.all.x.scaled.modeling <-
  predict( full.model.all.x.scaled.modeling, type = 'response' )

par( mfrow = c( 2, 1 ) )

hist( p.hat.full.model.all.x.scaled.modeling[
  scaled.X.modeling.data.frame$spam == 0 ], nclass = 100, main = 'y = 0',
  xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

hist( p.hat.full.model.all.x.scaled.modeling[
  scaled.X.modeling.data.frame$spam == 1 ], nclass = 100, main = 'y = 1',
  xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

# the next command reports the mean p.hat among the ( y = 0 ) messages,
# the mean p.hat among the ( y = 1 ) messages, and the difference
# between those two means

c( mean( p.hat.full.model.all.x.scaled.modeling[
    scaled.X.modeling.data.frame$spam == 0 ] ),
  mean( p.hat.full.model.all.x.scaled.modeling[
    scaled.X.modeling.data.frame$spam == 1 ] ),
  mean( p.hat.full.model.all.x.scaled.modeling[
    scaled.X.modeling.data.frame$spam == 1 ] ) -
  mean( p.hat.full.model.all.x.scaled.modeling[
    scaled.X.modeling.data.frame$spam == 0 ] ) )

# [1] 0.09029248 0.86307864 0.77278616

################### the first sub-block ends here #############################

# when you've absorbed the output of the first sub-block,

##### run this second sub-block, stop and examine the output ##################

# now standardize the predictors in the validation subset

raw.X.validation <- raw.data.validation[ , 3:( k + 2 ) ]

scaled.X.validation.data.frame <- as.data.frame( cbind(
  raw.data.validation$spam, scale( raw.X.validation ) ) )

names( scaled.X.validation.data.frame )[ 1 ] <- 'spam'

# and use the *modeling* beta.hats to create p.hats based on
# the *validation* predictors (note that the argument that hands
# new predictor values to 'predict' is called 'newdata', not 'data'):

p.hat.full.model.all.x.scaled.validation <-
  predict( full.model.all.x.scaled.modeling, type = 'response',
    newdata = scaled.X.validation.data.frame )

par( mfrow = c( 2, 1 ) )

hist( p.hat.full.model.all.x.scaled.validation[
  scaled.X.validation.data.frame$spam == 0 ], nclass = 100, main = 'y = 0',
  xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

hist( p.hat.full.model.all.x.scaled.validation[
  scaled.X.validation.data.frame$spam == 1 ], nclass = 100, main = 'y = 1',
  xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

c( mean( p.hat.full.model.all.x.scaled.validation[
    scaled.X.validation.data.frame$spam == 0 ] ),
  mean( p.hat.full.model.all.x.scaled.validation[
    scaled.X.validation.data.frame$spam == 1 ] ),
  mean( p.hat.full.model.all.x.scaled.validation[
    scaled.X.validation.data.frame$spam == 1 ] ) -
  mean( p.hat.full.model.all.x.scaled.validation[
    scaled.X.validation.data.frame$spam == 0 ] ) )

# [1] 0.3526234 0.4679253 0.1153018

# the separation between the two groups of p.hats is far smaller
# out of sample (0.115) than in sample (0.773): the full 57-predictor
# model over-fits the modeling data

################### the second sub-block ends here ############################

################## the seventh block of code ends here ########################
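# two optional asides, sketched here:

# (a) above, the validation predictors were standardized with their
# *own* means and sds; for pure prediction it is arguably better to
# reuse the *modeling* means and sds, which 'scale' stores as
# attributes of its result:

scaled.X.modeling.matrix <- scale( raw.X.modeling )

scaled.X.validation.modeling.scale <- as.data.frame( scale(
  raw.X.validation,
  center = attr( scaled.X.modeling.matrix, 'scaled:center' ),
  scale = attr( scaled.X.modeling.matrix, 'scaled:scale' ) ) )

# (b) a simple classification error rate at a 0.5 cutoff makes the
# in-sample versus out-of-sample contrast numerical as well as visual:

mean( ( p.hat.full.model.all.x.scaled.modeling > 0.5 ) !=
  scaled.X.modeling.data.frame$spam )

mean( ( p.hat.full.model.all.x.scaled.validation > 0.5 ) !=
  scaled.X.validation.data.frame$spam )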
# a regularized model with far fewer predictors should exhibit
# much better out-of-sample predictive validation

# much more to do here, but out of time (my bad)
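# for readers who want to try the regularization idea, here is a
# minimal sketch using the lasso penalty in the 'glmnet' package
# (one possible implementation, not the bayesian-prior approach
# mentioned above), with the penalty weight lambda chosen by
# cross-validation on the modeling subset:

dynamic.require( 'glmnet' )

cv.lasso <- cv.glmnet( as.matrix( raw.X.modeling ),
  raw.data.modeling$spam, family = 'binomial' )

# predicted probabilities on the validation subset, at the
# cross-validated choice of lambda

p.hat.lasso.validation <- predict( cv.lasso,
  newx = as.matrix( raw.X.validation ), type = 'response',
  s = 'lambda.min' )

# out-of-sample classification error rate at a 0.5 cutoff

mean( ( p.hat.lasso.validation > 0.5 ) != raw.data.validation$spam )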