# R code for a partial analysis of the spam data set ################## the first block of code starts here ######################## ##### data set context and description ######################################## # see the file # 'spam-data-context.txt' # on the course web page for context and data description # in this case study # we have 4601 email messages, each marked 'spam' or non-spam ('ham') # in a gold-standard manner, and 58 other variables: # 57 of them are potential predictors of the dichotomous (1/0) outcome # 'spam', and 1 is a dichotomy that records whether each email message # belongs either to the randomly-partitioned modeling or validation # subsets for cross-validation purposes # note: in machine learning the usual terminology is 'training' for # what i call 'modeling' and 'test' for what i call 'validation', # and where statisticians would say 'predictor variable', # machine learners would usually say 'feature' or 'attribute' ##### preamble ################################################################ # if relevant on your machine (e.g., Windows people using R instead # of RStudio), unbuffer the output # set your working directory to the place where you downloaded # the data file 'spam.csv' and the file 'spam-analysis-functions-R.txt' # on my desktop at home this is # setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Spam-Data-Analysis' ) # the next command reads in code for some functions we'll use below source( 'spam-analysis-functions-R.txt' ) # the next command calls a function defined in # 'spam-analysis-functions-R.txt' that automatically # installs a package (if it has not previously been installed) # and brings it into the current R session with 'library'; if it # *has* already been installed the function just invokes 'library' # (please ignore its incorrect warning message if it issues one) dynamic.require( 'gtools' ) ##### read in the data set and examine its structure ########################## raw.data <- read.table( 'spam.txt', header = T ) str( raw.data ) # 'data.frame': 4601 obs. of 59 variables: # $ spam : logi TRUE TRUE TRUE TRUE TRUE TRUE ... # $ testid : logi TRUE FALSE TRUE FALSE FALSE FALSE ... # $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ... # $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ... # $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ... # $ X3d : num 0 0 0 0 0 0 0 0 0 0 ... # $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ... # $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ... # $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ... # $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ... # $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ... # $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ... # $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ... # $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ... # $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ... # $ report : num 0 0.21 0 0 0 0 0 0 0 0 ... # $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ... # $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ... # $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ... # $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ... # $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ... # $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ... # $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ... # $ font : num 0 0 0 0 0 0 0 0 0 0 ... # $ X000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ... # $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ... # $ hp : num 0 0 0 0 0 0 0 0 0 0 ... # $ hpl : num 0 0 0 0 0 0 0 0 0 0 ... # $ george : num 0 0 0 0 0 0 0 0 0 0 ... # $ X650 : num 0 0 0 0 0 0 0 0 0 0 ... # $ lab : num 0 0 0 0 0 0 0 0 0 0 ... # $ labs : num 0 0 0 0 0 0 0 0 0 0 ... # $ telnet : num 0 0 0 0 0 0 0 0 0 0 ... # $ X857 : num 0 0 0 0 0 0 0 0 0 0 ... # $ data : num 0 0 0 0 0 0 0 0 0.15 0 ... # $ X415 : num 0 0 0 0 0 0 0 0 0 0 ... # $ X85 : num 0 0 0 0 0 0 0 0 0 0 ... # $ technology: num 0 0 0 0 0 0 0 0 0 0 ... # $ X1999 : num 0 0.07 0 0 0 0 0 0 0 0 ... # $ parts : num 0 0 0 0 0 0 0 0 0 0 ... # $ pm : num 0 0 0 0 0 0 0 0 0 0 ... # $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ cs : num 0 0 0 0 0 0 0 0 0 0 ... # $ meeting : num 0 0 0 0 0 0 0 0 0 0 ... # $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ... # $ project : num 0 0 0 0 0 0 0 0 0 0.06 ... # $ re : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ... # $ table : num 0 0 0 0 0 0 0 0 0 0 ... # $ conference: num 0 0 0 0 0 0 0 0 0 0 ... # $ ch. : num 0 0 0.01 0 0 0 0 0 0 0.04 ... # $ ch..1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ... # $ ch..2 : num 0 0 0 0 0 0 0 0 0 0 ... # $ ch..3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ... # $ ch..4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ... # $ ch..5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ... # $ crl.ave : num 3.76 5.11 9.82 3.54 3.54 ... # $ crl.long : int 61 101 485 40 40 15 4 11 445 43 ... # $ crl.tot : int 278 1028 2259 191 191 54 112 49 1257 749 ... # the 54 predictors from 'make' to 'ch..5' are expressed as percentages # from 0 to 100 # in 'crl.ave', 'crl.long', and 'crl.tot', the character string # 'crl' is short for 'capital run length'; in other words, 'crl.ave' # is the average length in each email message of consecutive runs # of capital letters, 'crl.long' is the longest run of consecutive # capital letters in the email message, and 'crl.tot' is the total # number of consecutive capital letters in each message # note: ch. = ; # ch..1 = ( # ch..2 = [ # ch..3 = ! # ch..4 = $ # ch..5 = # ##### data curation preceding the modeling, part (1) ########################## sum( is.na( raw.data ) ) # [1] 0 # no missingness anywhere: excellent print( n <- nrow( raw.data ) ) # [1] 4601 # number of documents (email messages) # the next command recodes the outcome variable to numeric from logical raw.data$spam <- ifelse( raw.data$spam == T, 1, 0 ) # the next command computes the relative frequency distribution of 'spam' with( raw.data, table( spam ) / n ) # spam # 0 1 # 0.6059552 0.3940448 # so we have about 39% 1s and 61% 0s; that should be enough of each # category to have a decent chance of predicting the 1s # the next command computes the relative frequency distribution of 'testid' with( raw.data, table( spam, testid ) / n ) # testid # FALSE TRUE # 0.6661595 0.3338405 # the people who created this data set put almost exactly 2/3 # of the data in the modeling subset ('testid' = F) and 1/3 # in the validation subset ('testid' = T); this is a reasonable choice # another reasonable choice would be 50/50; the research literature # is still unsettled over the question of the optimal modeling/validation # split # the next two commands extract the modeling and validation subsets; # our plan is to buidl predictive models on the 'modeling' data # and see how well they perform on the 'validation' data raw.data.modeling <- raw.data[ ! raw.data$testid, ] n.modeling <- length( raw.data.modeling ) raw.data.validation <- raw.data[ raw.data$testid, ] n.validation <- length( raw.data.validation ) ##### data curation preceding the modeling, part (2) ########################## ##### initial model-building on the 'modeling' subset ######################### # at this point in careful statistical data science work, we would # examine each predictor variable one by one, to see if # *transformed* versions of any of the variables might have # more predictive signal in them than the raw versions do # since this is quite time-consuming here with 57 predictors, # i'm only going to ask you to examine several of them, to give you # a sense of how this process goes # we want to make numerical and graphical descriptive summaries # (1) of all of the predictors univariately and (2) of the # bivariate relationships between each of the predictors and the outcome # the built-in function 'summary' is a good place to start # please examine the file # 'spam-data-analysis-variable-summary.txt' # on the course web page, which includes a summary of all 59 variables # obtained with the command summary( raw.data.modeling ) ################## the first block of code ends here ######################## ################## the second block of code begins here ##################### # to see all 57 histograms of the predictor variables below, # you may need to adjust the choice of 'n.histogram.matrix' # and fiddle with the rest of the histogram code, # if your plotting window is too small, or you could try # resizing your plotting window to make it bigger # now run only plotting code block 1 and examine the plot ##### plotting code block 1 begins on this line ############################# n.histogram.matrix <- 5 par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in 1:n.histogram.matrix^2 ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 1 ends on this line ############################### # after examining the first set of plots, run plotting code block 2 ##### plotting code block 2 begins on this line ############################# par( mfrow = c( n.histogram.matrix, n.histogram.matrix ) ) for ( i in ( n.histogram.matrix^2 + 1 ):( 2 * n.histogram.matrix^2 ) ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 2 ends on this line ############################## # and after examining the second set of plots, run plotting code block 3 ##### plotting code block 3 begins on this line ############################ par( mfrow = c( 3, 3 ) ) for ( i in ( 2 * n.histogram.matrix^2 + 1 ):( ncol( raw.data ) - 2 ) ) { hist( raw.data.modeling[ , i + 2 ], main = '', prob = T, xlab = names( raw.data )[ i + 2 ] ) } par( mfrow = c( 1, 1 ) ) ##### plotting code block 3 ends on this line ############################# ################## the second block of code ends here ##################### ################## the third block of code begins here #################### k <- 57 correlation.vector <- rep( NA, k - 1 ) for ( i in 1:k ) { correlation.vector[ i ] <- cor( raw.data.modeling$spam, raw.data.modeling[ , i + 2 ] ) } correlation.table <- cbind( names( raw.data.modeling[ , 3:( k + 2 ) ] ), signif( correlation.vector, 4 ) ) correlation.table[ order( abs( correlation.vector ), decreasing = T ), ] # [,1] [,2] # [1,] "your" "0.4012" # [2,] "remove" "0.3421" # [3,] "free" "0.3381" # [4,] "X000" "0.3369" # [5,] "ch..4" "0.3271" # [6,] "you" "0.2778" # [7,] "business" "0.2724" # [8,] "hp" "-0.2646" # [9,] "receive" "0.2467" # [10,] "over" "0.2427" # # . # . # . # # [48,] "ch..2" "-0.06692" # [49,] "ch..5" "0.06583" # [50,] "report" "0.0648" # [51,] "direct" "-0.06262" # [52,] "X3d" "0.05758" # [53,] "ch." "-0.05623" # [54,] "table" "-0.04498" # [55,] "address" "-0.04102" # [56,] "parts" "-0.03357" # [57,] "will" "0.007804" ################## the third block of code ends here ###################### ################## the fourth block of code begins here ################### ##### run this first sub-block, stop and examine the output ############### # long-right-hand-tailed variables that (a) take on only positive values # and (b) have a mode to the right of 0 can sometimes be transformed # to Normality by taking logs: if the mode of such a variable is positive, # the log transform may well work; if not, the transformed variable # will have a spike in the left tail, with the rest of the transformed # values perhaps looking Normal # this is an instance of method (I) in the problem statement # for finding extra predictive signal # let's look at the variable 'make' with( raw.data.modeling, hist( make, breaks = 100, prob = T, main = '' ) ) with( raw.data.modeling, sum( make == 0 ) / n ) # [1] 0.5090198 # this shows that the mode of 'make' is 0 # if we were to try to compute log( make ), 50.9% of the values # would be ( - infinity ) with( raw.data.modeling, min( make[ make > 0 ] ) ) # [1] 0.01 # a standard thing to try with a variable such as 'make' # is the transformation # x -> log( x + C ), # where C is the smallest positive value x takes on log.make.modeling <- log( raw.data.modeling$make + 0.01 ) hist( log.make.modeling, breaks = 100, prob = T, main = '', xlab = 'log( make + 0.01 )' ) # we see the anticipated shape # a good bit of the predictive signal from such a variable x # can be extracted by (a) making an indicator variable that's 1 # when x is 0 and (b) predicting y from both log( x + C ) and # the indicator make.is.0.modeling <- ifelse( raw.data.modeling$make == 0, 1, 0 ) tab.sum( make.is.0.modeling, raw.data.modeling$spam ) # make.is.0.modeling n mean sd # [1,] 0 723 0.593361 0.4915464 # [2,] 1 2342 0.3368915 0.4727484 # [3,] "Total" 3065 0.3973899 0.4894378 # you can see that the indicator variable for ( make = 0 ) # by itself has a lot of predictive signal # here are the results of a logistic regression of 'spam' # just on 'make' (note that it's always good form to store # the results of the regression fitting in an object): logistic.regression.of.spam.on.make <- glm( raw.data.modeling$spam ~ raw.data.modeling$make, family = binomial( link = 'logit' ) ) summary( logistic.regression.of.spam.on.make ) # Call: # glm(formula = raw.data.modeling$spam ~ raw.data.modeling$make, # family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -2.4152 -0.9729 -0.9729 1.3967 1.3967 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -0.50213 0.03967 -12.66 < 2e-16 *** # raw.data.modeling$make 0.77487 0.13201 5.87 4.36e-09 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 4078 on 3063 degrees of freedom # AIC: 4082 # Number of Fisher Scoring iterations: 4 # there are logistic regression situations in which maximum-likelihood (ML) # fitting fails, because the log-likelihood function goes to + infinity # when one or more of the estimated beta coefficients tends to # infinity in absolute value; this may occur when some of the p.hat # values from the ML fit are identically 1 or 0 (or close enough to 1 or 0 # to be essentially 1 or 0 in practice), because # logit( 1 ) = + infinity and logit( 0 ) = - infinity # glm tries to cope with this by enforcing a condition of the form # - boundary < beta.hat < + boundary # where 'boundary' is a value that represents 'essentially infinite' # this means that we always need to follow a call to glm by # asking it two questions: do you think that the Fisher scoring algorithm # converged? and did it converge to a vector of beta.hat values # in which one or more of the beta.hats touched the boundary? # having previously stored the results of the glm fitting in an object, # (here 'logistic.regression.of.spam.on.make'), this can be done # with the following command: print( paste( 'converged?', logistic.regression.of.spam.on.make$converged, 'boundary?', logistic.regression.of.spam.on.make$boundary ) ) # [1] "converged? TRUE boundary? FALSE" # this means that everything is ok: convergence occurred, # and not to the boundary ############# the first sub-block ends here ################################ # when you've absorbed the output of the first sub-block, ##### run this second sub-block, stop and examine the output ############### # and here are the results of a logistic regression of 'spam' # on log( 'make' + 0.01 ) and 'make.is.0': logistic.regression.of.spam.on.log.make.and.make.is.0 <- glm( raw.data.modeling$spam ~ log.make.modeling + make.is.0.modeling, family = binomial( link = 'logit' ) ) summary( logistic.regression.of.spam.on.log.make.and.make.is.0 ) # Call: # glm(formula = raw.data.modeling$spam ~ log.make.modeling + # make.is.0.modeling, family = binomial(link = 'logit')) # Deviance Residuals: # Min 1Q Median 3Q Max # -1.5085 -0.9064 -0.9064 1.4751 1.4751 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) 0.54374 0.12340 4.406 1.05e-05 *** # log.make.modeling 0.14127 0.08209 1.721 0.0853 . # make.is.0.modeling -0.57034 0.29408 -1.939 0.0524 . # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119.0 on 3064 degrees of freedom # Residual deviance: 3966.8 on 3062 degrees of freedom # AIC: 3972.8 # Number of Fisher Scoring iterations: 4 print( paste( 'converged?', logistic.regression.of.spam.on.log.make.and.make.is.0$converged, 'boundary?', logistic.regression.of.spam.on.log.make.and.make.is.0$boundary ) ) # [1] "converged? TRUE boundary? FALSE" ################## the second sub-block ends here ########################### ################## the fourth block of code ends here ####################### ################## the fifth block of code begins here ###################### # a different method for extracting more predictive signal # that often works is to transform a continuous variable # such as 'make' into a set of categorical indicators, # by cutting it at (for example) its deciles (10th percentile, # 20th percentile, ..., 100th percentile) # this permits non-linear relationships between the predictor # and outcome to be explored without having to specify # the nature of the non-linearity # this is an instance of method (II) in the problem statement # for finding extra predictive signal # i've written a function called # 'logistic.regression.univariate.exploration' # that does this; i included it in the file 'spam-analysis-functions-R.txt' # that you 'source'd at the beginning of this analysis # the syntax of this function is as follows: # logistic.regression.univariate.exploration( x, y, # n.bootstrap.replicates, n.cutpoints ) # 'x' is the predictor; 'y' is the binary outcome; 'n.bootstrap.replicates' # is the desired number of lowess bootstrap curves in the plot; and # 'n.cutpoints' is the desired number of categories defined by the # quantiles of 'x' (if a single value of 'x' occurs with high frequency, # you will get fewer categories; for example, 'make' is exactly 0 # 76% of the time, so when i ask for 10 groups cut at the deciles of 'make' # only 3 groups are defined: (0th to 80th percentile), (80th to 90th), # (90th to 100th) # now run only exploration code block 1 and examine the output ##### exploration code block 1 begins on this line ######################## # the next command examines the predictor 'make' with 10 requested # quantile cuts logistic.regression.univariate.exploration( raw.data.modeling$make, raw.data.modeling$spam, 100, 10 ) ##### exploration code block 1 ends on this line ######################## # after studying the first set of outputs, run exploration code block 2 ##### exploration code block 2 begins on this line ######################## # the next command examines 'make' with 20 requested quantile cuts logistic.regression.univariate.exploration( raw.data.modeling$make, raw.data.modeling$spam, 100, 20 ) ##### exploration code block 2 ends on this line ######################## # after studying the second set of outputs, run exploration code block 3 ##### exploration code block 3 begins on this line ######################## # the next command examines the predictor 'your' with 10 requested # quantile cuts logistic.regression.univariate.exploration( raw.data.modeling$your, raw.data.modeling$spam, 100, 10 ) ##### exploration code block 3 ends on this line ######################## ################## the fifth block of code ends here ####################### # at this point, if we had enough time, we would transform all # of the predictors to extract maximam predictive signal # before fitting a version of the full model with all predictors included # but i've run out of time (my bad), so we're now just going to # fit the full model without taking advantage of the extra signal # from transformations, because something happens that's worth knowing about # when fitting GLMs to binary outcomes ################## the sixth block of code begins here ###################### # i'm now going to transform all of the predictors to have mean 0 # and sd 1 (this is a form of *standardization*); this puts all # of the beta coefficients (except the intercept, which doesn't need # standardizing) on the same scale, so that they become directly # comparable -- after this standardization, it's meaningful # to sort the predictors on the absolute value of their beta.hats # the next 3 commands create a new data frame in which # all of the predictors have been standardized raw.X.modeling <- raw.data.modeling[ , 3:( k + 2 ) ] scaled.X.modeling.data.frame <- as.data.frame( cbind( raw.data.modeling$spam, scale( raw.X.modeling ) ) ) names( scaled.X.modeling.data.frame )[ 1 ] <- 'spam' summary( full.model.all.x.scaled.modeling <- glm( spam ~ ., data = scaled.X.modeling.data.frame, family = binomial( link = 'logit' ) ) ) # Call: # glm(formula = spam ~ ., family = binomial(link = "logit"), # data = scaled.X.modeling.data.frame) # Deviance Residuals: # Min 1Q Median 3Q Max # -4.4162 -0.1754 0.0000 0.0959 3.6230 # Coefficients: # Estimate Std. Error z value Pr(>|z|) # (Intercept) -19.15048 2.91169 -6.577 4.80e-11 *** # make -0.05740 0.08437 -0.680 0.496312 # address -0.19613 0.11235 -1.746 0.080860 . # all 0.11289 0.07528 1.500 0.133705 # X3d 3.23705 2.52415 1.282 0.199690 # our 0.26636 0.07285 3.656 0.000256 *** # # . # . # . # # ch..4 1.74436 0.25784 6.765 1.33e-11 *** # ch..5 1.13535 0.77745 1.460 0.144193 # crl.ave 1.00481 0.80115 1.254 0.209766 # crl.long 2.72019 0.79835 3.407 0.000656 *** # crl.tot 0.35138 0.17477 2.011 0.044367 * # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # (Dispersion parameter for binomial family taken to be 1) # Null deviance: 4119 on 3064 degrees of freedom # Residual deviance: 1125 on 3007 degrees of freedom # AIC: 1241 # Number of Fisher Scoring iterations: 13 # Warning message: # glm.fit: fitted probabilities numerically 0 or 1 occurred # what this warning may be telling us is that the maximum-likelihood # fitting process has bumped up against the boundary for one or more # of the beta.hats; let's find out print( paste( 'converged?', full.model.all.x.scaled.modeling$converged, 'boundary?', full.model.all.x.scaled.modeling$boundary ) ) # [1] "converged? TRUE boundary? FALSE" # so we don't have a boundary problem; we *do* have several predictors # with massive beta.hats after standardization, but some of these # predictors also have massive beta.hat standard errors; # bayesian logistic regression with a regularizing prior will help # to stabilize maximum-likelihood estimation here # i've run out of time to demonstrate that to you here (my bad) ################## the sixth block of code ends here ###################### ################## the seventh block of code begins here ################## ##### run this first sub-block, stop and examine the output ############### # instead let's finish by (a) seeing how good the p.hats # *appear to be* internal to the modeling data set, # and (b) seeing how well the full model fit to the modeling data set # validates on the validation data set, by examining the p.hats # obtained by using the modeling-data beta.hats *on the validation data* p.hat.full.model.all.x.scaled.modeling <- predict( full.model.all.x.scaled.modeling, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.modeling[ scaled.X.modeling.data.frame$spam == 0 ] ) ) # [1] 0.09029248 0.86307864 0.77278616 ################### the first sub-block ends here ########################### # when you've absorbed the output of the first sub-block, ##### run this second sub-block, stop and examine the output ################ # now standardize the predictors in the validation subset raw.X.validation <- raw.data.validation[ , 3:( k + 2 ) ] scaled.X.validation.data.frame <- as.data.frame( cbind( raw.data.validation$spam, scale( raw.X.validation ) ) ) names( scaled.X.validation.data.frame )[ 1 ] <- 'spam' # and use the *modeling* beta.hats to create p.hats based on # the *validation* predictors: p.hat.full.model.all.x.scaled.validation <- predict( full.model.all.x.scaled.modeling, type = 'response', data = scaled.X.validation.data.frame ) par( mfrow = c( 2, 1 ) ) hist( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 0 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 1 ] ), mean( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 1 ] ) - mean( p.hat.full.model.all.x.scaled.validation[ scaled.X.validation.data.frame$spam == 0 ] ) ) # [1] 0.3526234 0.4679253 0.1153018 ################### the second sub-block ends here ########################## ################## the seventh block of code ends here ###################### # a regularized model with far fewer predictors should exhibit # much better out-of-sample predictive validation # much more to do here, but out of time (my bad)