##### preamble ###################################################################### # unbuffer the output # set Your working directory to the place where You downloaded # the data file 'kidney-function-data.txt' ; on my desktop this is # setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Kidney-Function' ) # load any required CRAN packages (previously installed): library( plotly ) ##### get the data set ############################################################## # read in the kidney function data with the built-in function 'read.csv' : raw.data <- read.csv( 'kidney-function-data.txt', header = T, sep = ' ' ) # this data set consists of age and kidney function measurements # on 157 healthy volunteers (potential kidney donors) # who came to the nephrology lab of dr. brian myers (stanford university) # in the early 2000s # 'read.csv' was designed to input comma-separated files, # such as those created by excel, but it also handles # data sets in which other forms of column separation are used # 'raw.data' is an important type of data structure in R called # a 'data frame'; this is a generalization of a matrix, # with rows for subjects under study and columns for variables # measured on the subjects, in which the variables can be # a mixture of data types (numerical, character, logical, ...) str( raw.data ) # 'data.frame': 157 obs. of 2 variables: # $ age : int 18 19 19 20 21 21 21 22 22 22 ... # $ kidney.function: num 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ... # as with 'list' objects, You access the variables in a data frame # with the syntax 'data.frame.name$variable.name' : ##### check for missing values ###################################################### sum( is.na( raw.data ) ) # [1] 0 # this checks all 157 * 2 values in the data frame for missingness; # 0 means no missing data in this data frame ##### extract the variables as separate vectors, and define the sample size ######### age <- raw.data$age kidney.function <- raw.data$kidney.function print( n <- length( age ) ) # [1] 157 ##### descriptive analysis: (1) one variable at a time ############################## c( summary( age ), sd( age ) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 18.00 25.00 31.00 36.39 43.00 88.00 15.9232 table( age ) # age # 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 41 42 43 44 45 46 47 48 51 53 54 # 1 2 1 3 6 11 12 10 7 12 4 5 4 7 7 6 4 2 5 1 2 1 3 1 1 1 2 1 1 1 1 1 4 # 55 56 57 60 61 62 63 65 68 69 70 71 72 73 74 80 82 88 # 1 1 2 3 2 3 2 2 1 1 1 1 1 2 1 1 1 1 # as usual, age is measured as an integer (if Your age is 29 years and 11 months, # You still say that Your age is 29) par( mfrow = c( 1, 2 ) ) hist( age, main = '', prob = T, col = 'red' ) plot( density( age ), main = '', col = 'blue', xlab = 'age', lwd = 2 ) par( mfrow = c( 1, 1 ) ) # the age distribution is bimodal, but that doesn't affect the validity # of the analyses below c( summary( kidney.function ), sd( kidney.function ) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. 
# -6.4500000000 -1.3500000000 -0.0500000000 -0.0001910828 1.5200000000 5.1900000000 2.1883422104 # kidney function has been standardized by subtracting off its mean, # so that the variable we're working with has mean essentially 0; # it's also been rounded to two digits after the decimal point # (that also won't affect the validity of the analyses below) par( mfrow = c( 1, 2 ) ) hist( kidney.function, main = '', prob = T, col = 'red', xlab = 'kidney function' ) plot( density( kidney.function ), main = '', col = 'blue', xlab = 'kidney function', lwd = 2 ) par( mfrow = c( 1, 1 ) ) # Normality of 'kidney.function' is not required for the validity of the analysis below; # it turns out that what *is* required is that the conditional distribution # of kidney function in each narrow vertical strip defined by age # should be fairly close to Gaussian qqnorm( kidney.function, main = 'Normal qqplot of kidney function', col = 'red', pch = 16 ) abline( mean( kidney.function ), sd( kidney.function ), lwd = 2, col = 'blue' ) # kidney function looks rather close to Gaussian ##### descriptive analysis: (2) two variables at a time ############################## # let's look at a scatter plot of the bivariate relationship # between age and kidney function: plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) # not surprisingly, kidney function tends to decline with age cor( age, kidney.function ) # [1] -0.5718387 # the correlation is (of course) negative and moderately strong ##### non-parametric smoother/prediction function ################################### # the next command invokes the built-in function 'lowess' # (LOcally-WEighted Scatter plot Smoother) to superimpose # a frequentist non-parametric estimate E.hat.lowess ( y | x ) # of the conditional expectation (mean) of kidney function (y) # given age (x), E ( y | x ) # here 'non-parametric' means 'not based on a parametric model' lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) # f governs the degree of smoothness of the lowess estimate # in a manner similar to the number of bars in a histogram; # smaller f values create less local bias but greater variance; # the default is 'f = 2 / 3' # 'f = 1 / 3' reveals a flat spot from age roughly 25 to 30 # in which kidney function appears not to decline; # this is diminished but still present with 'f = 2 / 3' plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( lowess( age, kidney.function ), lwd = 2, col = 'blue' ) # i don't know what the optimal value of f is; this question # could readily be investigated with simulation studies # in which we know what the right answer is for E ( y | x ), # but the answer may well depend on the form of the scatterplot # let's go back to 'f = 1 / 3' : plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) ##### parametric smoother/prediction function ###################################### # an obvious parametric model to try here is *simple linear regression*: # y[ i ] = beta.0 + beta.1 * x[ i ] + e[ i ], # in which e[ i ] ~IID N ( 0, sigma.e^2 ) # 'simple' here means that there's only one predictor variable x[ i ] # this can be implemented with the built-in function 'lm' (linear model): summary( linear.fit <- lm( kidney.function ~ age ) ) # Call: # lm(formula = kidney.function ~ age) # Residuals: # Min 1Q Median 3Q Max # -4.2018 -1.3451 0.0765 1.0719 4.5252 # Coefficients: # Estimate Std. 
Error t value Pr(>|t|) # (Intercept) 2.860027 0.359565 7.954 3.53e-13 *** # age -0.078588 0.009056 -8.678 5.18e-15 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 1.801 on 155 degrees of freedom # Multiple R-squared: 0.327, Adjusted R-squared: 0.3227 # F-statistic: 75.31 on 1 and 155 DF, p-value: 5.182e-15 # the object 'linear.fit' is complicated: str( linear.fit ) # List of 12 # $ coefficients : Named num [1:2] 2.86 -0.0786 # ..- attr(*, "names")= chr [1:2] "(Intercept)" "age" # $ residuals : Named num [1:157] 0.995 2.493 -2.587 1.012 -0.23 ... # ..- attr(*, "names")= chr [1:157] "1" "2" "3" "4" ... # $ effects : Named num [1:157] 0.00239 -15.62972 -2.84089 0.76896 -0.4612 ... # ..- attr(*, "names")= chr [1:157] "(Intercept)" "age" "" "" ... # $ rank : int 2 # $ fitted.values: Named num [1:157] 1.45 1.37 1.37 1.29 1.21 ... # ..- attr(*, "names")= chr [1:157] "1" "2" "3" "4" ... # $ assign : int [1:2] 0 1 # $ qr :List of 5 # ..$ qr : num [1:157, 1:2] -12.53 0.0798 0.0798 0.0798 0.0798 ... # .. ..- attr(*, "dimnames")=List of 2 # .. .. ..$ : chr [1:157] "1" "2" "3" "4" ... # .. .. ..$ : chr [1:2] "(Intercept)" "age" # .. ..- attr(*, "assign")= int [1:2] 0 1 # ..$ qraux: num [1:2] 1.08 1.08 # ..$ pivot: int [1:2] 1 2 # ..$ tol : num 1e-07 # ..$ rank : int 2 # ..- attr(*, "class")= chr "qr" # $ df.residual : int 155 # $ xlevels : Named list() # $ call : language lm(formula = kidney.function ~ age) # $ terms :Classes 'terms', 'formula' language kidney.function ~ age # .. ..- attr(*, "variables")= language list(kidney.function, age) # .. ..- attr(*, "factors")= int [1:2, 1] 0 1 # .. .. ..- attr(*, "dimnames")=List of 2 # .. .. .. ..$ : chr [1:2] "kidney.function" "age" # .. .. .. ..$ : chr "age" # .. ..- attr(*, "term.labels")= chr "age" # .. ..- attr(*, "order")= int 1 # .. ..- attr(*, "intercept")= int 1 # .. ..- attr(*, "response")= int 1 # .. ..- attr(*, ".Environment")= # .. ..- attr(*, "predvars")= language list(kidney.function, age) # .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric" # .. .. ..- attr(*, "names")= chr [1:2] "kidney.function" "age" # $ model :'data.frame': 157 obs. of 2 variables: # ..$ kidney.function: num [1:157] 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ... # ..$ age : int [1:157] 18 19 19 20 21 21 21 22 22 22 ... # ..- attr(*, "terms")=Classes 'terms', 'formula' language kidney.function ~ age # .. .. ..- attr(*, "variables")= language list(kidney.function, age) # .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1 # .. .. .. ..- attr(*, "dimnames")=List of 2 # .. .. .. .. ..$ : chr [1:2] "kidney.function" "age" # .. .. .. .. ..$ : chr "age" # .. .. ..- attr(*, "term.labels")= chr "age" # .. .. ..- attr(*, "order")= int 1 # .. .. ..- attr(*, "intercept")= int 1 # .. .. ..- attr(*, "response")= int 1 # .. .. ..- attr(*, ".Environment")= # .. .. ..- attr(*, "predvars")= language list(kidney.function, age) # .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric" # .. .. .. 
..- attr(*, "names")= chr [1:2] "kidney.function" "age" # - attr(*, "class")= chr "lm" # the $fitted.values list element gives the predicted y values: # y.hat[ i ] = beta.0.hat + beta.1.hat * x[ i ] lines( age, linear.fit$fitted.values, lwd = 2, lty = 2, col = 'forestgreen' ) # You can see that lowess and the linear fit broadly agree on E.hat ( y | x ) # the next graph is a standard diagnostic called the *residual plot* # the residuals are defined to be the vertical discrepancies between # y[ i ] and y.hat[ i ] : # e.hat[ i ] = y[ i ] - y.hat[ i ] # if we plot the residuals (vertical scale) against the fitted values # (horizontal scale) and the linear fit is good, # (a) there should be no trend or pattern visible, and # (b) the variability of the residuals e.hat[ i ] in all narrow vertical strips # defined by y.hat[ i ] should be roughly constant; this is called the # assumption of *homoscedasticity* (greek for 'same scatter') plot( linear.fit$fitted.values, linear.fit$residuals, pch = 16, col = 'blue', xlab = 'predicted values', ylab = 'residuals' ) abline( h = 0, lwd = 2 ) lines( lowess( linear.fit$fitted.values, linear.fit$residuals ), lwd = 2, col = 'red' ) # the next code block defines 10 vertical strips so that all strips # have the same number of data points and plots the vertical strip boundaries n.vertical.strips <- 10 print( fitted.values.vertical.strip.boundaries <- quantile( linear.fit$fitted.values, prob = ( 0:n.vertical.strips ) / n.vertical.strips ) ) # 0% 10% 20% 30% 40% 50% 60% 70% # -4.05575378 -2.01245498 -1.27372387 -0.06346227 0.26660908 0.42378591 0.73813957 0.81672799 # 80% 90% 100% # 0.97390482 1.05249324 1.44543532 for ( i in 1:n.vertical.strips ) { abline( v = fitted.values.vertical.strip.boundaries[ i ], lwd = 2, lty = 3, col = 'magenta3' ) } # the next code block computes the SDs of the residuals in the vertical strips residual.standard.deviations <- rep( NA, n.vertical.strips ) for ( i in 1:n.vertical.strips ) { residual.standard.deviations[ i ] <- sd( linear.fit$residuals[ ( fitted.values.vertical.strip.boundaries[ i ] < linear.fit$fitted.values ) & ( linear.fit$fitted.values < fitted.values.vertical.strip.boundaries[ i + 1 ] ) ] ) } print( residual.standard.deviations ) # [1] 2.241392 2.164971 1.543255 1.506966 1.934142 1.470057 1.565862 2.334100 2.165670 1.987367 # the residual plot looks reasonably healthy: the linear fit is good and # the residuals are not far from homoscedasticity ##### inference with the non-parametric smoother/prediction function ################### # returning to the lowess fit, it produces E.hat.lowess ( y | x ) ; # how much uncertainty do we have in these predictions? 
# there is no closed-form algebraic expression for the repeated-sampling variance
# of the lowess estimates, so instead we answer this question with simulation,
# using the *bootstrap*, a super-important frequentist method that's actually
# a version of bayesian non-parametric inference in disguise

# the data set here can be visualized as a matrix with n = 157 rows and 2 columns
# ( y, x ) -- i like to think of y on the left and x on the right because
# models for y look like y = f ( x ) + e, in which y is on the left
# and x is on the right of the equals sign

# we're thinking of this data set as like a random sample from the population
# P = { healthy American people from 18 to 88 years old in the early 2000s }

# as usual, P is unknown to us apart from our sample of size n from it

# the bootstrap idea is to temporarily pretend that *the sample is the population*,
# and repeatedly *draw IID samples of size n from the sample*

# since in this case study the sample is a matrix of 157 rows and 2 columns,
# each bootstrap sample will also be a matrix of 157 rows and 2 columns

# since the sampling is IID, some of the rows of the original data set
# will not appear at all in any particular bootstrap sample, and other rows
# of the original data set may appear more than once (this is perfectly fine)

# You can then do something interesting to each bootstrap sample data set
# (e.g., fit a lowess curve to it) and base Your inferences on
# summaries of the resulting interesting things

##### example of bootstrap inference for the population correlation ##################

# here's a simple example of the bootstrap in action

# earlier we saw that the sample correlation between age and kidney function is

cor( age, kidney.function )

# [1] -0.5718387

# there's a built-in function that computes a 95% confidence interval (CI)
# for the population correlation rho, under the assumption of
# bivariate normality:

cor.test( age, kidney.function, method = 'pearson' )

# Pearson's product-moment correlation

# data:  age and kidney.function
# t = -8.6782, df = 155, p-value = 5.182e-15
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.6685901 -0.4560497
# sample estimates:
#        cor
# -0.5718387

# but we can easily get a non-parametric 95% CI with the bootstrap:

n.bootstrap.replicates <- 10000
seed <- 4110739

rho.hat.star <- rep( NA, n.bootstrap.replicates )

set.seed( seed )

system.time( {

  for ( i in 1:n.bootstrap.replicates ) {

    bootstrap.indices <- sample( 1:n, n, replace = T )
    bootstrap.data.set <- raw.data[ bootstrap.indices, ]
    rho.hat.star[ i ] <- cor( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ] )

  }

} )

#   user  system elapsed
#    1.9     0.0     1.9

print( CI.limits <- quantile( rho.hat.star, prob = c( 0.025, 0.975 ) ) )

#       2.5%      97.5%
# -0.6759057 -0.4431610

# this is called a *percentile-method* bootstrap CI for rho

# personally, i would trust the bootstrap interval here
# over the parametric interval, because the distribution in this case study
# is *not* bivariate normal

# what the bootstrap is secretly doing is sampling from the posterior
# for the bivariate CDF using a low-information-content bayesian
# nonparametric prior; the bootstrap approximation to the posterior
# distribution for rho can then be visualized simply by making a histogram
# of the rho.hat.star values:

hist( rho.hat.star, breaks = 50, prob = T, xlab = 'rho', col = 'red',
  main = 'Bootstrap approximate posterior distribution for rho' )

segments( CI.limits[ 1 ], 0, CI.limits[ 1 ], 2, lwd = 2, lty = 2, col = 'blue' )
segments( CI.limits[ 2 ], 0, CI.limits[ 2
], 2, lwd = 2, lty = 2, col = 'blue' ) print( rho.hat.bootstrap <- mean( rho.hat.star ) ) # [1] -0.5688725 abline( v = mean( rho.hat.bootstrap ), lwd = 3, lty = 1, col = 'black' ) ##### bootstrapping the lowess curves ############################################### # we can do the same thing with lowess to get uncertainty bands for # its estimate of E ( y | x ) : plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) n.bootstrap.replicates <- 1000 seed <- 3201991 set.seed( seed ) for ( i in 1:n.bootstrap.replicates ) { bootstrap.indices <- sample( 1:n, n, replace = T ) bootstrap.data.set <- raw.data[ bootstrap.indices, ] lines( lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], f = 1 / 3 ), lwd = 1, lty = 3 ) } lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) # You can see, as is reasonable, that we have more uncertainty about E ( y | x ) # at the left and right edges of the data (i.e., as x gets farther away from x.bar) ##### inference in the simple linear regression model ################################## # there are several ways to do this: first i'll work with # our usual 'optim'-based method # log likelihood function for general use (note the inclusion # of the constant term, which is important in model comparison) linear.model.log.likelihood <- function( beta.0, beta.1, sigma.e, x, y ) { n <- length( x ) ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma.e ) - sum( ( y - ( beta.0 + beta.1 * x ) )^2 ) / ( 2 * sigma.e^2 ) return( ll ) } # log likelihood function for use in 'optim' linear.model.log.likelihood.for.optim <- function( theta.linear.model, x, y ) { beta.0 <- theta.linear.model[ 1 ] beta.1 <- theta.linear.model[ 2 ] sigma.e <- theta.linear.model[ 3 ] n <- length( x ) ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma.e ) - sum( ( y - ( beta.0 + beta.1 * x ) )^2 ) / ( 2 * sigma.e^2 ) return( ll ) } theta.linear.model.initial.values <- c( 0, 0, 1 ) print( linear.model.maximum.likelihood.results <- optim( theta.linear.model.initial.values, linear.model.log.likelihood.for.optim, x = age, y = kidney.function, hessian = T, control = list( fnscale = -1 ) ) ) # $par # [1] 2.85982176 -0.07858272 1.78970214 # $value # [1] -314.1387 # $counts # function gradient # 150 NA # $convergence # [1] 0 # $message # NULL # $hessian # [,1] [,2] [,3] # [1,] -4.901603e+01 -1.783934e+03 1.254961e-04 # [2,] -1.783934e+03 -7.727487e+04 8.318036e-02 # [3,] 1.254961e-04 8.318036e-02 -9.800163e+01 print( linear.model.maximum.likelihood.covariance.matrix <- solve( - linear.model.maximum.likelihood.results$hessian ) ) # [,1] [,2] [,3] # [1,] 1.276664e-01 -2.947250e-03 -2.338040e-06 # [2,] -2.947250e-03 8.097975e-05 6.495868e-08 # [3,] -2.338040e-06 6.495868e-08 1.020391e-02 print( linear.model.maximum.likelihood.estimated.standard.errors <- sqrt( diag( linear.model.maximum.likelihood.covariance.matrix ) ) ) # [1] 0.357304321 0.008998875 0.101014416 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 2.860027 0.359565 7.954 3.53e-13 *** # age -0.078588 0.009056 -8.678 5.18e-15 *** # --- # Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 1.801 on 155 degrees of freedom # You can see that the 'optim' results match the output of the 'lm' function # closely ##### BIC for the linear model ################################################## k.linear.model <- 3 print( BIC.linear.model <- - 2 * linear.model.maximum.likelihood.results$value + k.linear.model * log( n ) ) # [1] 643.4461 ##### matrix approach to obtaining the MLEs ##################################### X <- as.matrix( cbind( rep( 1, n ), age ) ) y <- kidney.function print( beta.hat <- solve( t( X ) %*% X ) %*% ( t( X ) %*% y ) ) # [,1] # 2.86002680 # age -0.07858842 print( sigma.e.hat.mle <- linear.model.maximum.likelihood.results$par[ 3 ] ) # [1] 1.789702 print( beta.hat.covariance.matrix <- sigma.e.hat.mle^2 * solve( t( X ) %*% X ) ) # age # 0.12766638 -2.947250e-03 # age -0.00294725 8.097975e-05 se.hat.linear.regression.y.hat <- sqrt( beta.hat.covariance.matrix[ 1, 1 ] + age^2 * beta.hat.covariance.matrix[ 2, 2 ] + 2 * age * beta.hat.covariance.matrix[ 1, 2 ] ) alpha <- 0.005 plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( age, linear.fit$fitted.values - qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 3, lwd = 2, col = 'blue' ) lines( age, linear.fit$fitted.values + qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 3, lwd = 2, col = 'blue' ) # now let's compare the parametric and nonparametric uncertainty bands # for E ( y | x ) par( mfrow = c( 2, 1 ) ) plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) n.bootstrap.replicates <- 1000 bootstrap.lowess.predictions <- matrix( NA, n.bootstrap.replicates, n ) seed <- 3201991 set.seed( seed ) for ( i in 1:n.bootstrap.replicates ) { bootstrap.indices <- sample( 1:n, n, replace = T ) bootstrap.data.set <- raw.data[ bootstrap.indices, ] bootstrap.lowess.predictions[ i, ] <- lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], 1 / 3 )$y lines( lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], 1 / 3 ), lwd = 1, lty = 3 ) } lines( lowess( age, kidney.function, 1 / 3 ), lwd = 2, col = 'blue' ) plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( age, linear.fit$fitted.values, lwd = 2, lty = 1, col = 'blue' ) lines( age, linear.fit$fitted.values - qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 2, lwd = 2, col = 'black' ) lines( age, linear.fit$fitted.values + qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 2, lwd = 2, col = 'black' ) par( mfrow = c( 1, 1 ) ) # You can see that avoiding the parametric linear modeling assumption # comes with a price in increased predictive uncertainty; # this is a typical finding when comparing parametric and nonparametric modeling # which would You rather have: nonparametric uncertainty bands # with guaranteed validity that are wider, or parametric uncertainty bands # with uncertain validity that are narrower? 
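# the dotted bootstrap curves above give a visual sense of the uncertainty in
# E.hat.lowess ( y | x ); if explicit pointwise uncertainty bands are wanted,
# one option -- sketched here as an addition to the original analysis, with the
# object names 'age.grid' and 'bootstrap.lowess.on.grid' chosen just for
# illustration -- is to interpolate each bootstrap lowess curve onto a common
# age grid with the built-in function 'approx' and then take pointwise 2.5% and
# 97.5% quantiles; the matrix 'bootstrap.lowess.predictions' stored above can't
# be used directly for this, because each bootstrap curve is evaluated at its
# own set of age values

age.grid <- seq( min( age ), max( age ), length = 101 )

bootstrap.lowess.on.grid <- matrix( NA, n.bootstrap.replicates, length( age.grid ) )

set.seed( seed )

for ( i in 1:n.bootstrap.replicates ) {

  bootstrap.indices <- sample( 1:n, n, replace = T )
  bootstrap.data.set <- raw.data[ bootstrap.indices, ]

  bootstrap.lowess.curve <- lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], f = 1 / 3 )

  bootstrap.lowess.on.grid[ i, ] <- approx( bootstrap.lowess.curve$x, bootstrap.lowess.curve$y,
    xout = age.grid, rule = 2, ties = mean )$y

}

lowess.band.lower <- apply( bootstrap.lowess.on.grid, 2, quantile, prob = 0.025 )
lowess.band.upper <- apply( bootstrap.lowess.on.grid, 2, quantile, prob = 0.975 )

plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 )
lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' )
lines( age.grid, lowess.band.lower, lty = 2, lwd = 2, col = 'black' )
lines( age.grid, lowess.band.upper, lty = 2, lwd = 2, col = 'black' )

# as with the spaghetti plot, these approximate bands widen near the left and
# right edges of the age range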
##### visualization of the likelihood function in the simple linear regression model ######

n.grid <- 500

beta.0.hat.mle <- linear.model.maximum.likelihood.results$par[ 1 ]
se.hat.beta.0.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 1 ]

beta.0.grid <- seq( beta.0.hat.mle - 4 * se.hat.beta.0.hat.mle,
  beta.0.hat.mle + 4 * se.hat.beta.0.hat.mle, length = n.grid )

beta.1.hat.mle <- linear.model.maximum.likelihood.results$par[ 2 ]
se.hat.beta.1.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 2 ]

beta.1.grid <- seq( beta.1.hat.mle - 4 * se.hat.beta.1.hat.mle,
  beta.1.hat.mle + 4 * se.hat.beta.1.hat.mle, length = n.grid )

ll.grid.for.beta.0.and.beta.1 <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.0.and.beta.1[ i, j ] <- linear.model.log.likelihood( beta.0.grid[ i ],
      beta.1.grid[ j ], sigma.e.hat.mle, age, kidney.function )

  }

}

ll.grid.for.beta.0.and.beta.1.transpose <- t( ll.grid.for.beta.0.and.beta.1 )

interactive.likelihood.for.beta.0.and.beta.1.perspective.plot <- plot_ly( x = beta.0.grid,
  y = beta.1.grid, z = exp( ll.grid.for.beta.0.and.beta.1.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.0 and beta.1',
    scene = list( xaxis = list( title = 'beta.0' ), yaxis = list( title = 'beta.1' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.0.and.beta.1.perspective.plot

#####################################################################################

se.hat.sigma.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 3 ]

sigma.e.grid <- seq( sigma.e.hat.mle - 4 * se.hat.sigma.hat.mle,
  sigma.e.hat.mle + 4 * se.hat.sigma.hat.mle, length = n.grid )

ll.grid.for.beta.0.and.sigma.e <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.0.and.sigma.e[ i, j ] <- linear.model.log.likelihood( beta.0.grid[ i ],
      beta.1.hat.mle, sigma.e.grid[ j ], age, kidney.function )

  }

}

ll.grid.for.beta.0.and.sigma.e.transpose <- t( ll.grid.for.beta.0.and.sigma.e )

interactive.likelihood.for.beta.0.and.sigma.e.perspective.plot <- plot_ly( x = beta.0.grid,
  y = sigma.e.grid, z = exp( ll.grid.for.beta.0.and.sigma.e.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.0 and sigma.e',
    scene = list( xaxis = list( title = 'beta.0' ), yaxis = list( title = 'sigma.e' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.0.and.sigma.e.perspective.plot

#####################################################################################

ll.grid.for.beta.1.and.sigma.e <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.1.and.sigma.e[ i, j ] <- linear.model.log.likelihood( beta.0.hat.mle,
      beta.1.grid[ i ], sigma.e.grid[ j ], age, kidney.function )

  }

}

ll.grid.for.beta.1.and.sigma.e.transpose <- t( ll.grid.for.beta.1.and.sigma.e )
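# before rendering the interactive surface for ( beta.1, sigma.e ), a quick static
# look at the same grid can be helpful; this is a sketch added here (the base-R
# function 'contour' is not used elsewhere in this script):

contour( beta.1.grid, sigma.e.grid, exp( ll.grid.for.beta.1.and.sigma.e ),
  nlevels = 10, drawlabels = F, xlab = 'beta.1', ylab = 'sigma.e',
  main = 'Likelihood contours for beta.1 and sigma.e' )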
interactive.likelihood.for.beta.1.and.sigma.e.perspective.plot <- plot_ly( x = beta.1.grid,
  y = sigma.e.grid, z = exp( ll.grid.for.beta.1.and.sigma.e.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.1 and sigma.e',
    scene = list( xaxis = list( title = 'beta.1' ), yaxis = list( title = 'sigma.e' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.1.and.sigma.e.perspective.plot

###### is the linear fit statistically significantly better than the constant fit? ######

# let's compare the linear model
#   y[ i ] = beta.0 + beta.1 * x[ i ] + e[ i ], e[ i ] ~IID N ( 0, sigma.e^2 )
# with the constant model
#   y[ i ] = mu + epsilon[ i ], epsilon[ i ] ~IID N ( 0, sigma^2 )

# one frequentist approach would build a 95% confidence interval
# for beta.1 in the linear model and see if it includes 0

# recall these results earlier from 'lm':

# Call:
# lm(formula = kidney.function ~ age)

# Residuals:
#     Min      1Q  Median      3Q     Max
# -4.2018 -1.3451  0.0765  1.0719  4.5252

# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept)  2.860027   0.359565   7.954 3.53e-13 ***
# age         -0.078588   0.009056  -8.678 5.18e-15 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 1.801 on 155 degrees of freedom
# Multiple R-squared:  0.327, Adjusted R-squared:  0.3227
# F-statistic: 75.31 on 1 and 155 DF,  p-value: 5.182e-15

# the 95% CI would run from -0.078588 - 1.96 * 0.009056 =. -0.096
# to -0.078588 + 1.96 * 0.009056 =. -0.061; this interval
# emphatically doesn't include 0, so the linear model
# is statistically significantly better than the constant model

# one bayesian approach would use BIC to choose between the models

constant.model.log.likelihood <- function( mu, sigma, x, y ) {

  n <- length( x )

  ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma ) - sum( ( y - mu )^2 ) / ( 2 * sigma^2 )

  return( ll )

}

constant.model.log.likelihood.for.optim <- function( theta.constant.model, x, y ) {

  mu <- theta.constant.model[ 1 ]
  sigma <- theta.constant.model[ 2 ]

  n <- length( x )

  ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma ) - sum( ( y - mu )^2 ) / ( 2 * sigma^2 )

  return( ll )

}

theta.constant.model.initial.values <- c( 0, 1 )

print( constant.model.maximum.likelihood.results <- optim( theta.constant.model.initial.values,
  constant.model.log.likelihood.for.optim, x = age, y = kidney.function, hessian = T,
  control = list( fnscale = -1 ) ) )

# $par
# [1] -0.0004708477  2.1814794894

# $value
# [1] -345.2254

# $counts
# function gradient
#       57       NA

# $convergence
# [1] 0

# $message
# NULL

# $hessian
#               [,1]          [,2]
# [1,] -32.991144678  -0.008461924
# [2,]  -0.008461924 -65.971740099

k.constant.model <- 2

print( BIC.constant.model <- - 2 * constant.model.maximum.likelihood.results$value +
  k.constant.model * log( n ) )

# [1] 700.5633

# summary of BIC analysis; recall that good models have small BIC values:

#            |    model lack of fit     | model complexity  |
#   model    | - 2 * max.log.likelihood |  k   k * log( n ) |  BIC
#   ---------+--------------------------+-------------------+-------
#   constant |          690.5           |  2      10.11     | 700.6
#   linear   |          628.3           |  3      15.17     | 643.4

# the constant model wins by about 5 on the BIC scale in complexity
# but it loses by about 62 on the BIC scale in goodness of fit;
# the result is an emphatic preference by BIC for the linear model,
# which matches the frequentist answer here (with n = 157 and only 3
# unknown parameters, we're comfortably in bernstein-von-mises territory)
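# a quick cross-check, sketched here as an addition: the built-in functions 'logLik'
# and 'BIC' (from the stats package) applied to 'lm' fits use the same gaussian
# log likelihood maximized above (with the MLE of sigma) and the same penalty
# k * log( n ), counting sigma as a parameter, so the two values below should
# essentially reproduce the home-grown 700.6 and 643.4

BIC( lm( kidney.function ~ 1 ) )     # constant model
BIC( lm( kidney.function ~ age ) )   # linear model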
# a simpler way to compute the BIC values when comparing regression models
# is to use the built-in function 'step', which is also employed in take-home test 3:

null.model <- lm( kidney.function ~ 1, data = raw.data )
full.model <- lm( kidney.function ~ age, data = raw.data )

variable.selection.with.bic <- step( null.model, scope = list( lower = null.model,
  upper = full.model ), direction = 'forward', trace = 1, steps = 1000, k = log( n ) )

# Start:  AIC=249.96
# kidney.function ~ 1

#        Df Sum of Sq    RSS    AIC
# + age   1    244.29 502.77 192.84
# <none>              747.06 249.96

# Step:  AIC=192.84
# kidney.function ~ age

# the results look different, but the difference in BIC values
# between the two models with my home-grown method
#   ( 700.6 - 643.4 ) = 57.2
# and the corresponding difference from the function 'step'
#   ( 249.96 - 192.84 ) = 57.1
# is the same apart from rounding error

###### is the linear fit practically significantly better than the constant fit? ######

plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 )
lines( age, linear.fit$fitted.values, lwd = 2, col = 'blue' )
abline( h = constant.model.maximum.likelihood.results$par[ 1 ], lwd = 2, lty = 2, col = 'black' )

# rmse = root mean squared error

print( linear.fit.rmse <- sqrt( mean( ( kidney.function - linear.fit$fitted.values )^2 ) ) )

# [1] 1.789515

print( constant.fit.rmse <- sd( kidney.function ) )

# [1] 2.188342

( constant.fit.rmse - linear.fit.rmse ) / linear.fit.rmse

# [1] 0.2228687

( linear.fit.rmse - constant.fit.rmse ) / constant.fit.rmse

# [1] -0.1822507

# You could either say that the typical amount by which the
# constant-fit predictions are wrong is 22% higher than
# the corresponding amount for the linear-fit predictions,
# or that the typical amount by which the linear-fit predictions
# are wrong is 18% lower than the corresponding amount
# for the constant-fit predictions (both statements are true)

( constant.fit.rmse - linear.fit.rmse ) / mean( c( constant.fit.rmse, linear.fit.rmse ) )

# [1] 0.2005235

# or You could just say that the predictions from the linear fit
# are 20% more accurate than those from the constant fit

# any way You cut it, the linear fit is practically significantly better
# than the constant fit

##### bayesian analysis of the multiple linear regression model #####################

# in the model (in matrix form)
#   y = X beta + e, e[ i ] ~IID N ( 0, sigma^2 ) ,
# there *is* a conjugate prior for the vector theta = ( sigma^2, beta )
# of unknowns, of length ( k + 1 ) if X is ( n by k )

# see the excellent notes by sudipto banerjee giving what he calls
# the 'gory details'

# the entire bayesian analysis is performed conditional on the matrix X,
# even when the x values for individual i are randomly sampled jointly
# with the y value for that individual; gelman et al. p. 354 gives a good
# explanation of why this is reasonable

# a useful low-information-content improper prior -- call it script-P --
# is obtained by putting a Uniform( - infinity, + infinity ) prior on
# ( log( sigma ), beta ); this prior leads to a proper posterior whenever
# n > k and X is of full rank, which will often be true

# see chapter 14 in the gelman et al. book for more details
# back on the scale of ( sigma^2, beta ) this prior is of the form
#   p ( sigma^2, beta | X, script-P, script-B ) = c / sigma^2

# the conditional posterior for beta given sigma^2 is multivariate normal
# with this prior:
#   ( beta | sigma^2, y, X, script-P, script-N, script-B ) ~ N_k ( beta.hat, sigma^2 V )

# here script-N is the Normality assumption for the 'errors' e,
# N_k is the k-dimensional multivariate normal distribution,
# beta.hat is the MLE vector, given by
#   beta.hat = ( X^T X )^( -1 ) X^T y (^T means transpose),
# and V = ( X^T X )^( -1 )

# the marginal posterior for sigma^2 with this prior turns out to be
# a member of the scaled-inverse-chi-squared family of distributions:
#   ( sigma^2 | y, X, script-P, script-N, script-B ) ~ scaled-inverse-chi-squared( n - k, s^2 ) ,
# in which s^2 = ( y - X beta.hat )^T ( y - X beta.hat ) / ( n - k ) is the usual
# residual variance estimate

# banerjee refers to this distribution as the Inverse-Gamma family,
# which is another name for it

# to sample from the posterior for ( sigma^2, beta ), You simply
# (1) make a sigma^2 draw from scaled-inverse-chi-squared( n - k, s^2 );
#     call this draw sigma.star^2; and
# (2) make a beta draw from the multivariate Normal distribution
#     with mean beta.hat and covariance matrix sigma.star^2 V

scaled.inverse.chisq.density <- function( theta, nu, theta.0 ) {

  # note that theta plays the role of sigma^2 in this function;
  # when used as a prior for theta, nu represents the prior sample size
  # and theta.0 = sigma.0^2 represents the prior estimate
  # of theta = sigma^2

  log.density <- ( nu / 2 ) * log( nu / 2 ) - lgamma( nu / 2 ) + ( nu / 2 ) * log( theta.0 ) -
    ( 1 + nu / 2 ) * log( theta ) - nu * theta.0 / ( 2 * theta )

  return( exp( log.density ) )

}

r.scaled.inverse.chi.squared <- function( M, nu.0, theta.0 ) {

  # if W ~ chi-squared( nu.0 ), then nu.0 * theta.0 / W ~
  # scaled-inverse-chi-squared( nu.0, theta.0 )

  return( nu.0 * theta.0 / rchisq( M, nu.0 ) )

}

regression.posterior.sampling <- function( y, X, M ) {

  n <- dim( X )[ 1 ]
  k <- dim( X )[ 2 ]

  V <- solve( t( X ) %*% X )                                # V = ( X^T X )^( -1 )
  beta.hat <- V %*% ( t( X ) %*% y )                        # the MLE vector
  s.squared <- sum( ( y - X %*% beta.hat )^2 ) / ( n - k )  # residual variance estimate s^2

  # column 1 of theta.star holds the sigma^2 draws;
  # columns 2, ..., ( k + 1 ) hold the beta draws

  theta.star <- matrix( NA, M, k + 1 )

  for ( m in 1:M ) {

    # step (1): draw sigma^2 from its marginal posterior

    sigma.squared.star <- r.scaled.inverse.chi.squared( 1, n - k, s.squared )

    # step (2): draw beta from N_k ( beta.hat, sigma.star^2 V ),
    # using a cholesky factor of sigma.star^2 V

    beta.star <- beta.hat + t( chol( sigma.squared.star * V ) ) %*% rnorm( k )

    theta.star[ m, ] <- c( sigma.squared.star, beta.star )

  }

  return( theta.star )

}
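# example use of 'regression.posterior.sampling', sketched here as an addition
# (the number of draws M and the seed below are arbitrary choices):

M <- 10000
set.seed( 4444 )

theta.star <- regression.posterior.sampling( kidney.function, X, M )

# posterior means, SDs and 95% intervals for ( sigma^2, beta.0, beta.1 ):

apply( theta.star, 2, mean )
apply( theta.star, 2, sd )
apply( theta.star, 2, quantile, prob = c( 0.025, 0.975 ) )

# with this low-information-content prior and n = 157, the posterior summaries
# for ( beta.0, beta.1 ) should land close to the maximum-likelihood estimates
# and standard errors from 'lm' and 'optim' above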