##### preamble ###################################################################### # unbuffer the output # set Your working directory to the place where You downloaded # the data file 'kidney-function-data.txt' ; on my desktop this is # setwd( 'C:/DD/Teaching/AMS-206/Winter-2019/Kidney-Function' ) # load any required CRAN packages (previously installed): library( plotly ) ##### get the data set ############################################################## # read in the kidney function data with the built-in function 'read.csv' : raw.data <- read.csv( 'kidney-function-data.txt', header = T, sep = ' ' ) # this data set consists of age and kidney function measurements # on 157 healthy volunteers (potential kidney donors) # who came to the nephrology lab of dr. brian myers (stanford university) # in the early 2000s # 'read.csv' was designed to input comma-separated files, # such as those created by excel, but it also handles # data sets in which other forms of column separation are used # 'raw.data' is an important type of data structure in R called # a 'data frame'; this is a generalization of a matrix, # with rows for subjects under study and columns for variables # measured on the subjects, in which the variables can be # a mixture of data types (numerical, character, logical, ...) str( raw.data ) # 'data.frame': 157 obs. of 2 variables: # $ age : int 18 19 19 20 21 21 21 22 22 22 ... # $ kidney.function: num 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ... # as with 'list' objects, You access the variables in a data frame # with the syntax 'data.frame.name$variable.name' : ##### check for missing values ###################################################### sum( is.na( raw.data ) ) # [1] 0 # this checks all 157 * 2 values in the data frame for missingness; # 0 means no missing data in this data frame ##### extract the variables as separate vectors, and define the sample size ######### age <- raw.data$age kidney.function <- raw.data$kidney.function print( n <- length( age ) ) # [1] 157 ##### descriptive analysis: (1) one variable at a time ############################## c( summary( age ), sd( age ) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 18.00 25.00 31.00 36.39 43.00 88.00 15.9232 table( age ) # age # 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 41 42 43 44 45 46 47 48 51 53 54 # 1 2 1 3 6 11 12 10 7 12 4 5 4 7 7 6 4 2 5 1 2 1 3 1 1 1 2 1 1 1 1 1 4 # 55 56 57 60 61 62 63 65 68 69 70 71 72 73 74 80 82 88 # 1 1 2 3 2 3 2 2 1 1 1 1 1 2 1 1 1 1 # as usual, age is measured as an integer (if Your age is 29 years and 11 months, # You still say that Your age is 29) par( mfrow = c( 1, 2 ) ) hist( age, main = '', prob = T, col = 'red' ) plot( density( age ), main = '', col = 'blue', xlab = 'age', lwd = 2 ) par( mfrow = c( 1, 1 ) ) # the age distribution is bimodal, but that doesn't affect the validity # of the analyses below c( summary( kidney.function ), sd( kidney.function ) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. 
# -6.4500000000 -1.3500000000 -0.0500000000 -0.0001910828 1.5200000000 5.1900000000 2.1883422104 # kidney function has been standardized by subtracting off its mean, # so that the variable we're working with has mean essentially 0; # it's also been rounded to two digits after the decimal point # (that also won't affect the validity of the analyses below) par( mfrow = c( 1, 2 ) ) hist( kidney.function, main = '', prob = T, col = 'red', xlab = 'kidney function' ) plot( density( kidney.function ), main = '', col = 'blue', xlab = 'kidney function', lwd = 2 ) par( mfrow = c( 1, 1 ) ) # Normality of 'kidney.function' is not required for the validity of the analysis below; # it turns out that what *is* required is that the conditional distribution # of kidney function in each narrow vertical strip defined by age # should be fairly close to Gaussian qqnorm( kidney.function, main = 'Normal qqplot of kidney function', col = 'red', pch = 16 ) abline( mean( kidney.function ), sd( kidney.function ), lwd = 2, col = 'blue' ) # kidney function looks rather close to Gaussian ##### descriptive analysis: (2) two variables at a time ############################## # let's look at a scatter plot of the bivariate relationship # between age and kidney function: plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) # not surprisingly, kidney function tends to decline with age cor( age, kidney.function ) # [1] -0.5718387 # the correlation is (of course) negative and moderately strong ##### non-parametric smoother/prediction function ################################### # the next command invokes the built-in function 'lowess' # (LOcally-WEighted Scatter plot Smoother) to superimpose # a frequentist non-parametric estimate E.hat.lowess ( y | x ) # of the conditional expectation (mean) of kidney function (y) # given age (x), E ( y | x ) # here 'non-parametric' means 'not based on a parametric model' lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) # f governs the degree of smoothness of the lowess estimate # in a manner similar to the number of bars in a histogram; # smaller f values create less local bias but greater variance; # the default is 'f = 2 / 3' # 'f = 1 / 3' reveals a flat spot from age roughly 25 to 30 # in which kidney function appears not to decline; # this is diminished but still present with 'f = 2 / 3' plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( lowess( age, kidney.function ), lwd = 2, col = 'blue' ) # i don't know what the optimal value of f is; this question # could readily be investigated with simulation studies # in which we know what the right answer is for E ( y | x ), # but the answer may well depend on the form of the scatterplot # let's go back to 'f = 1 / 3' : plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) ##### parametric smoother/prediction function ###################################### # an obvious parametric model to try here is *simple linear regression*: # y[ i ] = beta.0 + beta.1 * x[ i ] + e[ i ], # in which e[ i ] ~IID N ( 0, sigma.e^2 ) # 'simple' here means that there's only one predictor variable x[ i ] # this can be implemented with the built-in function 'lm' (linear model): summary( linear.fit <- lm( kidney.function ~ age ) ) # Call: # lm(formula = kidney.function ~ age) # Residuals: # Min 1Q Median 3Q Max # -4.2018 -1.3451 0.0765 1.0719 4.5252 # Coefficients: # Estimate Std. 
Error t value Pr(>|t|) # (Intercept) 2.860027 0.359565 7.954 3.53e-13 *** # age -0.078588 0.009056 -8.678 5.18e-15 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 1.801 on 155 degrees of freedom # Multiple R-squared: 0.327, Adjusted R-squared: 0.3227 # F-statistic: 75.31 on 1 and 155 DF, p-value: 5.182e-15 # the object 'linear.fit' is complicated: str( linear.fit ) # List of 12 # $ coefficients : Named num [1:2] 2.86 -0.0786 # ..- attr(*, "names")= chr [1:2] "(Intercept)" "age" # $ residuals : Named num [1:157] 0.995 2.493 -2.587 1.012 -0.23 ... # ..- attr(*, "names")= chr [1:157] "1" "2" "3" "4" ... # $ effects : Named num [1:157] 0.00239 -15.62972 -2.84089 0.76896 -0.4612 ... # ..- attr(*, "names")= chr [1:157] "(Intercept)" "age" "" "" ... # $ rank : int 2 # $ fitted.values: Named num [1:157] 1.45 1.37 1.37 1.29 1.21 ... # ..- attr(*, "names")= chr [1:157] "1" "2" "3" "4" ... # $ assign : int [1:2] 0 1 # $ qr :List of 5 # ..$ qr : num [1:157, 1:2] -12.53 0.0798 0.0798 0.0798 0.0798 ... # .. ..- attr(*, "dimnames")=List of 2 # .. .. ..$ : chr [1:157] "1" "2" "3" "4" ... # .. .. ..$ : chr [1:2] "(Intercept)" "age" # .. ..- attr(*, "assign")= int [1:2] 0 1 # ..$ qraux: num [1:2] 1.08 1.08 # ..$ pivot: int [1:2] 1 2 # ..$ tol : num 1e-07 # ..$ rank : int 2 # ..- attr(*, "class")= chr "qr" # $ df.residual : int 155 # $ xlevels : Named list() # $ call : language lm(formula = kidney.function ~ age) # $ terms :Classes 'terms', 'formula' language kidney.function ~ age # .. ..- attr(*, "variables")= language list(kidney.function, age) # .. ..- attr(*, "factors")= int [1:2, 1] 0 1 # .. .. ..- attr(*, "dimnames")=List of 2 # .. .. .. ..$ : chr [1:2] "kidney.function" "age" # .. .. .. ..$ : chr "age" # .. ..- attr(*, "term.labels")= chr "age" # .. ..- attr(*, "order")= int 1 # .. ..- attr(*, "intercept")= int 1 # .. ..- attr(*, "response")= int 1 # .. ..- attr(*, ".Environment")= # .. ..- attr(*, "predvars")= language list(kidney.function, age) # .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric" # .. .. ..- attr(*, "names")= chr [1:2] "kidney.function" "age" # $ model :'data.frame': 157 obs. of 2 variables: # ..$ kidney.function: num [1:157] 2.44 3.86 -1.22 2.3 0.98 -0.5 2.74 -0.12 -1.21 0.99 ... # ..$ age : int [1:157] 18 19 19 20 21 21 21 22 22 22 ... # ..- attr(*, "terms")=Classes 'terms', 'formula' language kidney.function ~ age # .. .. ..- attr(*, "variables")= language list(kidney.function, age) # .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1 # .. .. .. ..- attr(*, "dimnames")=List of 2 # .. .. .. .. ..$ : chr [1:2] "kidney.function" "age" # .. .. .. .. ..$ : chr "age" # .. .. ..- attr(*, "term.labels")= chr "age" # .. .. ..- attr(*, "order")= int 1 # .. .. ..- attr(*, "intercept")= int 1 # .. .. ..- attr(*, "response")= int 1 # .. .. ..- attr(*, ".Environment")= # .. .. ..- attr(*, "predvars")= language list(kidney.function, age) # .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric" # .. .. .. 
..- attr(*, "names")= chr [1:2] "kidney.function" "age" # - attr(*, "class")= chr "lm" # the $fitted.values list element gives the predicted y values: # y.hat[ i ] = beta.0.hat + beta.1.hat * x[ i ] lines( age, linear.fit$fitted.values, lwd = 2, lty = 2, col = 'forestgreen' ) # You can see that lowess and the linear fit broadly agree on E.hat ( y | x ) # the next graph is a standard diagnostic called the *residual plot* # the residuals are defined to be the vertical discrepancies between # y[ i ] and y.hat[ i ] : # e.hat[ i ] = y[ i ] - y.hat[ i ] # if we plot the residuals (vertical scale) against the fitted values # (horizontal scale) and the linear fit is good, # (a) there should be no trend or pattern visible, and # (b) the variability of the residuals e.hat[ i ] in all narrow vertical strips # defined by y.hat[ i ] should be roughly constant; this is called the # assumption of *homoscedasticity* (greek for 'same scatter') plot( linear.fit$fitted.values, linear.fit$residuals, pch = 16, col = 'blue', xlab = 'predicted values', ylab = 'residuals' ) abline( h = 0, lwd = 2 ) lines( lowess( linear.fit$fitted.values, linear.fit$residuals ), lwd = 2, col = 'red' ) # the next code block defines 10 vertical strips so that all strips # have the same number of data points and plots the vertical strip boundaries n.vertical.strips <- 10 print( fitted.values.vertical.strip.boundaries <- quantile( linear.fit$fitted.values, prob = ( 0:n.vertical.strips ) / n.vertical.strips ) ) # 0% 10% 20% 30% 40% 50% 60% 70% # -4.05575378 -2.01245498 -1.27372387 -0.06346227 0.26660908 0.42378591 0.73813957 0.81672799 # 80% 90% 100% # 0.97390482 1.05249324 1.44543532 for ( i in 1:n.vertical.strips ) { abline( v = fitted.values.vertical.strip.boundaries[ i ], lwd = 2, lty = 3, col = 'magenta3' ) } # the next code block computes the SDs of the residuals in the vertical strips residual.standard.deviations <- rep( NA, n.vertical.strips ) for ( i in 1:n.vertical.strips ) { residual.standard.deviations[ i ] <- sd( linear.fit$residuals[ ( fitted.values.vertical.strip.boundaries[ i ] < linear.fit$fitted.values ) & ( linear.fit$fitted.values < fitted.values.vertical.strip.boundaries[ i + 1 ] ) ] ) } print( residual.standard.deviations ) # [1] 2.241392 2.164971 1.543255 1.506966 1.934142 1.470057 1.565862 2.334100 2.165670 1.987367 # the residual plot looks reasonably healthy: the linear fit is good and # the residuals are not far from homoscedasticity ##### inference with the non-parametric smoother/prediction function ################### # returning to the lowess fit, it produces E.hat.lowess ( y | x ) ; # how much uncertainty do we have in these predictions? 
# there is no closed-form algebraic expression for the repeated-sampling variance
# of the lowess estimates, so instead we answer this question with simulation,
# using the *bootstrap*, a super-important frequentist method that's actually
# a version of bayesian non-parametric inference in disguise

# the data set here can be visualized as a matrix with n = 157 rows and 2 columns
# ( y, x ) -- i like to think of y on the left and x on the right because
# models for y look like y = f ( x ) + e, in which y is on the left
# and x is on the right of the equals sign

# we're thinking of this data set as like a random sample from the population
# P = { healthy American people from 18 to 88 years old in the early 2000s }

# as usual, P is unknown to us apart from our sample of size n from it

# the bootstrap idea is to temporarily pretend that *the sample is the population*,
# and repeatedly *draw IID samples of size n from the sample*

# since in this case study the sample is a matrix of 157 rows and 2 columns,
# each bootstrap sample will also be a matrix of 157 rows and 2 columns

# since the sampling is IID, some of the rows of the original data set
# will not appear at all in any particular bootstrap sample, and other rows
# of the original data set may appear more than once (this is perfectly fine)

# You can then do something interesting to each bootstrap sample data set
# (e.g., fit a lowess curve to it) and base Your inferences on
# summaries of the resulting interesting things

##### example of bootstrap inference for the population correlation ##################

# here's a simple example of the bootstrap in action

# earlier we saw that the sample correlation between age and kidney function is

cor( age, kidney.function )

# [1] -0.5718387

# there's a built-in function that computes a 95% confidence interval (CI)
# for the population correlation rho, under the assumption of
# bivariate normality:

cor.test( age, kidney.function, method = 'pearson' )

# Pearson's product-moment correlation

# data:  age and kidney.function
# t = -8.6782, df = 155, p-value = 5.182e-15
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.6685901 -0.4560497
# sample estimates:
#        cor
# -0.5718387

# but we can easily get a non-parametric 95% CI with the bootstrap:

n.bootstrap.replicates <- 10000
seed <- 4110739

rho.hat.star <- rep( NA, n.bootstrap.replicates )

set.seed( seed )

system.time( {

  for ( i in 1:n.bootstrap.replicates ) {

    bootstrap.indices <- sample( 1:n, n, replace = T )
    bootstrap.data.set <- raw.data[ bootstrap.indices, ]
    rho.hat.star[ i ] <- cor( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ] )

  }

} )

#   user  system elapsed
#    1.9     0.0     1.9

print( CI.limits <- quantile( rho.hat.star, prob = c( 0.025, 0.975 ) ) )

#       2.5%      97.5%
# -0.6759057 -0.4431610

# this is called a *percentile-method* bootstrap CI for rho

# personally, i would trust the bootstrap interval here
# over the parametric interval, because the distribution in this case study
# is *not* bivariate normal

# what the bootstrap is secretly doing is sampling from the posterior
# for the bivariate CDF using a low-information-content bayesian
# nonparametric prior; the bootstrap approximation to the posterior
# distribution for rho can then be visualized simply by making a histogram
# of the rho.hat.star values:

hist( rho.hat.star, breaks = 50, prob = T, xlab = 'rho', col = 'red',
  main = 'Bootstrap approximate posterior distribution for rho' )

segments( CI.limits[ 1 ], 0, CI.limits[ 1 ], 2, lwd = 2, lty = 2, col = 'blue' )
segments( CI.limits[ 2 ], 0, CI.limits[ 2
], 2, lwd = 2, lty = 2, col = 'blue' ) print( rho.hat.bootstrap <- mean( rho.hat.star ) ) # [1] -0.5688725 abline( v = mean( rho.hat.bootstrap ), lwd = 3, lty = 1, col = 'black' ) ##### bootstrapping the lowess curves ############################################### # we can do the same thing with lowess to get uncertainty bands for # its estimate of E ( y | x ) : plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) n.bootstrap.replicates <- 1000 seed <- 3201991 set.seed( seed ) for ( i in 1:n.bootstrap.replicates ) { bootstrap.indices <- sample( 1:n, n, replace = T ) bootstrap.data.set <- raw.data[ bootstrap.indices, ] lines( lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], f = 1 / 3 ), lwd = 1, lty = 3 ) } lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' ) # You can see, as is reasonable, that we have more uncertainty about E ( y | x ) # at the left and right edges of the data (i.e., as x gets farther away from x.bar) ##### inference in the simple linear regression model ################################## # there are several ways to do this: first i'll work with # our usual 'optim'-based method # log likelihood function for general use (note the inclusion # of the constant term, which is important in model comparison) linear.model.log.likelihood <- function( beta.0, beta.1, sigma.e, x, y ) { n <- length( x ) ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma.e ) - sum( ( y - ( beta.0 + beta.1 * x ) )^2 ) / ( 2 * sigma.e^2 ) return( ll ) } # log likelihood function for use in 'optim' linear.model.log.likelihood.for.optim <- function( theta.linear.model, x, y ) { beta.0 <- theta.linear.model[ 1 ] beta.1 <- theta.linear.model[ 2 ] sigma.e <- theta.linear.model[ 3 ] n <- length( x ) ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma.e ) - sum( ( y - ( beta.0 + beta.1 * x ) )^2 ) / ( 2 * sigma.e^2 ) return( ll ) } theta.linear.model.initial.values <- c( 0, 0, 1 ) print( linear.model.maximum.likelihood.results <- optim( theta.linear.model.initial.values, linear.model.log.likelihood.for.optim, x = age, y = kidney.function, hessian = T, control = list( fnscale = -1 ) ) ) # $par # [1] 2.85982176 -0.07858272 1.78970214 # $value # [1] -314.1387 # $counts # function gradient # 150 NA # $convergence # [1] 0 # $message # NULL # $hessian # [,1] [,2] [,3] # [1,] -4.901603e+01 -1.783934e+03 1.254961e-04 # [2,] -1.783934e+03 -7.727487e+04 8.318036e-02 # [3,] 1.254961e-04 8.318036e-02 -9.800163e+01 print( linear.model.maximum.likelihood.covariance.matrix <- solve( - linear.model.maximum.likelihood.results$hessian ) ) # [,1] [,2] [,3] # [1,] 1.276664e-01 -2.947250e-03 -2.338040e-06 # [2,] -2.947250e-03 8.097975e-05 6.495868e-08 # [3,] -2.338040e-06 6.495868e-08 1.020391e-02 print( linear.model.maximum.likelihood.estimated.standard.errors <- sqrt( diag( linear.model.maximum.likelihood.covariance.matrix ) ) ) # [1] 0.357304321 0.008998875 0.101014416 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 2.860027 0.359565 7.954 3.53e-13 *** # age -0.078588 0.009056 -8.678 5.18e-15 *** # --- # Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 1.801 on 155 degrees of freedom # You can see that the 'optim' results match the output of the 'lm' function # closely ##### BIC for the linear model ################################################## k.linear.model <- 3 print( BIC.linear.model <- - 2 * linear.model.maximum.likelihood.results$value + k.linear.model * log( n ) ) # [1] 643.4461 ##### matrix approach to obtaining the MLEs ##################################### X <- as.matrix( cbind( rep( 1, n ), age ) ) y <- kidney.function print( beta.hat <- solve( t( X ) %*% X ) %*% ( t( X ) %*% y ) ) # [,1] # 2.86002680 # age -0.07858842 print( sigma.e.hat.mle <- linear.model.maximum.likelihood.results$par[ 3 ] ) # [1] 1.789702 print( beta.hat.covariance.matrix <- sigma.e.hat.mle^2 * solve( t( X ) %*% X ) ) # age # 0.12766638 -2.947250e-03 # age -0.00294725 8.097975e-05 se.hat.linear.regression.y.hat <- sqrt( beta.hat.covariance.matrix[ 1, 1 ] + age^2 * beta.hat.covariance.matrix[ 2, 2 ] + 2 * age * beta.hat.covariance.matrix[ 1, 2 ] ) alpha <- 0.005 plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( age, linear.fit$fitted.values - qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 3, lwd = 2, col = 'blue' ) lines( age, linear.fit$fitted.values + qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 3, lwd = 2, col = 'blue' ) # now let's compare the parametric and nonparametric uncertainty bands # for E ( y | x ) par( mfrow = c( 2, 1 ) ) plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) n.bootstrap.replicates <- 1000 bootstrap.lowess.predictions <- matrix( NA, n.bootstrap.replicates, n ) seed <- 3201991 set.seed( seed ) for ( i in 1:n.bootstrap.replicates ) { bootstrap.indices <- sample( 1:n, n, replace = T ) bootstrap.data.set <- raw.data[ bootstrap.indices, ] bootstrap.lowess.predictions[ i, ] <- lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], 1 / 3 )$y lines( lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], 1 / 3 ), lwd = 1, lty = 3 ) } lines( lowess( age, kidney.function, 1 / 3 ), lwd = 2, col = 'blue' ) plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 ) lines( age, linear.fit$fitted.values, lwd = 2, lty = 1, col = 'blue' ) lines( age, linear.fit$fitted.values - qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 2, lwd = 2, col = 'black' ) lines( age, linear.fit$fitted.values + qnorm( 1 - alpha / 2 ) * se.hat.linear.regression.y.hat, lty = 2, lwd = 2, col = 'black' ) par( mfrow = c( 1, 1 ) ) # You can see that avoiding the parametric linear modeling assumption # comes with a price in increased predictive uncertainty; # this is a typical finding when comparing parametric and nonparametric modeling # which would You rather have: nonparametric uncertainty bands # with guaranteed validity that are wider, or parametric uncertainty bands # with uncertain validity that are narrower? 
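# the dotted bootstrap curves above give a visual sense of the uncertainty in
# E.hat.lowess ( y | x ); if explicit pointwise uncertainty bands are wanted,
# one option -- sketched here as an addition to the original analysis, with the
# object names 'age.grid' and 'bootstrap.lowess.on.grid' chosen just for
# illustration -- is to interpolate each bootstrap lowess curve onto a common
# age grid with the built-in function 'approx' and then take pointwise 2.5% and
# 97.5% quantiles; the matrix 'bootstrap.lowess.predictions' stored above can't
# be used directly for this, because each bootstrap curve is evaluated at its
# own set of age values

age.grid <- seq( min( age ), max( age ), length = 101 )

bootstrap.lowess.on.grid <- matrix( NA, n.bootstrap.replicates, length( age.grid ) )

set.seed( seed )

for ( i in 1:n.bootstrap.replicates ) {

  bootstrap.indices <- sample( 1:n, n, replace = T )
  bootstrap.data.set <- raw.data[ bootstrap.indices, ]

  bootstrap.lowess.curve <- lowess( bootstrap.data.set[ , 1 ], bootstrap.data.set[ , 2 ], f = 1 / 3 )

  bootstrap.lowess.on.grid[ i, ] <- approx( bootstrap.lowess.curve$x, bootstrap.lowess.curve$y,
    xout = age.grid, rule = 2, ties = mean )$y

}

lowess.band.lower <- apply( bootstrap.lowess.on.grid, 2, quantile, prob = 0.025 )
lowess.band.upper <- apply( bootstrap.lowess.on.grid, 2, quantile, prob = 0.975 )

plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 )
lines( lowess( age, kidney.function, f = 1 / 3 ), lwd = 2, col = 'blue' )
lines( age.grid, lowess.band.lower, lty = 2, lwd = 2, col = 'black' )
lines( age.grid, lowess.band.upper, lty = 2, lwd = 2, col = 'black' )

# as with the spaghetti plot, these approximate bands widen near the left and
# right edges of the age range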
##### visualization of the likelihood function in the simple linear regression model ######

n.grid <- 500

beta.0.hat.mle <- linear.model.maximum.likelihood.results$par[ 1 ]
se.hat.beta.0.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 1 ]

beta.0.grid <- seq( beta.0.hat.mle - 4 * se.hat.beta.0.hat.mle,
  beta.0.hat.mle + 4 * se.hat.beta.0.hat.mle, length = n.grid )

beta.1.hat.mle <- linear.model.maximum.likelihood.results$par[ 2 ]
se.hat.beta.1.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 2 ]

beta.1.grid <- seq( beta.1.hat.mle - 4 * se.hat.beta.1.hat.mle,
  beta.1.hat.mle + 4 * se.hat.beta.1.hat.mle, length = n.grid )

ll.grid.for.beta.0.and.beta.1 <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.0.and.beta.1[ i, j ] <- linear.model.log.likelihood( beta.0.grid[ i ],
      beta.1.grid[ j ], sigma.e.hat.mle, age, kidney.function )

  }

}

ll.grid.for.beta.0.and.beta.1.transpose <- t( ll.grid.for.beta.0.and.beta.1 )

interactive.likelihood.for.beta.0.and.beta.1.perspective.plot <- plot_ly( x = beta.0.grid,
  y = beta.1.grid, z = exp( ll.grid.for.beta.0.and.beta.1.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.0 and beta.1',
    scene = list( xaxis = list( title = 'beta.0' ), yaxis = list( title = 'beta.1' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.0.and.beta.1.perspective.plot

#####################################################################################

se.hat.sigma.hat.mle <- linear.model.maximum.likelihood.estimated.standard.errors[ 3 ]

sigma.e.grid <- seq( sigma.e.hat.mle - 4 * se.hat.sigma.hat.mle,
  sigma.e.hat.mle + 4 * se.hat.sigma.hat.mle, length = n.grid )

ll.grid.for.beta.0.and.sigma.e <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.0.and.sigma.e[ i, j ] <- linear.model.log.likelihood( beta.0.grid[ i ],
      beta.1.hat.mle, sigma.e.grid[ j ], age, kidney.function )

  }

}

ll.grid.for.beta.0.and.sigma.e.transpose <- t( ll.grid.for.beta.0.and.sigma.e )

interactive.likelihood.for.beta.0.and.sigma.e.perspective.plot <- plot_ly( x = beta.0.grid,
  y = sigma.e.grid, z = exp( ll.grid.for.beta.0.and.sigma.e.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.0 and sigma.e',
    scene = list( xaxis = list( title = 'beta.0' ), yaxis = list( title = 'sigma.e' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.0.and.sigma.e.perspective.plot

#####################################################################################

ll.grid.for.beta.1.and.sigma.e <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {

  for ( j in 1:n.grid ) {

    ll.grid.for.beta.1.and.sigma.e[ i, j ] <- linear.model.log.likelihood( beta.0.hat.mle,
      beta.1.grid[ i ], sigma.e.grid[ j ], age, kidney.function )

  }

}

ll.grid.for.beta.1.and.sigma.e.transpose <- t( ll.grid.for.beta.1.and.sigma.e )
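# before rendering the interactive surface for ( beta.1, sigma.e ), a quick static
# look at the same grid can be helpful; this is a sketch added here (the base-R
# function 'contour' is not used elsewhere in this script):

contour( beta.1.grid, sigma.e.grid, exp( ll.grid.for.beta.1.and.sigma.e ),
  nlevels = 10, drawlabels = F, xlab = 'beta.1', ylab = 'sigma.e',
  main = 'Likelihood contours for beta.1 and sigma.e' )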
interactive.likelihood.for.beta.1.and.sigma.e.perspective.plot <- plot_ly( x = beta.1.grid,
  y = sigma.e.grid, z = exp( ll.grid.for.beta.1.and.sigma.e.transpose ) ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of likelihood function for beta.1 and sigma.e',
    scene = list( xaxis = list( title = 'beta.1' ), yaxis = list( title = 'sigma.e' ),
      zaxis = list( title = 'Likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Likelihood' )

interactive.likelihood.for.beta.1.and.sigma.e.perspective.plot

###### is the linear fit statistically significantly better than the constant fit? ######

# let's compare the linear model
#   y[ i ] = beta.0 + beta.1 * x[ i ] + e[ i ], e[ i ] ~IID N ( 0, sigma.e^2 )
# with the constant model
#   y[ i ] = mu + epsilon[ i ], epsilon[ i ] ~IID N ( 0, sigma^2 )

# one frequentist approach would build a 95% confidence interval
# for beta.1 in the linear model and see if it includes 0

# recall these results earlier from 'lm':

# Call:
# lm(formula = kidney.function ~ age)

# Residuals:
#     Min      1Q  Median      3Q     Max
# -4.2018 -1.3451  0.0765  1.0719  4.5252

# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept)  2.860027   0.359565   7.954 3.53e-13 ***
# age         -0.078588   0.009056  -8.678 5.18e-15 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 1.801 on 155 degrees of freedom
# Multiple R-squared:  0.327, Adjusted R-squared:  0.3227
# F-statistic: 75.31 on 1 and 155 DF,  p-value: 5.182e-15

# the 95% CI would run from -0.078588 - 1.96 * 0.009056 =. -0.096
# to -0.078588 + 1.96 * 0.009056 =. -0.061; this interval
# emphatically doesn't include 0, so the linear model
# is statistically significantly better than the constant model

# one bayesian approach would use BIC to choose between the models

constant.model.log.likelihood <- function( mu, sigma, x, y ) {

  n <- length( x )

  ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma ) - sum( ( y - mu )^2 ) / ( 2 * sigma^2 )

  return( ll )

}

constant.model.log.likelihood.for.optim <- function( theta.constant.model, x, y ) {

  mu <- theta.constant.model[ 1 ]
  sigma <- theta.constant.model[ 2 ]

  n <- length( x )

  ll <- - ( n / 2 ) * log( 2 * pi ) - n * log( sigma ) - sum( ( y - mu )^2 ) / ( 2 * sigma^2 )

  return( ll )

}

theta.constant.model.initial.values <- c( 0, 1 )

print( constant.model.maximum.likelihood.results <- optim( theta.constant.model.initial.values,
  constant.model.log.likelihood.for.optim, x = age, y = kidney.function, hessian = T,
  control = list( fnscale = -1 ) ) )

# $par
# [1] -0.0004708477  2.1814794894

# $value
# [1] -345.2254

# $counts
# function gradient
#       57       NA

# $convergence
# [1] 0

# $message
# NULL

# $hessian
#               [,1]          [,2]
# [1,] -32.991144678  -0.008461924
# [2,]  -0.008461924 -65.971740099

k.constant.model <- 2

print( BIC.constant.model <- - 2 * constant.model.maximum.likelihood.results$value +
  k.constant.model * log( n ) )

# [1] 700.5633

# summary of BIC analysis; recall that good models have small BIC values:

#            |    model lack of fit     | model complexity  |
#   model    | - 2 * max.log.likelihood |  k   k * log( n ) |  BIC
#   ---------+--------------------------+-------------------+-------
#   constant |          690.5           |  2      10.11     | 700.6
#   linear   |          628.3           |  3      15.17     | 643.4

# the constant model wins by about 5 on the BIC scale in complexity
# but it loses by about 62 on the BIC scale in goodness of fit;
# the result is an emphatic preference by BIC for the linear model,
# which matches the frequentist answer here (with n = 157 and only 3
# unknown parameters, we're comfortably in bernstein-von-mises territory)
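# a quick cross-check, sketched here as an addition: the built-in functions 'logLik'
# and 'BIC' (from the stats package) applied to 'lm' fits use the same gaussian
# log likelihood maximized above (with the MLE of sigma) and the same penalty
# k * log( n ), counting sigma as a parameter, so the two values below should
# essentially reproduce the home-grown 700.6 and 643.4

BIC( lm( kidney.function ~ 1 ) )     # constant model
BIC( lm( kidney.function ~ age ) )   # linear model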
# a simpler way to compute the BIC values when comparing regression models
# is to use the built-in function 'step', which is also employed in take-home test 3:

null.model <- lm( kidney.function ~ 1, data = raw.data )
full.model <- lm( kidney.function ~ age, data = raw.data )

variable.selection.with.bic <- step( null.model, scope = list( lower = null.model,
  upper = full.model ), direction = 'forward', trace = 1, steps = 1000, k = log( n ) )

# Start:  AIC=249.96
# kidney.function ~ 1

#        Df Sum of Sq    RSS    AIC
# + age   1    244.29 502.77 192.84
# <none>              747.06 249.96

# Step:  AIC=192.84
# kidney.function ~ age

# the results look different, but the difference in BIC values
# between the two models with my home-grown method
#   ( 700.6 - 643.4 ) = 57.2
# and the corresponding difference from the function 'step'
#   ( 249.96 - 192.84 ) = 57.1
# is the same apart from rounding error

###### is the linear fit practically significantly better than the constant fit? ######

plot( age, kidney.function, ylab = 'kidney function', col = 'red', pch = 16 )
lines( age, linear.fit$fitted.values, lwd = 2, col = 'blue' )
abline( h = constant.model.maximum.likelihood.results$par[ 1 ], lwd = 2, lty = 2, col = 'black' )

# rmse = root mean squared error

print( linear.fit.rmse <- sqrt( mean( ( kidney.function - linear.fit$fitted.values )^2 ) ) )

# [1] 1.789515

print( constant.fit.rmse <- sd( kidney.function ) )

# [1] 2.188342

( constant.fit.rmse - linear.fit.rmse ) / linear.fit.rmse

# [1] 0.2228687

( linear.fit.rmse - constant.fit.rmse ) / constant.fit.rmse

# [1] -0.1822507

# You could either say that the typical amount by which the
# constant-fit predictions are wrong is 22% higher than
# the corresponding amount for the linear-fit predictions,
# or that the typical amount by which the linear-fit predictions
# are wrong is 18% lower than the corresponding amount
# for the constant-fit predictions (both statements are true)

( constant.fit.rmse - linear.fit.rmse ) / mean( c( constant.fit.rmse, linear.fit.rmse ) )

# [1] 0.2005235

# or You could just say that the predictions from the linear fit
# are 20% more accurate than those from the constant fit

# any way You cut it, the linear fit is practically significantly better
# than the constant fit

##### bayesian analysis of the multiple linear regression model #####################

# in the model (in matrix form)
#   y = X beta + e, e[ i ] ~IID N ( 0, sigma^2 ) ,
# there *is* a conjugate prior for the vector theta = ( sigma^2, beta )
# of unknowns, of length ( k + 1 ) if X is ( n by k )

# see the excellent notes by sudipto banerjee giving what he calls
# the 'gory details'

# the entire bayesian analysis is performed conditional on the matrix X,
# even when the x values for individual i are randomly sampled jointly
# with the y value for that individual; gelman et al. p. 354 gives a good
# explanation of why this is reasonable

# a useful low-information-content improper prior -- call it script-P --
# is obtained by putting a Uniform( - infinity, + infinity ) prior on
# ( log( sigma ), beta ); this prior leads to a proper posterior whenever
# n > k and X is of full rank, which will often be true

# see chapter 14 in the gelman et al. book for more details
# back on the scale of ( sigma^2, beta ) this prior is of the form
#   p ( sigma^2, beta | X, script-P, script-B ) = c / sigma^2

# the conditional posterior for beta given sigma^2 is multivariate normal
# with this prior:
#   ( beta | sigma^2, y, X, script-P, script-N, script-B ) ~ N_k ( beta.hat, sigma^2 V )

# here script-N is the Normality assumption for the 'errors' e,
# N_k is the k-dimensional multivariate normal distribution,
# beta.hat is the MLE vector, given by
#   beta.hat = ( X^T X )^( -1 ) X^T y (^T means transpose),
# and V = ( X^T X )^( -1 )

# the marginal posterior for sigma^2 with this prior turns out to be
# a member of the scaled-inverse-chi-squared family of distributions:
#   ( sigma^2 | y, X, script-P, script-N, script-B ) ~ scaled-inverse-chi-squared( n - k, s^2 ) ,
# in which s^2 = ( y - X beta.hat )^T ( y - X beta.hat ) / ( n - k ) is the usual
# residual variance estimate

# banerjee refers to this distribution as the Inverse-Gamma family,
# which is another name for it

# to sample from the posterior for ( sigma^2, beta ), You simply
# (1) make a sigma^2 draw from scaled-inverse-chi-squared( n - k, s^2 );
#     call this draw sigma.star^2; and
# (2) make a beta draw from the multivariate Normal distribution
#     with mean beta.hat and covariance matrix sigma.star^2 V

scaled.inverse.chisq.density <- function( theta, nu, theta.0 ) {

  # note that theta plays the role of sigma^2 in this function;
  # when used as a prior for theta, nu represents the prior sample size
  # and theta.0 = sigma.0^2 represents the prior estimate
  # of theta = sigma^2

  log.density <- ( nu / 2 ) * log( nu / 2 ) - lgamma( nu / 2 ) + ( nu / 2 ) * log( theta.0 ) -
    ( 1 + nu / 2 ) * log( theta ) - nu * theta.0 / ( 2 * theta )

  return( exp( log.density ) )

}

r.scaled.inverse.chi.squared <- function( M, nu.0, theta.0 ) {

  # if W ~ chi-squared( nu.0 ), then nu.0 * theta.0 / W ~
  # scaled-inverse-chi-squared( nu.0, theta.0 )

  return( nu.0 * theta.0 / rchisq( M, nu.0 ) )

}

regression.posterior.sampling <- function( y, X, M ) {

  n <- dim( X )[ 1 ]
  k <- dim( X )[ 2 ]

  V <- solve( t( X ) %*% X )                                # V = ( X^T X )^( -1 )
  beta.hat <- V %*% ( t( X ) %*% y )                        # the MLE vector
  s.squared <- sum( ( y - X %*% beta.hat )^2 ) / ( n - k )  # residual variance estimate s^2

  # column 1 of theta.star holds the sigma^2 draws;
  # columns 2, ..., ( k + 1 ) hold the beta draws

  theta.star <- matrix( NA, M, k + 1 )

  for ( m in 1:M ) {

    # step (1): draw sigma^2 from its marginal posterior

    sigma.squared.star <- r.scaled.inverse.chi.squared( 1, n - k, s.squared )

    # step (2): draw beta from N_k ( beta.hat, sigma.star^2 V ),
    # using a cholesky factor of sigma.star^2 V

    beta.star <- beta.hat + t( chol( sigma.squared.star * V ) ) %*% rnorm( k )

    theta.star[ m, ] <- c( sigma.squared.star, beta.star )

  }

  return( theta.star )

}
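# example use of 'regression.posterior.sampling', sketched here as an addition
# (the number of draws M and the seed below are arbitrary choices):

M <- 10000
set.seed( 4444 )

theta.star <- regression.posterior.sampling( kidney.function, X, M )

# posterior means, SDs and 95% intervals for ( sigma^2, beta.0, beta.1 ):

apply( theta.star, 2, mean )
apply( theta.star, 2, sd )
apply( theta.star, 2, quantile, prob = c( 0.025, 0.975 ) )

# with this low-information-content prior and n = 157, the posterior summaries
# for ( beta.0, beta.1 ) should land close to the maximum-likelihood estimates
# and standard errors from 'lm' and 'optim' above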