# start up a new R session

# unbuffer the output

# set the working directory to the place You want the session stored,
# either with the 'setwd' built-in function or the pull-down menu;
# on my desktop at home this is
# setwd( 'C:/DD/Teaching/AMS-206/Winter-2019' )

# in what follows, R commands are followed by
# comments on what the commands do

CLT.simulation <- function( seed, M, n, theta.DG ) {

  # this begins the creation of the object 'CLT.simulation'
  # of mode and class 'function', with inputs to the function
  # ( seed, M, n, theta.DG )

  set.seed( seed )

  # this sets the random number seed, to make the results replicable

  repeated.sampling.data.set <- rep( NA, M )

  # this creates the output vector and fills it with NA values;
  # if any of its elements are still NA at the end of the calculations,
  # something went wrong

  for ( m in 1:M ) {

    # this begins looping through the elements of the output vector

    y.star <- rbinom( n, 1, theta.DG )

    # this creates a simulated IID sample of 0s and 1s of size n,
    # with each binary draw having theta.DG as its
    # probability of being a 1

    theta.hat.star <- mean( y.star )

    # this calculates the value of the estimator theta.hat
    # based on y.star

    repeated.sampling.data.set[ m ] <- theta.hat.star

    # this fills element m of the output vector with the latest
    # simulated value of theta.hat

  }

  # this ends the 'for' loop

  return( repeated.sampling.data.set )

  # this returns the vector of M simulated theta.hat values

}

# this ends the creation of 'CLT.simulation'

# now let's initialize all of the inputs to the function

seed <- 314159   # random number seed
M <- 10000       # size of the repeated-sampling data set
n <- 403         # sample size in each data set of 1s and 0s
s <- 72          # number of deaths at Dominican Hospital
                 # (DH) in CS 2

print( theta.DG <- s / n )   # mortality rate at DH in CS 2

# [1] 0.17866

system.time(

  # this calls the built-in function 'system.time',
  # which keeps track of the clock time consumed by
  # the commands in between '(' and ')'

  simulation.results.314159.10000.403.72 <-
    CLT.simulation( seed, M, n, theta.DG )

  # this calls the user-defined function 'CLT.simulation' created above

)

# this ends the 'system.time' command

#   user  system elapsed
#   0.26    0.05    0.33

# on my home desktop, which has 4 cores, each at 3.40GHz,
# hyperthreaded (so 8 total threads, but using only 1 thread at a time
# for this calculation), and 32GB of fast RAM (setting up R
# and performing the commands listed so far, the Windows OS used
# about 15GB of RAM, leaving 17GB free, so no memory problems
# with something as simple as this),
# 10,000 simulation replications only took about 0.3 seconds
# of clock time

# on my (slow) laptop (2 cores, 4 threads, 8GB RAM
# [about 2GB used]), 10,000 simulation replications
# took about 0.6 seconds
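# an aside (a supplementary sketch, not part of the original session):
# because the sum of n IID Bernoulli( theta.DG ) draws is exactly
# Binomial( n, theta.DG ), the entire 'for' loop above can be replaced
# by a single vectorized call to 'rbinom'; the name
# 'CLT.simulation.vectorized' is just an illustrative choice

CLT.simulation.vectorized <- function( seed, M, n, theta.DG ) {

  set.seed( seed )

  # draw M Binomial( n, theta.DG ) counts at once and divide each
  # by n, producing the M simulated theta.hat values in one step

  return( rbinom( M, n, theta.DG ) / n )

}

# the two versions consume the random-number stream differently,
# so their outputs won't match element for element, but both are
# draws from the same repeated-sampling distribution, and the
# vectorized version avoids the loop overhead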
sum( is.na( simulation.results.314159.10000.403.72 ) )

# [1] 0

# the 'is.na' built-in function call creates a 'logical' vector
# of length M in which each element is 'TRUE' if the corresponding
# element of 'simulation.results.314159.10000.403.72' is missing (NA)
# and 'FALSE' otherwise; 'sum' then coerces the 'logical' vector
# to a 'numeric' vector, with 1 for 'TRUE' and 0 for 'FALSE',
# and adds up the elements of the resulting vector of length M;
# if the sum is anything other than 0, You know something
# went wrong inside the function 'CLT.simulation'

hist( simulation.results.314159.10000.403.72, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

# this makes a histogram of the theta.hat values in the
# repeated-sampling data set, with 100 histogram bars;
# 'prob = T', which is a permitted abbreviation of
# 'probability = TRUE', puts the vertical axis of the histogram
# on the density scale and labels the y axis 'Density';
# 'xlab' creates a horizontal-axis label; and 'main'
# creates a title for the plot

E.theta.hat <- theta.DG

# this is the theoretical mean (expected value) of
# the random variable theta.hat

c( E.theta.hat, mean( simulation.results.314159.10000.403.72 ) )

# [1] 0.1786600 0.1787814

# this compares the theoretical and simulation means;
# they agree out to about 0.0001

SE.theta.hat <- sqrt( theta.DG * ( 1 - theta.DG ) / n )

# this is the theoretical SD (standard error) of
# the random variable theta.hat

c( SE.theta.hat, sd( simulation.results.314159.10000.403.72 ) )

# [1] 0.01908195 0.01899710

# this compares the theoretical and simulation values;
# they agree out to about 0.001

theta.hat.grid.1 <- seq( min( simulation.results.314159.10000.403.72 ),
  max( simulation.results.314159.10000.403.72 ), length = 500 )

# this creates a grid of theta.hat values, from the smallest ('min')
# to the largest ('max') observed value in the repeated-sampling
# data set

lines( theta.hat.grid.1,
  dnorm( theta.hat.grid.1, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# this superimposes (the normal density curve with mean and SD
# given by the theoretical values) on the histogram, for comparison

# it looks like the normal curve doesn't fit the histogram well,
# but that's actually an illusion created by the fact that
# theta.hat = s / n is a discrete random variable,
# because the random variable S whose value is s in this case study
# is Binomial with parameters n and theta.DG

# the apparent lack of fit of the red normal density curve
# to the histogram is caused by the discreteness of S
# (and therefore of S / n): if You look closely, You'll see
# that, reading from left to right, many of the bar heights
# go (nonzero height, zero height) in pairs, and the nonzero heights
# and zero heights actually balance out to be close to the red curve

table( 403 * simulation.results.314159.10000.403.72 )

# this command multiplies all of the theta.hat = s / n values by n
# and creates a frequency distribution of the resulting s values,
# illustrating the discreteness directly

#  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
#   2   2   3   4   4   9  11  21  17  40  43  47  68 112 111 156 200 212 256 297 340 384 428 433
#  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
# 477 523 525 515 546 504 469 442 438 355 343 327 271 194 176 166 128  90  92  64  41  34  30  19
#  93  94  95  96  97  98  99 100 107
#   8   6   5   2   3   2   3   1   1

hist( simulation.results.314159.10000.403.72, breaks = 50, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

theta.hat.grid.1 <- seq( min( simulation.results.314159.10000.403.72 ),
  max( simulation.results.314159.10000.403.72 ), length = 500 )

lines( theta.hat.grid.1,
  dnorm( theta.hat.grid.1, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# the 3 commands right above this line redraw the histogram
# with only 50 bars and superimpose the normal curve on
# the new histogram; now the fit looks excellent, because
# the bars are wide enough to hide the discreteness of theta.hat

qqnorm( simulation.results.314159.10000.403.72,
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

# the two commands right above this line make a different type
# of plot to check normality: a *normal quantile-quantile* (qq) plot

# a normal qqplot graphs the data values, sorted from smallest
# to largest, on the vertical scale, against what are called
# *normal scores* on the horizontal scale, namely what You would have
# expected the sorted data values to be if a standardized version
# (mean 0, SD 1) of the data values had indeed been drawn from
# the standard normal curve

# if the data values do in fact follow the normal curve well,
# the normal qqplot will look like a straight line,
# namely the straight line with intercept equal to the
# theoretical normal curve mean and slope equal to the theoretical
# normal curve SD

# the 'abline' command superimposes that line on the qqplot;
# the fit is excellent

# note also that the discreteness of theta.hat is evident
# in the plot, in the many horizontal runs of points, which denote
# ties in the data (two or more theta.hat.star values exactly equal)
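# a supplementary sketch (not in the original session) quantifying
# how close the simulation mean should be to the theoretical mean:
# the Monte Carlo standard error of the mean of M IID theta.hat.star
# values is SE.theta.hat / sqrt( M )

SE.theta.hat / sqrt( M )

# with M = 10000 and SE.theta.hat about 0.0191 this is about 0.00019,
# which is consistent with the agreement 'out to about 0.0001'
# noted above; since Monte Carlo noise shrinks like 1 / sqrt( M ),
# multiplying M by 10 (as in the next run) buys a factor of about
# 3.16 in accuracy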
# let's run 'CLT.simulation' again, with a different seed
# and M = 100000

seed <- 271828
M <- 100000
n <- 403
s <- 72

theta.DG <- s / n

system.time(

  simulation.results.271828.100000.403.72 <-
    CLT.simulation( seed, M, n, theta.DG )

)

#   user  system elapsed
#   2.67    0.03    2.70

# on my (slow) laptop this took about 4 seconds

# multiplying M by 10 multiplied the user clock time
# by roughly a factor of 10

hist( simulation.results.271828.100000.403.72, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 271828, M = 100000, n = 403, s = 72' )

theta.hat.grid.2 <- seq( min( simulation.results.271828.100000.403.72 ),
  max( simulation.results.271828.100000.403.72 ), length = 500 )

lines( theta.hat.grid.2,
  dnorm( theta.hat.grid.2, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# this looks much the same as with M = 10000, but is in fact
# quite a bit more Monte-Carlo accurate

system.time(

  qqnorm( simulation.results.271828.100000.403.72,
    main = 'seed = 271828, M = 100000, n = 403, s = 72' )

)

#   user  system elapsed
#   0.41    2.86    3.28

# R has to sort 100000 numbers and compute 100000 normal scores,
# so it takes a bit longer to make the normal qqplot

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

table( 403 * simulation.results.271828.100000.403.72 )

#   36   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57
#    1    1    1    1    2    5    5    4   17   29   39   70   98  164  228  308  420  603  705
#   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75   76
# 1015 1245 1591 1894 2153 2599 3254 3468 3961 4220 4513 4845 5081 5258 5135 5148 5037 4743 4308
#   77   78   79   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95
# 3992 3760 3344 3013 2589 2198 1840 1488 1279 1009  780  658  481  390  312  196  167  102   74
#   96   97   98   99  100  101  102  103  104  105  110
#   40   43   30   21    9    7    2    2    3    1    1

# the frequencies in the earlier table with M = 10000
# are noisier versions of (the frequencies here divided by 10)
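# a supplementary check (not in the original session): since S is
# exactly Binomial( n, theta.DG ), the observed frequencies in the
# table above should match M times the exact binomial probabilities,
# up to Monte Carlo noise; 's.grid' and 'expected.counts' are just
# illustrative names

s.grid <- 36:110

expected.counts <- round( M * dbinom( s.grid, n, theta.DG ), 1 )

cbind( s = s.grid, expected = expected.counts )

# comparing these expected counts with the observed frequencies
# row by row gives a direct check on the simulation, with no
# normal approximation involved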
# let's explore the validity of the normal approximation
# to the repeated-sampling data set a bit,
# by making n quite a bit smaller (25)
# and keeping the mortality rate about the same: 0.18 * 25 = 4.5,
# so I'm going to go with ( s = 4, n = 25 ), which defines
# a mortality rate of 0.16:

seed <- 123456789
M <- 10000
n <- 25
s <- 4

theta.DG <- s / n

system.time(

  simulation.results.123456789.10000.25.4 <-
    CLT.simulation( seed, M, n, theta.DG )

)

#   user  system elapsed
#    0.1     0.0     0.1

# much faster (of course) with n = 25 instead of 403

hist( simulation.results.123456789.10000.25.4, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 123456789, M = 10000, n = 25, s = 4' )

theta.hat.grid.3 <- seq( min( simulation.results.123456789.10000.25.4 ),
  max( simulation.results.123456789.10000.25.4 ), length = 500 )

E.theta.hat <- theta.DG

c( E.theta.hat, mean( simulation.results.123456789.10000.25.4 ) )

# [1] 0.160000 0.159396

SE.theta.hat <- sqrt( theta.DG * ( 1 - theta.DG ) / n )

# this is the theoretical SD (standard error) of
# the random variable theta.hat

c( SE.theta.hat, sd( simulation.results.123456789.10000.25.4 ) )

# [1] 0.07332121 0.07309836

lines( theta.hat.grid.3,
  dnorm( theta.hat.grid.3, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# with n only 25 the discreteness is much stronger;
# we need a lot fewer histogram bars to create a decent comparison

hist( simulation.results.123456789.10000.25.4, breaks = 10, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 123456789, M = 10000, n = 25, s = 4' )

lines( theta.hat.grid.3,
  dnorm( theta.hat.grid.3, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# You can see that the normal approximation isn't so good with
# n = 25 and theta.DG = 0.16: the actual distribution
# has a much shorter left tail than the normal curve expects

system.time(

  qqnorm( simulation.results.123456789.10000.25.4,
    main = 'seed = 123456789, M = 10000, n = 25, s = 4',
    ylim = c( -0.2, 0.5 ) )

)

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

# this picture is also trying to tell us that the empirical
# distribution of the theta.hat values with ( n = 25, s = 4 )
# has a much shorter left tail than the normal curve:
# all of the smallest theta.hat values occur at 0,
# which is far bigger than what the red line
# predicts in the left tail under normality (about -0.15,
# which is of course an impossible value for a proportion)

save.image( 'CLT-simulation-CS-2-R.RData' )

q( save = 'no' )
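# a postscript sketch (supplementary, not part of the session above,
# and self-contained, so it can be run in a fresh R session):
# the short left tail at n = 25 can be anticipated from the
# theoretical skewness of theta.hat, which for the mean of n IID
# Bernoulli( theta ) draws is
# ( 1 - 2 * theta ) / sqrt( n * theta * ( 1 - theta ) )

skewness.theta.hat <- function( n, theta ) {

  # skewness of a Binomial( n, theta ) sample proportion; it decays
  # like 1 / sqrt( n ), which is why the normal approximation
  # improves as n grows

  return( ( 1 - 2 * theta ) / sqrt( n * theta * ( 1 - theta ) ) )

}

skewness.theta.hat( 403, 72 / 403 )   # about 0.084: nearly symmetric

skewness.theta.hat( 25, 4 / 25 )      # about 0.371: noticeably skewed

# the skewness at n = 25 is roughly 4.4 times that at n = 403,
# matching the visual impression from the histograms and qqplots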