# start up a new R session

# unbuffer the output

# set the working directory to the place You want the session stored,
# either with the 'setwd' built-in function or the pull-down menu;
# on my desktop at home this is
# setwd( 'C:/DD/Teaching/AMS-206/Winter-2019' )

# in what follows, R commands are followed by
# comments on what the commands do

CLT.simulation <- function( seed, M, n, theta.DG ) {

  # this begins the creation of the object 'CLT.simulation'
  # of mode and class 'function', with inputs to the function
  # ( seed, M, n, theta.DG )

  set.seed( seed )

  # this sets the random number seed, to make the results replicable

  repeated.sampling.data.set <- rep( NA, M )

  # this creates the output vector and fills it with NA values;
  # if any of its elements are still NA at the end of the calculations,
  # something went wrong

  for ( m in 1:M ) {

    # this begins looping through the elements of the output vector

    y.star <- rbinom( n, 1, theta.DG )

    # this creates a simulated IID sample of 0s and 1s of size n,
    # with each binary draw having theta.DG as its
    # probability of being a 1

    theta.hat.star <- mean( y.star )

    # this calculates the value of the estimator theta.hat
    # based on y.star

    repeated.sampling.data.set[ m ] <- theta.hat.star

    # this fills element m of the output vector with the latest
    # simulated value of theta.hat

  }

  # this ends the 'for' loop

  return( repeated.sampling.data.set )

  # this returns the vector of M simulated theta.hat values

}

# this ends the creation of 'CLT.simulation'

# now let's initialize all of the inputs to the function

seed <- 314159   # random number seed
M <- 10000       # size of the repeated-sampling data set
n <- 403         # sample size in each data set of 1s and 0s
s <- 72          # number of deaths at Dominican Hospital
                 # (DH) in CS 2

print( theta.DG <- s / n )   # mortality rate at DH in CS 2

# [1] 0.17866

system.time(

  # this calls the built-in function 'system.time',
  # which keeps track of the clock time consumed by
  # the commands in between '(' and ')'

  simulation.results.314159.10000.403.72 <-
    CLT.simulation( seed, M, n, theta.DG )

  # this calls the user-defined function 'CLT.simulation' created above

)

# this ends the 'system.time' command

#   user  system elapsed
#   0.26    0.05    0.33

# on my home desktop, which has 4 cores, each at 3.40GHz,
# hyperthreaded (so 8 total threads, but using only 1 thread at a time
# for this calculation), and 32GB of fast RAM (setting up R
# and performing the commands listed so far, the Windows OS used
# about 15GB of RAM, leaving 17GB free, so no memory problems
# with something as simple as this),
# 10,000 simulation replications only took about 0.3 seconds
# of clock time

# on my (slow) laptop (2 cores, 4 threads, 8GB RAM
# [about 2GB used]), 10,000 simulation replications
# took about 0.6 seconds
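# an aside (a supplementary sketch, not part of the original session):
# because the sum of n IID Bernoulli( theta.DG ) draws is exactly
# Binomial( n, theta.DG ), the entire 'for' loop above can be replaced
# by a single vectorized call to 'rbinom'; the name
# 'CLT.simulation.vectorized' is just an illustrative choice

CLT.simulation.vectorized <- function( seed, M, n, theta.DG ) {

  set.seed( seed )

  # draw M Binomial( n, theta.DG ) counts at once and divide each
  # by n, producing the M simulated theta.hat values in one step

  return( rbinom( M, n, theta.DG ) / n )

}

# the two versions consume the random-number stream differently,
# so their outputs won't match element for element, but both are
# draws from the same repeated-sampling distribution, and the
# vectorized version avoids the loop overhead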
sum( is.na( simulation.results.314159.10000.403.72 ) )

# [1] 0

# the 'is.na' built-in function call creates a 'logical' vector
# of length M in which each element is 'TRUE' if the corresponding
# element of 'simulation.results.314159.10000.403.72' is missing (NA)
# and 'FALSE' otherwise; 'sum' then coerces the 'logical' vector
# to a 'numeric' vector, with 1 for 'TRUE' and 0 for 'FALSE',
# and adds up the elements of the resulting vector of length M;
# if the sum is anything other than 0, You know something
# went wrong inside the function 'CLT.simulation'

hist( simulation.results.314159.10000.403.72, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

# this makes a histogram of the theta.hat values in the
# repeated-sampling data set, with 100 histogram bars;
# 'prob = T', which is a permitted abbreviation of
# 'probability = TRUE', puts the vertical axis of the histogram
# on the density scale and labels the y axis 'Density';
# 'xlab' creates a horizontal-axis label; and 'main'
# creates a title for the plot

E.theta.hat <- theta.DG

# this is the theoretical mean (expected value) of
# the random variable theta.hat

c( E.theta.hat, mean( simulation.results.314159.10000.403.72 ) )

# [1] 0.1786600 0.1787814

# this compares the theoretical and simulation means;
# they agree out to about 0.0001

SE.theta.hat <- sqrt( theta.DG * ( 1 - theta.DG ) / n )

# this is the theoretical SD (standard error) of
# the random variable theta.hat

c( SE.theta.hat, sd( simulation.results.314159.10000.403.72 ) )

# [1] 0.01908195 0.01899710

# this compares the theoretical and simulation values;
# they agree out to about 0.001

theta.hat.grid.1 <- seq( min( simulation.results.314159.10000.403.72 ),
  max( simulation.results.314159.10000.403.72 ), length = 500 )

# this creates a grid of theta.hat values, from the smallest ('min')
# to the largest ('max') observed value in the repeated-sampling
# data set

lines( theta.hat.grid.1,
  dnorm( theta.hat.grid.1, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# this superimposes (the normal density curve with mean and SD
# given by the theoretical values) on the histogram, for comparison

# it looks like the normal curve doesn't fit the histogram well,
# but that's actually an illusion created by the fact that
# theta.hat = s / n is a discrete random variable,
# because the random variable S whose value is s in this case study
# is Binomial with parameters n and theta.DG

# the apparent lack of fit of the red normal density curve
# to the histogram is caused by the discreteness of S
# (and therefore of S / n): if You look closely, You'll see
# that, reading from left to right, many of the bar heights
# go (nonzero height, zero height) in pairs, and the nonzero heights
# and zero heights actually balance out to be close to the red curve

table( 403 * simulation.results.314159.10000.403.72 )

# this command multiplies all of the theta.hat = s / n values by n
# and creates a frequency distribution of the resulting s values,
# illustrating the discreteness directly

#  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
#   2   2   3   4   4   9  11  21  17  40  43  47  68 112 111 156 200 212 256 297 340 384 428 433
#  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
# 477 523 525 515 546 504 469 442 438 355 343 327 271 194 176 166 128  90  92  64  41  34  30  19
#  93  94  95  96  97  98  99 100 107
#   8   6   5   2   3   2   3   1   1

hist( simulation.results.314159.10000.403.72, breaks = 50, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

theta.hat.grid.1 <- seq( min( simulation.results.314159.10000.403.72 ),
  max( simulation.results.314159.10000.403.72 ), length = 500 )

lines( theta.hat.grid.1,
  dnorm( theta.hat.grid.1, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# the 3 commands right above this line redraw the histogram
# with only 50 bars and superimpose the normal curve on
# the new histogram; now the fit looks excellent, because
# the bars are wide enough to hide the discreteness of theta.hat

qqnorm( simulation.results.314159.10000.403.72,
  main = 'seed = 314159, M = 10000, n = 403, s = 72' )

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

# the two commands right above this line make a different type
# of plot to check normality: a *normal quantile-quantile* (qq) plot

# a normal qqplot graphs the data values, sorted from smallest
# to largest, on the vertical scale, against what are called
# *normal scores* on the horizontal scale, namely what You would have
# expected the sorted data values to be if a standardized version
# (mean 0, SD 1) of the data values had indeed been drawn from
# the standard normal curve

# if the data values do in fact follow the normal curve well,
# the normal qqplot will look like a straight line,
# namely the straight line with intercept equal to the
# theoretical normal curve mean and slope equal to the theoretical
# normal curve SD

# the 'abline' command superimposes that line on the qqplot;
# the fit is excellent

# note also that the discreteness of theta.hat is evident
# in the plot, in the many horizontal runs of points, which denote
# ties in the data (two or more theta.hat.star values exactly equal)
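# a supplementary sketch (not in the original session) quantifying
# how close the simulation mean should be to the theoretical mean:
# the Monte Carlo standard error of the mean of M IID theta.hat.star
# values is SE.theta.hat / sqrt( M )

SE.theta.hat / sqrt( M )

# with M = 10000 and SE.theta.hat about 0.0191 this is about 0.00019,
# which is consistent with the agreement 'out to about 0.0001'
# noted above; since Monte Carlo noise shrinks like 1 / sqrt( M ),
# multiplying M by 10 (as in the next run) buys a factor of about
# 3.16 in accuracy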
# let's run 'CLT.simulation' again, with a different seed
# and M = 100000

seed <- 271828
M <- 100000
n <- 403
s <- 72

theta.DG <- s / n

system.time(

  simulation.results.271828.100000.403.72 <-
    CLT.simulation( seed, M, n, theta.DG )

)

#   user  system elapsed
#   2.67    0.03    2.70

# on my (slow) laptop this took about 4 seconds

# multiplying M by 10 multiplied the user clock time
# by roughly a factor of 10

hist( simulation.results.271828.100000.403.72, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 271828, M = 100000, n = 403, s = 72' )

theta.hat.grid.2 <- seq( min( simulation.results.271828.100000.403.72 ),
  max( simulation.results.271828.100000.403.72 ), length = 500 )

lines( theta.hat.grid.2,
  dnorm( theta.hat.grid.2, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# this looks much the same as with M = 10000, but is in fact
# quite a bit more Monte-Carlo accurate

system.time(

  qqnorm( simulation.results.271828.100000.403.72,
    main = 'seed = 271828, M = 100000, n = 403, s = 72' )

)

#   user  system elapsed
#   0.41    2.86    3.28

# R has to sort 100000 numbers and compute 100000 normal scores,
# so it takes a bit longer to make the normal qqplot

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

table( 403 * simulation.results.271828.100000.403.72 )

#   36   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57
#    1    1    1    1    2    5    5    4   17   29   39   70   98  164  228  308  420  603  705
#   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75   76
# 1015 1245 1591 1894 2153 2599 3254 3468 3961 4220 4513 4845 5081 5258 5135 5148 5037 4743 4308
#   77   78   79   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95
# 3992 3760 3344 3013 2589 2198 1840 1488 1279 1009  780  658  481  390  312  196  167  102   74
#   96   97   98   99  100  101  102  103  104  105  110
#   40   43   30   21    9    7    2    2    3    1    1

# the frequencies in the earlier table with M = 10000
# are noisier versions of (the frequencies here divided by 10)
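# a supplementary check (not in the original session): since S is
# exactly Binomial( n, theta.DG ), the observed frequencies in the
# table above should match M times the exact binomial probabilities,
# up to Monte Carlo noise; 's.grid' and 'expected.counts' are just
# illustrative names

s.grid <- 36:110

expected.counts <- round( M * dbinom( s.grid, n, theta.DG ), 1 )

cbind( s = s.grid, expected = expected.counts )

# comparing these expected counts with the observed frequencies
# row by row gives a direct check on the simulation, with no
# normal approximation involved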
# let's explore the validity of the normal approximation
# to the repeated-sampling data set a bit,
# by making n quite a bit smaller (25)
# and keeping the mortality rate about the same: 0.18 * 25 = 4.5,
# so I'm going to go with ( s = 4, n = 25 ), which defines
# a mortality rate of 0.16:

seed <- 123456789
M <- 10000
n <- 25
s <- 4

theta.DG <- s / n

system.time(

  simulation.results.123456789.10000.25.4 <-
    CLT.simulation( seed, M, n, theta.DG )

)

#   user  system elapsed
#    0.1     0.0     0.1

# much faster (of course) with n = 25 instead of 403

hist( simulation.results.123456789.10000.25.4, breaks = 100, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 123456789, M = 10000, n = 25, s = 4' )

theta.hat.grid.3 <- seq( min( simulation.results.123456789.10000.25.4 ),
  max( simulation.results.123456789.10000.25.4 ), length = 500 )

E.theta.hat <- theta.DG

c( E.theta.hat, mean( simulation.results.123456789.10000.25.4 ) )

# [1] 0.160000 0.159396

SE.theta.hat <- sqrt( theta.DG * ( 1 - theta.DG ) / n )

# this is the theoretical SD (standard error) of
# the random variable theta.hat

c( SE.theta.hat, sd( simulation.results.123456789.10000.25.4 ) )

# [1] 0.07332121 0.07309836

lines( theta.hat.grid.3,
  dnorm( theta.hat.grid.3, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# with n only 25 the discreteness is much stronger;
# we need a lot fewer histogram bars to create a decent comparison

hist( simulation.results.123456789.10000.25.4, breaks = 10, prob = T,
  xlab = 'theta.hat',
  main = 'seed = 123456789, M = 10000, n = 25, s = 4' )

lines( theta.hat.grid.3,
  dnorm( theta.hat.grid.3, E.theta.hat, SE.theta.hat ),
  lty = 1, lwd = 2, col = 'red' )

# You can see that the normal approximation isn't so good with
# n = 25 and theta.DG = 0.16: the actual distribution
# has a much shorter left tail than the normal curve expects

system.time(

  qqnorm( simulation.results.123456789.10000.25.4,
    main = 'seed = 123456789, M = 10000, n = 25, s = 4',
    ylim = c( -0.2, 0.5 ) )

)

abline( E.theta.hat, SE.theta.hat, lwd = 2, col = 'red' )

# this picture is also trying to tell us that the empirical
# distribution of the theta.hat values with ( n = 25, s = 4 )
# has a much shorter left tail than the normal curve:
# all of the smallest theta.hat values occur at 0,
# which is far bigger than what the red line
# predicts in the left tail under normality (about -0.15,
# which is of course an impossible value for a proportion)

save.image( 'CLT-simulation-CS-2-R.RData' )

q( save = 'no' )
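# a postscript sketch (supplementary, not part of the session above,
# and self-contained, so it can be run in a fresh R session):
# the short left tail at n = 25 can be anticipated from the
# theoretical skewness of theta.hat, which for the mean of n IID
# Bernoulli( theta ) draws is
# ( 1 - 2 * theta ) / sqrt( n * theta * ( 1 - theta ) )

skewness.theta.hat <- function( n, theta ) {

  # skewness of a Binomial( n, theta ) sample proportion; it decays
  # like 1 / sqrt( n ), which is why the normal approximation
  # improves as n grows

  return( ( 1 - 2 * theta ) / sqrt( n * theta * ( 1 - theta ) ) )

}

skewness.theta.hat( 403, 72 / 403 )   # about 0.084: nearly symmetric

skewness.theta.hat( 25, 4 / 25 )      # about 0.371: noticeably skewed

# the skewness at n = 25 is roughly 4.4 times that at n = 403,
# matching the visual impression from the histograms and qqplots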