#################################################
##Some simulations to help explore the idea of parameters and statistics ####
#################################################
library(ggplot2)
#######
#The first thing we are going to do is use R to simulate a population. We can then use this population to do things like take samples.
#We will use the rnorm() function to generate a normally distributed population with a mean of 0 and a standard deviation of 1.
#######
#define a population
#rnorm() draws 10000 values from a normal distribution with mean 0 and sd 1
p <- rnorm(10000, mean = 0, sd = 1)
#calculate the population mean by hand (sum of the values divided by how many there are)
p.m <- sum(p) / length(p)
#calculate the population variance
#Notice that I am doing this by hand. R has a built in function for variance called var(), but it implements the Bessel correction (divides by n-1) by default, which we don't want for populations
p.v <- sum((p - p.m)^2) / length(p)
#calculate the population standard deviation
#Again, R has a built in function for this called sd(), but it uses the Bessel correction, and we don't want that for a population.
#The population standard deviation is just the square root of the population variance we computed above, so reuse it rather than recomputing the sum of squared deviations.
p.sd <- sqrt(p.v)
########
#Now let's sample from the population
########
#The sample() function draws values from the population (without replacement by default).
#Here we take two samples of 20 values each.
s1 <- sample(p, 20)
s2 <- sample(p, 20)
#Summary statistics for each sample, using the uncorrected (divide-by-n) formulas.
#Ultimately, this will help us demonstrate the bias in the uncorrected variance formula.
#The standard deviation is simply the square root of the variance we just stored.
s1.m <- sum(s1) / length(s1)
s1.v <- sum((s1 - s1.m)^2) / length(s1)
s1.sd <- sqrt(s1.v)
s2.m <- sum(s2) / length(s2)
s2.v <- sum((s2 - s2.m)^2) / length(s2)
s2.sd <- sqrt(s2.v)
#Here is a for-loop that takes many samples from the population, calculates summary statistics for each of them, and saves the summary statistics.
#Name the simulation settings so they are easy to change in one place.
n.samples <- 1000
sample.size <- 20
#The first step when using a loop is to define a data structure to contain the results of the loop.
#This needs to be defined outside of the loop so that it isn't re-generated in each step of the loop.
results <- data.frame(
  m = rep(0, n.samples),
  v = rep(0, n.samples),
  stdev = rep(0, n.samples),
  v.corrected = rep(0, n.samples),
  stdev.corrected = rep(0, n.samples)
)
#Then we can run the loop.
#Everything inside the curly braces is run once for every integer from 1 to n.samples.
#We calculate both the uncorrected (divide by n) and the corrected (divide by n-1) variance.
for (i in seq_len(n.samples)) {
  s <- sample(p, sample.size)
  s.m <- sum(s) / sample.size
  #sum of squared deviations: compute it once and reuse it for all four statistics
  ss <- sum((s - s.m)^2)
  results[i, "m"] <- s.m
  results[i, "v"] <- ss / sample.size
  results[i, "stdev"] <- sqrt(ss / sample.size)
  results[i, "v.corrected"] <- ss / (sample.size - 1)
  results[i, "stdev.corrected"] <- sqrt(ss / (sample.size - 1))
}
#Now we have 1000 samples and their summary statistics saved in a data frame called results
###########
#Seeing the effect of bias in the variance, and the effect of Bessel's correction
###########
#Now let's look at the estimated variances and compare them to the population variance.
#We plot the distribution of the estimated variances, their mean as a dashed red line, and the population variance as a solid black line.
#First, the uncorrected variance estimates:
ggplot(results, aes(x = v)) +
  geom_histogram(binwidth = .1, fill = "white", color = "black") +
  geom_vline(
    xintercept = c(p.v, mean(results$v, na.rm = TRUE)),
    color = c("black", "#e41a1c"),
    linetype = c(1, 2),
    size = 1.5
  )
#Notice that the mean of the variance estimates sits below the population variance. Go ahead and re-run the simulation above (the for-loop) more times and plot the results. The mean variance estimate will always be less than (or equal to) the population variance.
#Now the corrected variance estimates:
ggplot(results, aes(x = v.corrected)) +
  geom_histogram(binwidth = .1, fill = "white", color = "black") +
  geom_vline(
    xintercept = c(p.v, mean(results$v.corrected, na.rm = TRUE)),
    color = c("black", "#e41a1c"),
    linetype = c(1, 2),
    size = 1.5
  )
#Go ahead and re-run the simulation (the for-loop) several times. Sometimes the estimate will be below the population variance, sometimes above it. It is no longer biased.
#Note that if we plotted standard deviations, we would see that the corrected standard deviation is still biased. Bessel's correction is not strong enough to correct for the bias in the standard deviation (because the square root is a concave function). There is a correction for standard deviation, but it is beyond the scope of this class.
#If you just want to see the means of all of these statistics, you can type these commands:
mean(results$m)
mean(results$v)
mean(results$stdev)
mean(results$v.corrected)
mean(results$stdev.corrected)
##############
#The sampling distribution of means
#And standard error
##############
#We won't calculate the actual sampling distribution of means, because they are just too big (they grow factorially).
#But, we've already begun to estimate one in the loop above. We took 1000 samples of size 20 from our population, and calculated the means.
#Plot a histogram of the population itself. We need a data frame to use ggplot, so we'll make one.
#The index column must be the same length as p (10000 values), so use seq_along(p); a shorter index would be silently recycled by data.frame().
p.dataframe <- data.frame(index = seq_along(p), p = p)
#Now use ggplot to plot the population histogram with the mean marked in red (it is very very close to 0, which is what we defined when we created this population)
ggplot(p.dataframe, aes(x = p)) +
  geom_histogram(binwidth = .25, color = "black", fill = "white") +
  geom_vline(xintercept = mean(p.dataframe$p, na.rm = TRUE), color = "#e41a1c", size = 1.5)
#plot a histogram of the means of our samples:
ggplot(results, aes(x = m)) +
  geom_histogram(binwidth = .1, color = "black", fill = "white") +
  geom_vline(xintercept = mean(results$m, na.rm = TRUE), color = "#e41a1c", size = 1.5)
#Notice that the mean of the estimated sampling distribution of means is very close to the mean of the population. If we had a true sampling distribution of the mean, it would be identical, but we don't, so it is just really close.
#Finally, let's compare the formula for estimating standard error with the empirical simulation of the standard error.
#Empirical estimation of standard error: the (uncorrected) standard deviation of the 1000 sample means.
empirical.standard.error <- sqrt(sum((results$m - mean(results$m))^2) / length(results$m))
#Plot a histogram of the means of our samples with a blue bracket showing one standard error above the mean.
#The horizontal segment runs from the mean to mean + SE (the original code used the raw SE value as xend, which only looked right because the mean is approximately 0).
ggplot(results, aes(x = m)) +
  geom_histogram(binwidth = .1, color = "black", fill = "white") +
  geom_vline(xintercept = mean(results$m, na.rm = TRUE), color = "#e41a1c", size = 1.5) +
  geom_segment(x = mean(results$m, na.rm = TRUE), xend = mean(results$m, na.rm = TRUE) + empirical.standard.error, y = 25, yend = 25, color = "#377EB8", size = 1.5) +
  geom_segment(x = mean(results$m, na.rm = TRUE) + empirical.standard.error, xend = mean(results$m, na.rm = TRUE) + empirical.standard.error, y = 15, yend = 35, color = "#377EB8", size = 1.5)
#Now, we can estimate the standard error from every sample we have. Notice we use the corrected standard deviation (each sample had 20 values).
estimated.standard.errors <- results$stdev.corrected / sqrt(20)
#Now, we can calculate the difference between each estimate and the empirically derived standard error
differences <- estimated.standard.errors - empirical.standard.error
#And we can plot the differences.
#First, we need to create a data frame (ggplot requires one)
differences <- data.frame(index = seq_along(differences), differences = differences)
#Then we can plot the distribution and its mean
ggplot(differences, aes(x = differences)) +
  geom_histogram(binwidth = .01, color = "black", fill = "white") +
  geom_vline(xintercept = mean(differences$differences, na.rm = TRUE), color = "#e41a1c", size = 1.5)
#As you can see, the estimates are very close to the empirically derived value, as evidenced by the very small differences between them. The mean of all of the estimates is very close to 0!