# a little more on help files in R ?matrix ?rnorm ######################### # Experiment 1 with CLT # ######################### # create a storage vector for the sample means # rep(a,n) repeats the value "a" n times sample.means<-rep(0,1000) # create a loop to repeat the experiment # "i" will index the iterations, starting at 1 and ending at 1000 for(i in 1:1000){ # open loop in i x<-rnorm(5,mean=0,sd=1) # simulate 5 normal random variables # with mean 0 and st. dev. = 1 sample.means[i]<-mean(x)# store the sample mean of the 5 rv's } # close the loop in i # calculate the mean of our stored sample means mean(sample.means) # should be near 0 (by E(X.bar) = mu) # calculate the sample variance of the stored sample means var(sample.means) # should be near 1/5 (by CLT) # create a histogram of our stored sample means # with 50 bins/breaks/intervals. # freq=F specifies that we will have percentages, not counts hist(sample.means,breaks=50,freq=F) # draw a blue line (col="blue") that is double width (lwd=2) # of the kernel density estimated for our sample means lines(density(sample.means),col="blue",lwd=2) # create a vector of x-values for which we wil calculate # the N(0,1/5) density. This vector begins at -3 and ends # at 3. It is 100 long. x.support<-seq(from=-3,to=3,length=100) # draw a red line of double width of a N(0,1/5) density curve. lines(x.support,dnorm(x.support,mean=0,sd=(1/5)^.5), col="red",lwd=2) # Exercise: Draw a green line of double width for a # Normal curve having mean equal to the sample mean and # standard deviation equal to the sample standard deviation. # let's re-write this experiment num.sims<-1000 n<-5 mu<-0 sigma<-1 sample.means<-rep(0,num.sims) for(sim in 1:num.sims){ sample.means[sim]<-mean(rnorm(n,mean=mu,sd=sigma)) } mean(sample.means) # should be near ___? var(sample.means) # should be near ___? hist(sample.means,breaks=50,freq=F) lines(density(sample.means),col="blue",lwd=2) x<-seq(-3,3,l=100) y<-dnorm(x,mean=mu,sd=(sigma/sqrt(n))) lines(x,y,col="red",lwd=2) # re-run with n=30,100, num.sims=10^4 # pause here to comment out the above code # re-run with Poisson, uniform distributions # at each instance, look at r,p,q,d functions for those distributions # Copy the above code and change to Poisson and Uniform. ######################### # Experiment 2 with CLT # ######################### # How does convergence improve with sample size? # Need to use a non-Gaussian example... sizes<-c(2,5,10,30) num.sims<-1000 r<-1 # exponential parameter sample.means<-array(dim=c(1000,(length(sizes)))) for(size in 1:(length(sizes))){ n<-sizes[size] for(sim in 1:num.sims){ sample.means[sim,size]<-mean(rexp(n,rate=r)) } } apply(sample.means,2,mean) # are sample means sensible? apply(sample.means,2,var) # are sample variances sensible? par(mfrow=c(2,2),oma=c(3,3,5,3)) for(size in 1:(length(sizes))){ n<-sizes[size] hist(sample.means[,size],breaks=50,freq=F, main=paste("n =",n),xlab="x.bar") lines(density(sample.means[,size]),col="blue",lwd=2) x.support<-seq(from=-3,to=3,length=100) sigma<-1/n^.5 lines(x.support,dnorm(x.support,mean=r,sd=sigma), col="red",lwd=2) } mtext("CLT convergence improves in n (X~Exp(1))", side=3, cex=1.5,outer=T) # Pause here to comment out the above code. ####################################################################### # Let's go into the deep-end and practice with multiple distributions # ####################################################################### sizes<-c(2,5,10,30) num.sims<-1000 mu<-0; sigma<-1 # normal parameters lambda<-1 # Poisson parameters a<-0; b<-1 # uniform parameters sample.means<-array(dim=c(1000,(length(sizes)),3)) for(size in 1:(length(sizes))){ n<-sizes[size] for(dist in 1:3){ for(sim in 1:num.sims){ if(dist==1) sample.means[sim,size,dist]<-mean(rnorm(n,mu,sigma)) if(dist==2) sample.means[sim,size,dist]<-mean(rpois(n,lambda)) if(dist==3) sample.means[sim,size,dist]<-mean(runif(n,a,b)) } } } apply(sample.means,c(3,2),mean) # are sample means sensible? apply(sample.means,c(3,2),var) # are sample variances sensible? # a plot of our results par(mfrow=c(3,(length(sizes))),oma=c(3,3,5,3)) for(dist in 1:3){ for(size in 1:(length(sizes))){ n<-sizes[size] hist(sample.means[,size,dist],breaks=50, main=paste("n =",n)) } } # an improved plot of our results par(mfrow=c(3,(length(sizes))),oma=c(2,2,4,3)) for(dist in 1:3){ for(size in 1:(length(sizes))){ n<-sizes[size] if(dist==1) hist(sample.means[,size,dist],breaks=50, main=paste("n =",n),xlab="",ylab="",freq=F,xlim=c(-3,3)) if(dist==1 & size==4) mtext(" Gaussian",cex=1.5,side=4,line=1) if(dist==2) hist(sample.means[,size,dist],breaks=50, main="",xlab="",ylab="",freq=F,xlim=c(0,5)) if(dist==2 & size==4) mtext(" Poisson",cex=1.5,side=4,line=1) if(dist==3 & size==1) hist(sample.means[,size,dist],breaks=50, main="",xlab="x.bar",ylab="frequency",freq=F,xlim=c(0,1)) if(dist==3 & size!=1) hist(sample.means[,size,dist],breaks=50, main="",xlab="",ylab="",freq=F,xlim=c(0,1)) if(dist==3 & size==4) mtext(" Uniform",cex=1.5,side=4,line=1) } }