The objective is to compare the efficiency of the mean with that of the median, using a large simulation in which both estimators are applied to samples of U(0,1) uniformly distributed random numbers.
Step 1. Define a function for sampling
one.simulation <- function(N=100) {
# N defaults to 100 if not supplied
x = runif(N)
return(c(mean(x), median(x)))
}
Step 2. Generate a \(2 \times 10000\) matrix
results = replicate(10000, one.simulation(200))
# Two kernel densities
k1 = density(results[1,]) # results[1,] is the 1st row
k2 = density(results[2,])
Step 3. A pretty picture
xrange = range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",lty=c(1,1),col=c("black", "red"),legend=c("Mean", "Median"))
quantmod::getSymbols("BTC-USD")
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## [1] "BTC-USD"
BTC=`BTC-USD`
k=6 # column 6 holds the adjusted close
set.seed(123)
Y0=as.numeric(na.omit(BTC[,k])) # population: all adjusted closing prices
Y=sample(Y0,1500) # draw a sample of 1,500 observations
s=600 # size of each bootstrap resample
reps=2000 # number of bootstrap replications
x0=NULL;for (i in seq(reps)){
x0=c(x0,mean(sample(Y,s,replace = TRUE)))
}
x=sort(x0)[(reps*0.1):reps] # drop the lowest 10% of the bootstrap means
###dev.new()
hist(x)
abline(v=mean(Y0),lwd=3,col="red",lty=2) # population mean
abline(v=mean(Y),col="blue") #sample mean
abline(v=mean(x),col="green") #bootstrap mean
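The bootstrap replicates also yield an interval estimate directly; a minimal sketch of a 95% percentile interval, using the untrimmed draws in x0:
quantile(x0, probs=c(.025,.975)) # 95% percentile bootstrap interval for the mean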
Step 1. Define a function for computing the median
samplemedian <- function(x, d) {
# d is a vector of integer indexes
return(median(x[d])) # The genius is in the x[d] notation
}
Step 2. Generate a normally distributed sample with 50 obs
data <- rnorm(50) # Generate a dataset
samplemedian(data)
## [1] 0.2502838
Step 3. Generate 2,000 bootstrap replications
b <- boot::boot(data, samplemedian, R=2000)
head(b$t)
## [,1]
## [1,] 0.1959390
## [2,] 0.6252099
## [3,] 0.7172471
## [4,] 0.1959390
## [5,] 0.4578366
## [6,] 0.2896740
The standard deviation of the bootstrap replicates:
sd(b$t)
## [1] 0.2315454
plot(b)
Check bootstrap intervals:
boot::boot.ci(b, conf=0.95, type="basic")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
##
## CALL :
## boot::boot.ci(boot.out = b, conf = 0.95, type = "basic")
##
## Intervals :
## Level Basic
## 95% (-0.3087, 0.7207 )
## Calculations and Intervals on Original Scale
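boot.ci() can report several interval types in a single call; a quick variation on the call above that also requests the normal-approximation and percentile intervals:
boot::boot.ci(b, conf=0.95, type=c("norm", "basic", "perc"))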
This is in the context of testing for time-series dependence in
stock market returns data.
The code here does the idea of Kim,
Nelson, Startz (1991).We want to use the distribution of real world
returns data, without needing assumptions about normality.
The null
is lack of dependence (i.e. an efficient market).So repeatedly, the data
is permuted, and the sample ACF is computed. This gives us the
distribution of the ACF under H0: independence, but while using the
empirical distribution of the returns data.
Case 1: Testing if ret is an independent time series
Step 1. Prepare the data: daily returns on Microsoft, 2007/1/3 to 2024/6/6
quantmod::getSymbols("MSFT")
## [1] "MSFT"
Step 2. Prepare the time series of returns
ret= MSFT[,"MSFT.Adjusted"] |>
log() |>
diff() |>
na.omit()*100
ret= as.matrix(ret)
t=nrow(ret)
t
## [1] 4417
So the sample contains 4,417 daily return observations.
Step 3. Compute the first autocorrelation from the sample above.
phi=acf(ret, 1, plot=FALSE)$acf[2]
phi
## [1] -0.1091192
Step 4. Obtain 1000 draws to generate the distribution of the 1st
autocorrelation
set.seed(101)
simulated1 = replicate(1000, acf(ret[sample(seq(t), replace=FALSE)], 1, plot=FALSE)$acf[2])
Step 5. The 95% confidence interval
quantile(simulated1, probs=c(.025,.975))
## 2.5% 97.5%
## -0.02979108 0.03041340
Step 6. The 99% confidence interval
quantile(simulated1, probs=c(.005,.995))
## 0.5% 99.5%
## -0.03728414 0.03540463
The observed autocorrelation (\(\phi = -0.109\)) lies far outside both intervals, so we reject the null of independence.
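Equivalently, we can report a two-sided permutation p-value, i.e., the share of permuted autocorrelations at least as extreme as the observed one:
mean(abs(simulated1) >= abs(phi)) # two-sided permutation p-value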
Finally, we visualize the result.
#dev.new()
plot(density(simulated1), xlim=sort(range(c(phi, density(simulated1)$x))),col="blue")
abline(v=0)
abline(v=quantile(simulated1, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=phi, lty=2, lwd=4, col="yellow")
Case 2: Testing whether the mean of ret is significantly different from 0.
H0: \(\mu = 0\)
mu=mean(ret)
mu
## [1] 0.06874052
Step 1. Obtain 5,000 sub-sample means to generate the distribution of the mean
set.seed(101)
simulated2 = replicate(5000, mean(sample(ret, 3000, replace=FALSE)))
cat("Sample mean is", mu, "\n")
## Sample mean is 0.06874052
Step 2. The 95% confidence interval
quantile(simulated2, probs=c(.025,.975))
## 2.5% 97.5%
## 0.03375978 0.10446726
Step 3. The 99% confidence interval
quantile(simulated2, probs=c(.005,.995))
## 0.5% 99.5%
## 0.0236874 0.1168051
Both intervals exclude 0, so we reject the null at both significance levels.
Finally, visualize the result.
#dev.new()
plot(density(simulated2), xlim=sort(range(c(mu, density(simulated2)$x))),col="blue")
abline(v=0)
abline(v=quantile(simulated2, probs=c(.025,.975)), lwd=2, col="purple")
abline(v=mu, lty=2, lwd=4, col="yellow")
Step 1. Define parameters
set.seed(123456) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
par.est <- matrix(NA, nrow = reps, ncol = 4) # Empty matrix to store the
# estimates
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
Step 2. Replications
for(i in 1:reps){ # Start the loop
errors <- rnorm(n, 0, 1)
Y <- b0 + b1*X + errors # The true DGP, with N(0, 1) error
model <- lm(Y ~ X) # Estimate OLS model
vcv <- vcov(model) # Variance-covariance matrix
par.est[i, 1] <- model$coef[1] # Estimate of the intercept in the first column
par.est[i, 2] <- model$coef[2] # Estimate of the coefficient on X in the second column
par.est[i, 3] <- sqrt(diag(vcv)[1]) # SE of the intercept in the third column
par.est[i, 4] <- sqrt(diag(vcv)[2]) # SE of the coefficient on X in the fourth column
} # End the loop
DGP: \(Y = b_0 + b_1 X + \varepsilon\) with \(\varepsilon \sim N(0, 1)\), i.e., drawing \(Y\) via rnorm(n, b0 + b1*X, 1).
Step 3. Compute error measures of coefficient estimates
\[ AB = E\left[\,|\hat\beta - b|\,\right], \qquad MSE = E\left[(\hat\beta - b)^2\right] \]
# Absolute Bias of beta
ab.beta0 <- mean(abs(par.est[ , 1] - b0))
ab.beta1 <- mean(abs(par.est[ , 2] - b1))
# MSE
mse.beta0 <- mean((par.est[ , 1] - b0)^2)
mse.beta1 <- mean((par.est[ , 2] - b1)^2)
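As a sanity check, the MSE decomposes exactly into squared bias plus Monte Carlo variance; for the slope:
bias.beta1 <- mean(par.est[ , 2]) - b1 # Monte Carlo bias of the slope estimates
bias.beta1^2 + mean((par.est[ , 2] - mean(par.est[ , 2]))^2) # reproduces mse.beta1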
Step 4. Compute standard deviation of coefficient estimates
sd.beta0 <- sd(par.est[ , 1]) # SD of the intercept estimates
mean.se.beta0 <- mean(par.est[ , 3]) # Mean SE of the intercept
sd.beta1 <- sd(par.est[ , 2]) # SD of the coefficient on X estimates
mean.se.beta1 <- mean(par.est[ , 4]) # Mean SE of the coefficient on X
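If the OLS standard errors are well calibrated, the mean estimated SE should track the Monte Carlo SD of the estimates; a quick side-by-side comparison:
c(sd.beta0=sd.beta0, mean.se.beta0=mean.se.beta0,
sd.beta1=sd.beta1, mean.se.beta1=mean.se.beta1)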
Step 5. Visualization: histogram of 1,000 simulated standard error estimates for \(\beta_0\)
par(mar = c(5, 5.25, .5, .5))
H0=hist(par.est[ , 3], breaks = 25, col = "gray50", xlab = "", ylab = "",
main = "", axes = FALSE)
axis(1, cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression("Standard Error Estimates for"~beta[0]),
cex.lab = 1.5)
title(ylab = expression("Frequency"), line = 3.75, cex.lab = 1.5)
abline(v = sd.beta0, lwd = 4)
text(sd.beta0*0.96, max(H0$counts)*0.9,
substitute(paste("SD of ", hat(beta[0]), "=", v ),list(v=round(sd.beta0,4))),
cex = 1.5,col="red")
box()
Step 6. Visualization: histogram of 1,000 simulated standard error estimates for \(\beta_1\)
par(mar = c(5, 5.25, .5, .5))
H1=hist(par.est[ , 4], breaks = 25, col = "gray50", xlab = "", ylab = "",
main = "", axes = FALSE)
axis(1, cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression("Standard Error Estimates for"~beta[1]),
cex.lab = 1.5)
title(ylab = expression("Frequency"), line = 3.75, cex.lab = 1.5)
abline(v = sd.beta1, lwd = 4)
text(sd.beta1*0.96, max(H1$counts)*0.9,
substitute(paste("SD of ", hat(beta[1]), "=", v ),list(v=round(sd.beta1,4))),
cex = 1.5,col="red")
box()
Step 7. Compute coverage probability
The function
coverage <- function(b.est, se.est, b.true, level = .95, df = Inf){
# b.est= parameter estimate,
# se.est= standard error,
# b.true= true parameter value,
# level=confidence level,
# df=degree of freedom
qtile <- level + (1 - level)/2 # Compute the proper quantile
lower.bound <- b.est - qt(qtile, df = df)*se.est # Lower bound
upper.bound <- b.est + qt(qtile, df = df)*se.est # Upper bound
# Is the true parameter in the confidence interval? (yes = 1)
true.in.ci <- ifelse(b.true >= lower.bound & b.true <= upper.bound, 1, 0)
cp <- mean(true.in.ci) # The coverage probability
mc.lower.bound <- cp - 1.96*sqrt((cp*(1 - cp))/length(b.est)) # Monte Carlo error
mc.upper.bound <- cp + 1.96*sqrt((cp*(1 - cp))/length(b.est))
return(list(coverage.probability = cp, # Return results
true.in.ci = true.in.ci,
ci = cbind(lower.bound, upper.bound),
mc.eb = c(mc.lower.bound, mc.upper.bound)))
}
# Apply the coverage function
cp.beta0 <- coverage(par.est[ , 1], par.est[ , 3], b0, df = n - model$rank)
cp.beta1 <- coverage(par.est[ , 2], par.est[ , 4], b1, df = n - model$rank)
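With a correctly specified model, both coverage probabilities should be close to the nominal 0.95 (exact values vary across runs):
cp.beta0$coverage.probability # coverage for the intercept
cp.beta1$coverage.probability # coverage for the slope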
Step 8. Coefficient estimate \(\beta_0\) and its 95% confidence intervals for 100 randomly selected simulated samples.
ID=sample(seq(reps),200)
par(mar = c(5, 6, .5, .5))
plot(ID[1:100], seq(.05, .4, length = 100), type = "n",
axes = FALSE, xlab = "", ylab = "")
title(xlab = expression("Randomly Selected 100 Simulated" ~ beta[0]), cex.lab = 1.5)
title(ylab = expression(hat(beta[0])), line = 3.75, cex.lab = 1.5)
box()
axis(1, at = ID[1:100], cex.axis = 1.25)
axis(2, at = seq(.05, .4, .05), cex.axis = 1.25, las = 2)
abline(h = b0, lwd = 2)
for (i in ID[1:100]){
points(i, par.est[i, 1], lwd = 2, col = ifelse(cp.beta0$true.in.ci[i] == 1,
"gray70", "gray20"), pch = 19)
segments(i, cp.beta0$ci[i, 1], i, cp.beta0$ci[i, 2], lwd = 2,
col = ifelse(cp.beta0$true.in.ci[i] == 1, "gray70", "gray20"))
}
legend("topleft", bty = "n", c(expression("CI includes true"~beta[0]),
expression("CI does not include true"~beta[0])),
fill = c("gray70", "gray20"), cex = 1.5)
Step 9. Coefficient estimate \(\beta_1\) and its 95% confidence intervals for 100 randomly selected simulated samples.
par(mar = c(5, 6, .5, .5))
plot(ID[101:200], seq(.25, .8, length = 100), type = "n",
axes = FALSE, xlab = "", ylab = "")
title(xlab = expression("Randomly Selected 100 Simulated" ~ beta[1]), cex.lab = 1.5)
title(ylab = expression(hat(beta[1])), line = 3.75, cex.lab = 1.5)
box()
axis(1, at = ID[101:200], labels = ID[101:200], cex.axis = 1.25)
axis(2, at = seq(.25, .75, .05), cex.axis = 1.25, las = 2)
abline(h = b1, lwd = 2)
for (i in ID[101:200]){
points(i, par.est[i, 2], lwd = 2, col = ifelse(cp.beta1$true.in.ci[i] == 1,
"gray70", "gray20"), pch = 19)
segments(i, cp.beta1$ci[i, 1], i, cp.beta1$ci[i, 2], lwd = 2,
col = ifelse(cp.beta1$true.in.ci[i] == 1, "gray70", "gray20"))
}
legend("topleft", bty = "n", c(expression("CI includes true"~beta[1]),
expression("CI does not include true"~beta[1])),
fill = c("gray70", "gray20"), cex = 1.5)
Hands-on practices:
1. In the Estimator Performance setup, change the errors from the normal distribution rnorm(n) to the Student t distribution rt(n, df), and check whether the results depend on the error distribution. (A starting sketch follows this list.)
2. In the Estimator Performance setup, change the sample size and compare small-sample and large-sample results.
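One possible starting point for the first exercise, replacing the error draw inside the loop (df = 3 is an arbitrary choice to make the tails heavy):
errors = rt(n, df=3) # Student t errors with heavy tails
Y <- b0 + b1*X + errors # same DGP, non-normal errors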
Step 1. DGP
#set.seed(100484) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
par.est.ncv <- matrix(NA, nrow = reps, ncol = 4) # Empty matrix to store the
# estimates
sigma.est <- numeric(reps) # Empty vector to store sigma
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
gamma <- 1.5 # Heteroskedasticity parameter
for(i in 1:reps){ # Start the loop
Y <- b0 + b1*X + rnorm(n, 0, exp(X*gamma)) # The error standard deviation is
# now a function of X (heteroskedasticity)
model <- lm(Y ~ X) # Estimate OLS model
sigma.est[i] <- summary(model)$sigma # Store sigma
vcv <- vcov(model) # Variance-covariance matrix
par.est.ncv[i, 1] <- model$coef[1] # Put the estimate for the intercept
# in the first column
par.est.ncv[i, 2] <- model$coef[2] # Put the estimate for the coefficient on
# X in the second column
par.est.ncv[i, 3] <- sqrt(diag(vcv)[1]) # SE of the intercept
par.est.ncv[i, 4] <- sqrt(diag(vcv)[2]) # SE of the coefficient on X
} # End the loop
Step 2. Visualization
par(mar = c(5, 5.25, .5, .5))
plot(X, Y, ylim = c(-10, 10), axes = FALSE, xlab = "",
ylab = "", pch = 19)
title(xlab = "X", cex.lab = 1.5)
title(ylab = "Y", line = 3.75, cex.lab = 1.5)
box()
abline(lsfit(X, Y), lwd = 3)
axis(1, cex.axis = 1.25)
axis(2, at = seq(-10, 10, 2), cex.axis = 1.25, las = 2)
sigma <- mean(sigma.est)
#set.seed(100484) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
gamma <- 1.5 # Heteroskedasticity parameter
par.est.ncv <- matrix(NA, nrow = reps, ncol = 6) # Empty matrix to store the
# estimates
for(i in 1:reps){ # Start the loop
Y1 <- b0 + b1*X + rnorm(n, 0, exp(X*gamma)) # Y1: Heteroskedasticity
Y2 <- b0 + b1*X + rnorm(n, 0, sigma) # Y2: Homoskedasticity, same average
# sigma as Y1
model1 <- lm(Y1 ~ X) # LS model 1
model2 <- lm(Y2 ~ X) # LS model 2
par.est.ncv[i, 1] <- coef(summary(model1))[1,1] # Put the estimate for the intercept
# in the first column (model 1)
par.est.ncv[i, 2] <- coef(summary(model1))[2,1] # Put the estimate for the coefficient on
# X in the second column (model 1)
par.est.ncv[i, 3] <- coef(summary(model2))[1,1] # Put the estimate for the intercept
# in the first column (model 2)
par.est.ncv[i, 4] <- coef(summary(model2))[2,1] # Put the estimate for the coefficient on
# X in the second column (model 2)
par.est.ncv[i, 5] <- coef(summary(model1))[1,2] # SE of the intercept (model 1)
par.est.ncv[i, 6] <- coef(summary(model1))[2,2] # SE of the coefficient on X (model 1)
}
head(par.est.ncv)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.23325959 0.4922520 0.1904042 0.4018591 0.23325959 0.05774023
## [2,] 0.17908139 0.4914120 0.1417546 0.2245914 0.17908139 0.05561297
## [3,] 0.19593951 0.5339693 0.2148368 0.4837724 0.19593951 0.05190694
## [4,] 0.16225476 0.4766376 0.2524764 0.4471867 0.16225476 0.05597353
## [5,] 0.09017021 0.3048556 0.1969788 0.3443968 0.09017021 0.06151371
## [6,] 0.17860669 0.4241443 0.2430053 0.5191199 0.17860669 0.05756034
Distribution of \(\beta_0\)
par(mar = c(5, 5.25, .5, .5))
plot(density(par.est.ncv[ , 1]), lty = 2, ylim = c(0, 12), lwd = 3,
xlab = "", ylab = "", main = "", axes = FALSE)
lines(density(par.est.ncv[ , 3]), lwd = 3,col="red")
axis(1, cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression(hat(beta[0])), cex.lab = 1.5)
title(ylab = "Density", line = 3.75, cex.lab = 1.5)
abline(v = b0, lwd = 2)
text(.1, 8, substitute(paste("True ", beta[0], "=", v ),list(v=b0)), cex = 1.5)
box()
legend("topright", bty = "n", c("Homoskedastic","Heteroskedastic"), lty = c(1, 2), lwd = 3, cex = 1.5,col=c("red","black"))
Distribution of \(\beta_1\)
par(mar = c(5, 5.25, .5, .5))
plot(density(par.est.ncv[ , 2]), lty = 2, ylim = c(0, 6), lwd = 3,
xlab = "", ylab = "", main = "", axes = FALSE)
lines(density(par.est.ncv[ , 4]), lwd = 3,col="red")
axis(1, at = seq(0, 1, .1), cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression(hat(beta[1])), cex.lab = 1.5)
title(ylab = expression("Density"), line = 3.75, cex.lab = 1.5)
abline(v = b1, lwd = 2)
text(.25, 4, substitute(paste("True ", beta[1], "=", v ),list(v=b1)), cex = 1.5)
box()
legend("topright", bty = "n", c("Homoskedastic","Heteroskedastic"), lty = c(1, 2), lwd = 3, cex = 1.5,col=c("red","black"))
Coverage Probabilities
cp.beta0.ncv <- coverage(par.est.ncv[ , 1], par.est.ncv[ , 5], b0,
df = n - model1$rank)
cp.beta0.ncv$coverage.probability
## [1] 0.988
cp.beta0.ncv$mc.eb
## [1] 0.9812512 0.9947488
cp.beta1.ncv <- coverage(par.est.ncv[ , 2], par.est.ncv[ , 6], b1,
df = n - model1$rank)
cp.beta1.ncv$coverage.probability
## [1] 0.587
cp.beta1.ncv$mc.eb
## [1] 0.5564824 0.6175176
According to the above-mentioned coverage probabilities, the standard
error of \(\beta_0\) is unaffected by
heteroskedasticity, with a coverage probability of 0.988 and a 95% error
bound of [0.9812512,0.9947488]. However, the coverage probability of
0.587 for \(\beta_1\) with a 95% error
bound of [0.5564824,0.6175176] indicates that the OLS estimates of its
standard error are too small, on average. This is not surprising because
we created a DGP with non-constant error variance as a function of X.
The conventional standard error assumes constant variance, which does
not hold in this case. Thus, the conventional method for calculating
standard errors is not appropriate when heteroskedasticity is
present.
So far our simulation has shown that adding heteroskedasticity to the DGP, in the form of an error variance that increases with \(X\), does not bias our estimates of either \(\beta_0\) or \(\beta_1\). However, it does make the estimate of \(\beta_1\) less efficient, and that loss is not captured properly by the conventional method of computing the standard error of \(\beta_1\). You should explore other values of \(\gamma\) and other forms of heteroskedasticity to see whether this always happens.
This leads to the question of whether there is a solution to the
problem of getting the wrong standard error estimate for \(\beta_1\), and, if so, how we can use our
simulation methods to evaluate it. A common fix for heteroskedasticity is to estimate what are called “robust” standard errors. The term robust implies that the method can generate valid standard error estimates even in the face of a possible assumption violation, in this case heteroskedasticity. The robust variance–covariance matrix, which uses the model residuals to better approximate coefficient variability, can be estimated with the vcovHC() function in the sandwich package. We can rerun the simulation, substituting vcovHC() for vcov(), to determine whether robust standard errors actually “work” as intended. For simplicity, we modify the first heteroskedasticity simulation, because we do not need the homoskedastic DGP here.
library(sandwich)
#set.seed(100484) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
par.est.ncv <- matrix(NA, nrow = reps, ncol = 4) # Empty matrix to store the
# estimates
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
gamma <- 1.5 # Heteroskedasticity parameter
for(i in 1:reps){ # Start the loop
Y <- b0 + b1*X + rnorm(n, 0, exp(X*gamma)) # Now the error variance is a
# function of X plus random noise
model <- lm(Y ~ X) # Estimate OLS model
vcv <- vcovHC(model) # Robust variance-covariance matrix
par.est.ncv[i, 1] <- model$coef[1] # Put the estimate for the intercept
# in the first column
par.est.ncv[i, 2] <- model$coef[2] # Put the estimate for the coefficient on
# X in the second column
par.est.ncv[i, 3] <- sqrt(diag(vcv)[1]) # SE of the intercept
par.est.ncv[i, 4] <- sqrt(diag(vcv)[2]) # SE of the coefficient on X
} # End the loop
cp.beta1.ncv.robust <- coverage(par.est.ncv[ , 2], par.est.ncv[ , 4], b1,
df = n - model$rank)
cp.beta1.ncv.robust$coverage.probability
## [1] 0.947
cp.beta1.ncv.robust$mc.eb # Simulation error
## [1] 0.9331143 0.9608857
The coverage probability improves to 0.947, with simulation error bounds of [0.9331143, 0.9608857]. Because these bounds include 0.95, we conclude that robust standard errors do, in fact, account for coefficient variability under heteroskedasticity (as we have simulated it here).
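Note that vcovHC() uses the HC3 estimator by default; other variants can be requested through the type argument, e.g., the HC1 correction shown here for illustration:
sqrt(diag(sandwich::vcovHC(model, type="HC1"))) # HC1 robust standard errors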
We now show the impact of multicollinearity on the bias of parameter estimates.
DGP
library(mvtnorm)
#set.seed(121402) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slopes
b2 <- .75
n <- 1000 # Sample size
# Levels of multicollinearity
mc.level <- c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, .99)
# Matrix to store the absolute bias of the coefficient estimates
sd.betas <- NULL
for(j in 1:length(mc.level)){ # Start the j loop
X.corr <- matrix(c(1, mc.level[j], mc.level[j], 1), nrow = 2, ncol = 2)
X <- rmvnorm(n, mean = c(0, 0), sigma = X.corr) # Create two correlated
X1 <- X[ , 1] # independent variables
X2 <- X[ , 2]
par.est.mc=NULL
for(i in 1:reps){ # Start the i loop
Y <- b0 + b1*X1 + b2*X2 + rnorm(n, 0, 1) # The true DGP, with N(0, 1) error
model <- lm(Y ~ X1 + X2) # Estimate OLS model
par.est.mc=rbind(par.est.mc,cbind(coef(summary(model))[2,1]
,coef(summary(model))[3,1]
,coef(summary(model))[2,2]
,coef(summary(model))[3,2]))
} # End the i loop
sd.betas <- rbind(sd.betas, cbind(mean(abs(par.est.mc[ , 1] - b1)), mean(abs(par.est.mc[ , 1]))))
} # End the j loop
par(mar = c(5, 5.5, 2, .5))
plot(mc.level, sd.betas[ , 1], lwd = 3, ylim = c(0, .25), type = "b",
xlab = "", ylab = "", main = paste("n=",n), axes = FALSE)
axis(1, at = mc.level, cex.axis = 1)
axis(2, at = seq(0, .25, .05), cex.axis = 1.25, las = 2)
title(xlab = expression("Correlation between" ~ X[1]~ "and" ~ X[2]),
cex.lab = 1.5)
title(ylab = expression("Absolute Bias of "~hat(beta[1])), line = 3.75, cex.lab = 1.5)
box()
Hands-on practices:
1. Compare the results for n = 5000 and n = 1000.
2. Replace the multivariate normal with the multivariate t distribution (MVT::rmt) and check whether the effect of correlation on estimation bias depends on the distribution.
Hint: MVT::rmt(n, Sigma=X.corr)
set.seed(125842) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slopes
n <- 50 # Sample size
ac <- .75 # AR(1) parameter
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
library(zoo) # loaded once, outside the loop (dynlm builds on zoo)
par.est.sc=NULL
for(i in 1:reps){
# AR(1) process in the error term
Y <- b0 + b1*X + arima.sim(list(order = c(1, 0, 0), ar = ac), n = n)
dat <- ts(cbind(Y=Y, X=X), end=c(2024,6), freq=12)
model0 <- dynlm::dynlm(Y ~ X, data = dat) # Fit Y without lagged Y
modelL1 <- dynlm::dynlm(Y ~ X + L(Y,1), data = dat) # Fit Y with lagged Y
par.est.sc <- rbind(par.est.sc, cbind(model0$coef[2], modelL1$coef[2]))
}
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
##     as.Date, as.Date.numeric
par(mar = c(5, 5.25, .5, .5))
plot(density(par.est.sc[ , 1]), lty = 2, ylim = c(0, 3), lwd = 3, xlab = "",
ylab = "", main = "", axes = FALSE)
lines(density(par.est.sc[ , 2]), lwd = 3, lty = 1,col="blue")
axis(1, at = seq(-.5, 1.5, .5), cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression(hat(beta[1])), cex.lab = 1.5)
title(ylab = "Density", line = 3.75, cex.lab = 1.5)
abline(v = b1, lwd = 2)
text(0, 1.5, expression("True" ~ beta[1] ~ "= 0.50"), cex = 1.5)
box()
legend("topright", bty = "n", c(expression("With"~ Y[t-1]), expression("Without"~ Y[t-1])), lty = c(1, 2), lwd = 3, cex = 1.5,col=c("blue","black"))
In the previous example, we saw errors that were not i.i.d. across time, that is, serial correlation. In this example, we examine the consequences of another form of non-independence: clustering. This occurs when the errors among groups of observations are subject to group-wise correlation.
For example, patients are grouped in hospitals and students are
grouped in schools. If there is something about those patients
(students) that makes them similar to each other within hospitals
(schools) in some way even after considering a set of independent
variables, then clustering remains in the residuals. Clustered data also
arises when there are repeated measures from the same actor. For
example, analyses of the votes made by members of Congress or the
Supreme Court are clustered by member because each vote is an
observation and each member votes on many bills or cases. Similarly,
conventional panel data or time-series cross-section (TSCS) data can be thought of as clustered data, in which each cross-sectional unit (e.g., countries, cities, individual survey respondents) contributes several time-series observations.
A large literature shows that this data structure can have important
consequences for both coefficient estimates and standard errors of
statistical models. In particular, it can produce inefficiency, and
sometimes bias in coefficient estimates, and lead to estimates of
standard errors that are too small. We can illustrate these properties
through simulation. In this example, we simulate clustered data and then
compare the following strategies for handling it:
(1) naïve OLS (i.e., ignoring the problem),
(2) OLS with “fixed effects” (indicator variables for each group),
(3) adjusting the OLS standard errors, and
(4) multilevel modeling (MLM).
In the first part of the simulation, we compare standard OLS, OLS
with fixed effects, and MLM coefficient estimates. In the second part,
we compare OLS standard errors, a version of robust standard errors that
handles clustering, and MLM standard errors. We begin by generating
clustered data in the DGP.
We create an object called c.label that groups observations
into clusters. We set the sample size to 1,000 and the number of
clusters, nc, to 50. In other words, there are 1,000
individuals (e.g., patients, voters, or students) and 50 clusters
(hospitals, states, or schools). We set the object \(p\) to 0.50. This parameter, which can vary
between 0 and 1, sets the degree to which the data are clustered by
controlling the relative amounts of between-cluster variance and
within-cluster variance (in this case, the two are equal, see Harden,
2011).
We use the rmvnorm()
function to generate two
individual-level variables (i.e., variables that are unique to each
observation) labeled effect1 and effect2, and two cluster-level
variables (variables that are the same within clusters but vary across
clusters) labeled effect3 and effect4. The covariances between these
pairs of variables are set to 0, the variances of effect1 and effect3
are set to 1, and the variances of effect2 and effect4 are set to 1 − \(p\) and \(p\), respectively. We then create an independent
variable, \(X\), to use in our models
as the sum of effect1 and effect3.
Next, we create an error
term, error, to use in our models as the sum of effect2 and effect4.
This produces an error term in which half of the variance comes from the
cluster level and half from the individual level. By construction, the
error term is uncorrelated with the independent variable.
We use our independent variable and our error term to generate the
dependent variable as before. Then, we estimate the model where \(Y\) is a linear function of \(X\) three different ways, using
(1) OLS,
(2) OLS with fixed effects, and
(3) MLM.
We store the coefficient estimates from each of those three estimators and their standard errors. We also estimate robust cluster standard errors (RCSE) for the OLS coefficients. We will explain more about what RCSEs are below. There is a lot going on in this code, but most of it you have seen before. Take your time working through it to make sure you can see what is happening.
Function to compute robust cluster standard errors (Arai, 2011)
rcse <- function(model, cluster){
require(sandwich)
M <- length(unique(cluster)) # number of clusters
N <- length(cluster) # number of observations
K <- model$rank # number of estimated parameters
dfc <- (M/(M - 1)) * ((N - 1)/(N - K)) # finite-sample degrees-of-freedom correction
uj <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum)) # score sums by cluster
rcse.cov <- dfc * sandwich(model, meat = crossprod(uj)/N) # clustered sandwich estimator
return(rcse.cov)
}
DGP
#set.seed(28704) # Set the seed for reproducible results
reps <- 1000 # Set the number of repetitions at the top of the script
par.est.cluster <- matrix(NA, nrow = reps, ncol = 3) # Empty matrix to store
# the estimates
se.est.cluster <- matrix(NA, nrow = reps, ncol = 4) # Empty matrix to store
# the standard errors
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
p <- .5 # Rho
nc <- 50 # Number of clusters
c.label <- rep(1:nc, each = n/nc) # Cluster label
i.sigma <- matrix(c(1, 0, 0, 1 - p), ncol = 2) # Level 1 effects
c.sigma <- matrix(c(1, 0, 0, p), ncol = 2) # Level 2 effects
for(i in 1:reps){ # Start the loop
i.values <- mvtnorm::rmvnorm(n = n, sigma = i.sigma)
effect1 <- i.values[ , 1]
effect2 <- i.values[ , 2]
c.values <- mvtnorm::rmvnorm(n = nc, sigma = c.sigma) # cross-sections
effect3 <- rep(c.values[ , 1], each = n/nc)
effect4 <- rep(c.values[ , 2], each = n/nc)
X <- effect1 + effect3 # X values unique to level 1 observations
error <- effect2 + effect4
Y <- b0 + b1*X + error # True model
model.ols <- lm(Y ~ X) # Model estimation
model.fe <- lm(Y ~ X + factor(c.label))
model.mlm <- lme4::lmer(Y ~ X + (1|c.label))
par.est.cluster[i, 1] <- model.ols$coef[2] # Coefficients
par.est.cluster[i, 2] <- model.fe$coef[2]
par.est.cluster[i, 3] <- lme4::fixef(model.mlm)[2]
vcv.ols <- vcov(model.ols) # Variance-covariance matrices
vcv.rcse <- rcse(model.ols, c.label)
se.est.cluster[i, 1] <- sqrt(diag(vcv.ols)[2]) # Standard errors
se.est.cluster[i, 2] <- sqrt(diag(vcv.rcse)[2])
se.est.cluster[i, 3] <- coef(summary(model.fe))[2,2]
se.est.cluster[i, 4] <- coef(summary(model.mlm))[2,2]
} # End the loop
Visual display of estimators’ bias
par(mar = c(5, 5.25, .5, .5))
plot(density(par.est.cluster[ , 1]), lty = 1, xlim = c(.2, .8),
ylim = c(0, 20), lwd = 3, xlab = "", ylab = "", main = "", axes = FALSE)
lines(density(par.est.cluster[ , 2]), lwd = 3, lty = 2,col="grey")
lines(density(par.est.cluster[ , 3]), lwd = 3, lty = 3,col="red")
axis(1, at = seq(0, 1, .1), cex.axis = 1.25)
axis(2, cex.axis = 1.25, las = 2)
title(xlab = expression(hat(beta[1])), cex.lab = 1.5)
title(ylab = "Density", line = 3.75, cex.lab = 1.5)
abline(v = b1, lwd = 2)
text(.7, 7, substitute(paste("True "~beta[1]~"=", v ),list(v=b1)), cex = 1.5)
box()
legend("topright", bty = "n", c("OLS", "OLS with FE","MLM"), lty = c(1, 2, 3),
lwd = 3, cex = 1.5,col=c("black","grey","red"))
Compute coverage probability
# Coverage probabilities
ols.cp <- coverage(par.est.cluster[ , 1], se.est.cluster[ , 1], b1,
df = n - model.ols$rank)
rcse.cp <- coverage(par.est.cluster[ , 1], se.est.cluster[ , 2], b1,
df = n - model.ols$rank)
fe.cp <- coverage(par.est.cluster[ , 2], se.est.cluster[ , 3], b1,
df = n - model.fe$rank)
mlm.cp <- coverage(par.est.cluster[ , 3], se.est.cluster[ , 4], b1,
df = n - length(lme4::fixef(model.mlm)))
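Before plotting, the four coverage probabilities can be compared directly:
c(OLS=ols.cp$coverage.probability, RCSE=rcse.cp$coverage.probability,
FE=fe.cp$coverage.probability, MLM=mlm.cp$coverage.probability)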
Visual display of coverage probabilities of 4 estimators
par(mar = c(5, 5.25, .5, .5))
plot(1, ols.cp$coverage.probability, pch = 19, xlim = c(0, 8),
ylim = c(.5, 1), lwd = 3, xlab = "", ylab = "", main = "", axes = FALSE)
segments(1, ols.cp$mc.eb[1], 1, ols.cp$mc.eb[2], lwd = 2)
points(3, rcse.cp$coverage.probability, pch = 1, lwd = 3)
segments(3, rcse.cp$mc.eb[1], 3, rcse.cp$mc.eb[2], lwd = 2)
points(5, fe.cp$coverage.probability, pch = 19, lwd = 3)
segments(5, fe.cp$mc.eb[1], 5, fe.cp$mc.eb[2], lwd = 2)
points(7, mlm.cp$coverage.probability, pch = 19, lwd = 3)
segments(7, mlm.cp$mc.eb[1], 7, mlm.cp$mc.eb[2], lwd = 2)
axis(1, at = c(1, 3, 5, 7), labels = c("OLS", "RCSE","OLS with FE", "MLM"), cex.axis = 1.25)
axis(2, at = seq(.5, 1, .05), cex.axis = 1.25, las = 2)
title(xlab = "Estimator", cex.lab = 1.5)
title(ylab = "Coverage Probability", line = 3.75, cex.lab = 1.5)
abline(h = .95, lwd = 2, lty = 2)
box()
Method 1. Fit an ARIMA model to the observed return series, then simulate from the fitted model.
output1=forecast::auto.arima(ret,ic=c("aicc", "aic", "bic")[3])
y1=simulate(output1, nsim=nrow(ret))
forecast::auto.arima(y1,ic=c("aicc", "aic", "bic")[3])
## Series: y1
## ARIMA(0,0,1) with zero mean
##
## Coefficients:
## ma1
## -0.1239
## s.e. 0.0152
##
## sigma^2 = 3.102: log likelihood = -8766.83
## AIC=17537.66 AICc=17537.66 BIC=17550.44
output1
## Series: ret
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## -0.1175 0.0687
## s.e. 0.0154 0.0233
##
## sigma^2 = 3.09: log likelihood = -8757.71
## AIC=17521.41 AICc=17521.42 BIC=17540.59
Method 2. Simulate directly from a fully specified ARIMA process.
y2=arima.sim(n=4000,list(order=c(2,0,1),ar=c(0.147,-0.258),ma=c(-0.345)))
forecast::auto.arima(y2,ic=c("aicc", "aic", "bic")[3])
## Series: y2
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 0.0925 -0.2608 -0.3061
## s.e. 0.0423 0.0179 0.0431
##
## sigma^2 = 1.013: log likelihood = -5700.39
## AIC=11408.77 AICc=11408.78 BIC=11433.95
Hands-on practices:
1. Use Method 2 to generate a non-stationary I(1) time series and plot it to see the effect.
2. Building on the above, generate a seasonally non-stationary I(1) time series and plot it.
3. Consult the help pages and use Method 2 to generate a time series with a seasonal ARMA structure, then plot it.
Simulating a Logit Model
# Logit probability function
inv.logit <- function(p){
return(exp(p)/(1 + exp(p)))
}
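Base R's plogis() computes the same inverse-logit transform and is numerically stabler for large |p|; a quick check:
all.equal(inv.logit(2), plogis(2)) # TRUE: plogis() is the built-in inverse logit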
# DGP of binomial process
reps <- 1000 # Set the number of repetitions at the top of the script
b0 <- .2 # True value for the intercept
b1 <- .5 # True value for the slope
n <- 1000 # Sample size
X <- runif(n, -1, 1) # Create a sample of n observations on the
# independent variable X
par.est.logit=NULL
for(i in 1:reps){ # Start the loop
Y <- rbinom(n, 1, inv.logit(b0 + b1*X)) # The true DGP, Bernoulli trials
model <- glm(Y ~ X, family = binomial (link = logit)) # Estimate logit model
par.est.logit=rbind(par.est.logit, cbind(coef(summary(model))[1,1],
coef(summary(model))[2,1],
coef(summary(model))[1,2],
coef(summary(model))[2,2]))
} # End the loop
mean(par.est.logit[ , 1]) # Mean of intercept estimates
## [1] 0.1997882
mean(par.est.logit[ , 2]) # Mean of coefficient on X estimates
## [1] 0.4986062
Mean of intercept estimates is 0.1997882
Mean of coefficient on X estimates is 0.4986062
Compute Coverage Probability
cp.beta0.logit <- coverage(par.est.logit[ , 1], par.est.logit[ , 3], b0,
df = n - model$rank)
cp.beta1.logit <- coverage(par.est.logit[ , 2], par.est.logit[ , 4], b1,
df = n - model$rank)
# Coverage probability for the intercept
cp.beta0.logit$coverage.probability
## [1] 0.953
# Coverage probability for the coefficient on X
cp.beta1.logit$coverage.probability
## [1] 0.954
Coverage probability for the intercept is 0.953
Coverage probability for the coefficient on X is 0.954
library(pglm)
## Loading required package: maxLik
## Loading required package: miscTools
##
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
##
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
## Loading required package: plm
Regression in panel data with serial correlation
simPGLM.sc <- function(t=50,N=200,ar1=0.5,type=c("logit","probit")[1])
{
if (ar1==0) {rsd.ar= rep(rnorm(t),times=N)
} else {
rsd.ar= rep(arima.sim(n=t,list(ar=ar1)),times=N)
#rsd.ar= rep(filter(rnorm(t),filter=ar1,method="recursive"),times=N)
}
# Generate level-1 variables
x1 = rep(runif(t,-1,1),times=N)
x2 = rep(runif(t,-1,1),times=N)
b1 = 1 # coefficient for x1
b2 = -2 # coefficient for x2
# Generate random intercepts
RE <- runif(N)
z= RE[rep(1:N, each = t)]
# Generate dependent variable
y0 <- z + b1 * x1 + b2 * x2 + rsd.ar
if (type=="logit") {Y=rbinom(N*t,1, exp(y0)/(1+exp(y0)))
} else if (type=="probit") {Y=rbinom(N*t,1, pnorm(y0))}
DGP.DF <- data.frame(ID = rep(1:N, each = t),# Generate N ID
year = rep(1:t, N), # Generate t ID
z = z,
x1 = x1,
x2 = x2,
y=Y)
return(DGP.DF)
}
myData=simPGLM.sc(ar1=0, type=c("logit","probit")[2])#;head(myData)
FEByLSDV=glm(y ~ x1 + x2+factor(ID),family=binomial("logit"),data=myData)
fitByLogit=pglm(y ~ x1 + x2,family=binomial("logit"),data=myData, index="ID")
fitByProbit=pglm(y ~ x1 + x2,family=binomial("probit"),data=myData, index="ID")
coef(summary(FEByLSDV))[1:3,]
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3286794 0.37860633 -0.8681298 3.853233e-01
## x1 1.6884910 0.05229930 32.2851538 1.130284e-228
## x2 -3.0029094 0.05928341 -50.6534480 0.000000e+00
coef(summary(fitByLogit))
## Estimate Std. error t value Pr(> t)
## (Intercept) 0.4554853 0.03638902 12.517108 6.018868e-36
## x1 1.6468782 0.05152330 31.963757 3.479219e-224
## x2 -2.9242534 0.05818062 -50.261639 0.000000e+00
## sigma -0.3356460 0.03862548 -8.689756 3.632178e-18
coef(summary(fitByProbit))
## Estimate Std. error t value Pr(> t)
## (Intercept) 0.2769019 0.02124464 13.03396 7.842610e-39
## x1 0.8994237 0.02717752 33.09440 3.577298e-240
## x2 -1.6813825 0.03015636 -55.75548 0.000000e+00
## sigma -0.1993567 0.02200026 -9.06156 1.285983e-19
Hands-on practices:
1. Please modify the program to generate a linear panel DGP.
2. Please estimate the model by
2. Please estimate the model by
glmmTMB::glmmTMB and lme4::glmer, and compare the results.
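One possible starting point for the second exercise; the random-intercept specification below is an assumption about the intended model:
fitByGlmer <- lme4::glmer(y ~ x1 + x2 + (1 | ID), family=binomial("logit"), data=myData)
fitByTMB <- glmmTMB::glmmTMB(y ~ x1 + x2 + (1 | ID), family=binomial("logit"), data=myData)
summary(fitByGlmer) # compare fixed-effect estimates across the two fitters
summary(fitByTMB)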
Arai, M. (2011). Cluster-robust standard errors using R. Retrieved from http://people.su.se/~ma/clustering.pdf