rm(list = ls())
library(LalRUtils)
libreq(lfe, rio, MASS, data.table, tidyverse, magrittr, patchwork, parallel,
furrr)
theme_set(lal_plot_theme())
set.seed(42)
root = '~/ddr/tut/ARE212'
setwd(root)
setwd(file.path(root, 'Section01/'))
list.files()
auto = fread('auto.csv')
auto %>% glimpse
tail(auto)
auto[, .(price, mpg, weight, length)] %>% head
auto[order(price, -weight)] %>% head
auto[, .(mean(price), sd(price))]
auto[, .(means = lapply(.SD, mean), sds = lapply(.SD, sd)), .SDcols = c('price', 'mpg', 'weight')]
# The histogram function
with(auto, hist(
# The variable for the histogram
x = mpg,
# The main title
main = "Distribution of fuel economy",
# The x-axis label
xlab = "MPG (miles per gallon)")
)
# Add a blue vertical line at the median MPG (lwd is line width)
abline(v = median(auto$mpg), col = "blue", lwd = 3)
ggplot(auto, aes(mpg)) + geom_histogram() + geom_vline(xintercept = median(auto$mpg), col = "blue") +
labs(title= "Distribution of fuel economy", x = "MPG (miles per gallon)")
v1 = 1:5
v2 = c(1,2,3,4,5)
all.equal(v1, v2)
A <- matrix(data = 1:6, ncol = 2)
Z = matrix(c(
c(1, 3, 4),
c(6, 7, 2),
1:3),
3, byrow = T
)
t(Z) # transpose
Z %*% A
solve(Z)
crossprod(v1, v2)
eigen(Z)
The QR decomposition factors $Z = QR$, where $Q$ is an orthogonal matrix ($Q'Q = QQ' = I$, so $Q' = Q^{-1}$) and $R$ is upper-triangular.
qrd = qr(Z)
qr.Q(qrd)
qr.R(qrd)
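A quick check of the claims above, a minimal sketch using the objects already defined: $Q'Q$ should be the identity matrix and $QR$ should reconstruct $Z$ (up to floating-point error).
round(crossprod(qr.Q(qrd)), 10) # Q'Q = I (rounded to hide tiny float noise)
all.equal(qr.Q(qrd) %*% qr.R(qrd), Z) # Q %*% R recovers Z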
cars = import(file.path(root, 'Section03/auto.dta'))
b_ols = function(df, y, X){
# Convert to data.table
d = setDT(df)
# Extract the outcome vector
y_d = d[[y]]
# Build the regressor matrix with a column of ones for the intercept
X_d = cbind(1, as.matrix(d[, ..X]))
# Solve the normal equations: (X'X)^(-1) X'y
β_hat = solve(crossprod(X_d, X_d)) %*% crossprod(X_d, y_d)
β_hat
}
b_ols(cars, 'price', c('mpg', 'weight'))
lm(price ~ mpg + weight, cars)
target_vars <- c("price", "headroom", "trunk", "length", "turn")
results_list <- lapply(
X = target_vars,
FUN = function(i) b_ols(df = cars, y = i, X = c("mpg", "weight"))
)
results_list
# A function to generate a sample and calculate the bias of the OLS estimates
data_baker <- function(sample_n, true_beta) {
# First generate x from N(0,1)
x <- rnorm(sample_n)
# Now the error from N(0,1)
e <- rnorm(sample_n)
# Now combine true_beta, x, and e to get y
y <- true_beta[1] + true_beta[2] * x + e
# Define the data matrix of independent vars.
X <- cbind(1, x)
# Force y to be a matrix
y <- matrix(y, ncol = 1)
# Calculate the OLS estimates
b_ols <- solve(t(X) %*% X) %*% t(X) %*% y
# Convert b_ols to vector
b_ols <- b_ols %>% as.vector()
# Calculate bias, force to 2x1 data.frame()
the_bias <- (true_beta - b_ols) %>%
matrix(ncol = 2) %>% data.frame()
# Set names
names(the_bias) <- c("bias_intercept", "bias_x")
# Return the bias
return(the_bias)
}
# Set seed
set.seed(12345)
# Run once
data_baker(sample_n = 100, true_beta = c(1, 3))
# A function to run the simulation
bias_simulator <- function(n_sims, sample_n, true_beta) {
# Run data_baker() n_sims times with given parameters
sims_dt <- lapply(
X = 1:n_sims,
FUN = function(i) data_baker(sample_n, true_beta)) %>%
# Bind the rows together to output a nice data.frame
bind_rows()
# Return sim_dt
return(sims_dt)
}
# Set seed
set.seed(12345)
# Run it
sim_dt <- bias_simulator(n_sims = 1e4, sample_n = 100, true_beta = c(1,3))
# Check the results with a histogram
hist(sim_dt[,2],
breaks = 30,
main = "Is OLS unbiased?",
xlab = "Bias")
# Emphasize the zero line
abline(v = 0, col = "blue", lwd = 3)
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
# Require the 'dplyr' package
require(dplyr)
# Create the y matrix
y <- data %>%
# Select y variable data from 'data'
select({{y_var}}) %>%
# Convert y_data to matrices
as.matrix()
# Create the X matrix
X <- data %>%
# Select X variable data from 'data'
select({{X_vars}})
# If 'intercept' is TRUE, then add a column of ones
# and move the column of ones to the front of the matrix
if (intercept == T) {
# Bind on a column of ones
X <- cbind(1, X)
# Name the column of ones
names(X) <- c("ones", X_vars)
}
# Convert X_data to a matrix
X <- as.matrix(X)
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# If 'intercept' is TRUE:
# change the name of 'ones' to 'intercept'
if (intercept == T) rownames(beta_hat) <- c("intercept", X_vars)
# Return beta_hat
return(beta_hat)
}
n = 100
the_data <- tibble(
x = rnorm(n),
e = rnorm(n))
# Calculate y = 3 + 1.5 x + e
the_data <- mutate(the_data, y = 3 + 1.5 * x + e)
# Plot to make sure things are going well.
plot(
# The variables for the plot
x = the_data$x, y = the_data$y,
# Labels and title
xlab = "x", ylab = "y", main = "Our generated data")
# Run b_ols with an intercept
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
felm(y ~ x, data = the_data)
# The estimates
b_w <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_wo <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)
# Plot the points
plot(
# The variables for the plot
x = the_data$x, y = the_data$y,
# Labels and title
xlab = "x", ylab = "y", main = "Our generated data")
# Plot the line from the 'with intercept' regression in light blue
abline(a = b_w[1], b = b_w[2], col = "lightblue", lwd = 3)
# Plot the line from the 'without intercept' regression in purple
abline(a = 0, b = b_wo[1], col = "purple4", lwd = 2)
# Add a legend
legend(x = min(the_data$x), y = max(the_data$y),
legend = c("with intercept", "w/o intercept"),
# Line widths
lwd = c(3, 2),
# Colors
col = c("lightblue", "purple4"),
# No box around the legend
bty = "n")
For an arbitrary regression
$$ \mathbf{y}=\mathbf{X}_{1} \beta_{1}+\mathbf{X}_{2} \beta_{2}+\varepsilon $$
FWL states that we can estimate $\beta_1$ with $b_1$ using
$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$
Now, define $M_2$ as the annihilator matrix, and define $X_1^* := M_2 X_1$ and $y^* := M_2 y$. These are the data matrices residualised on $X_2$. Then, we can also write
$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{* \prime} \mathbf{X}_{1}^{*}\right)^{-1} \mathbf{X}_{1}^{* \prime} \mathbf{y}^{*} $$
to_matrix <- function(the_df, vars) the_df %>% dplyr::select({{vars}}) %>% as.matrix()
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
# Create the y matrix
y <- to_matrix(the_df = data, vars = y_var)
X <- to_matrix(the_df = data, vars = X_vars)
# If 'intercept' is TRUE, then add a column of ones
if (intercept == T) {
# Bind a column of ones to X
X <- cbind(1, X)
# Name the new column "intercept"
colnames(X) <- c("intercept", X_vars)
}
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# Return beta_hat
return(beta_hat)
}
b_ols(the_data, y_var = "y", X_vars = "x", intercept = T)
resid_ols <- function(data, y_var, X_vars, intercept = TRUE) {
y <- to_matrix(the_df = data, vars = y_var)
X <- to_matrix(the_df = data, vars = X_vars)
if (intercept == T) {
X <- cbind(1, X)
colnames(X) <- c("intercept", X_vars)
}
n <- nrow(X)
resids <- (diag(n) - X %*% solve(t(X) %*% X) %*% t(X)) %*% y
return(resids)
}
We now apply FWL step by step to estimate the following regression:
$$ y = X_1 \beta_1 + X_2 \beta_2 + \epsilon $$
# Steps 1 and 2: Residualize 'price' on 'weight' and an intercept
e_yx <- resid_ols(data = cars, y_var = "price",
X_vars = "weight", intercept = T)
# Steps 3 and 4: Residualize 'mpg' on 'weight' and an intercept
e_xx <- resid_ols(data = cars, y_var = "mpg",
X_vars = "weight", intercept = T)
# Combine the two sets of residuals into a data.frame
e_df <- data.frame(e_yx = e_yx[,1], e_xx = e_xx[,1])
# Step 5: Regress e_yx on e_xx without an intercept
b_ols(data = e_df, y_var = "e_yx",
X_vars = "e_xx", intercept = F)
b_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
We can recover the OLS estimate by estimating the univariate regression and differencing away a normalizing term. This normalizing term is 0 if $X_1$ and $X_2$ are uncorrelated / orthogonal.
library(MASS)
v_cov <- matrix(data = c(1, 0.95, 0.95, 1), nrow = 2)
# Create the means vector
means <- c(5, 10)
# Define our sample size
n <- 1e5
# Set the seed
set.seed(12345)
# Generate x1 and x2
X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T)
# Create a tibble for our data, add generate error from N(0,1)
the_data <- as.data.frame(X) %>% mutate(e = rnorm(n))
# Set the names
names(the_data) <- c("x1", "x2", "e")
# The data-generating process
the_data <- the_data %>% mutate(y = 1 + 2 * x1 + 3 * x2 + e)
# Regression 1: y on int, x1, and x2
b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T)
# Regression 2: y on int and x1
b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T)
# Regression 3: y on int and x2
b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T)
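As a check of the first FWL formula above, here is a minimal sketch (the names X1, X2, y, b_full, b2 are hypothetical; it reuses to_matrix() and b_ols() defined earlier): the short-regression coefficients minus the normalizing term should reproduce the intercept and x1 coefficient from the full regression.
# Hypothetical names; assumes 'the_data' from the block above
X1 <- cbind(1, to_matrix(the_data, "x1")) # intercept and x1
X2 <- to_matrix(the_data, "x2")
y <- to_matrix(the_data, "y")
# Full-regression coefficients; b2 is the coefficient on x2
b_full <- b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T)
b2 <- b_full[3]
# b_1 = (X1'X1)^(-1) X1' y - (X1'X1)^(-1) X1' X2 b_2
solve(t(X1) %*% X1) %*% t(X1) %*% y - solve(t(X1) %*% X1) %*% t(X1) %*% X2 * b2
# Should match the intercept and x1 coefficient from the full regression
b_full[1:2]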
To compute the (centered) $R^2$, use the demeaning matrix
$$ \mathbf{A}=\mathbf{I}_{n}-\left(\frac{1}{n}\right) \mathbf{i} \mathbf{i}^{\prime} $$
# Function that returns the N-by-N demeaning matrix A (premultiplying by A demeans the columns)
demeaner <- function(N) {
# Create an N-by-1 column of 1s
i <- matrix(data = 1, nrow = N)
# Create the demeaning matrix
A <- diag(N) - (1/N) * i %*% t(i)
# Return A
return(A)
}
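A one-line check (with a hypothetical vector v): premultiplying by demeaner(N) is the same as subtracting the mean.
v <- rnorm(5) # hypothetical example vector
all.equal(as.vector(demeaner(5) %*% v), v - mean(v))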
# Start the function
r2_ols <- function(data, y_var, X_vars) {
# Create y and X matrices
y <- to_matrix(data, vars = y_var)
X <- to_matrix(data, vars = X_vars)
# Add intercept column to X
X <- cbind(1, X)
# Find N and K (dimensions of X)
N <- nrow(X)
K <- ncol(X)
# Calculate the OLS residuals
e <- resid_ols(data, y_var, X_vars)
# Calculate the y_star (demeaned y)
y_star <- demeaner(N) %*% y
# Calculate r-squared values
r2_uc <- 1 - t(e) %*% e / (t(y) %*% y)
r2 <- 1 - t(e) %*% e / (t(y_star) %*% y_star)
r2_adj <- 1 - (N-1) / (N-K) * (1 - r2)
# Return a vector of the r-squared measures
return(c("r2_uc" = r2_uc, "r2" = r2, "r2_adj" = r2_adj))
}
# Our r-squared function
r2_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
felm(price ~ mpg + weight, cars) %>% summary()
# Find the number of observations in cars
n <- nrow(cars)
# Fill an n-by-50 matrix with random numbers from N(0,1)
set.seed(12345)
rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50)
# Convert rand_mat to tbl_df
rand_df <- as.data.frame(rand_mat)
# Change names to x1 to x50
names(rand_df) <- paste0("x", 1:50)
# Bind cars and rand_df; convert to tbl_df; call it 'cars_rand'
cars_rand <- cbind(cars, rand_df) %>% tbl_df()
# Drop the variable 'rep78' (it has NAs)
cars_rand <- dplyr::select(cars_rand, -rep78, -make)
result_df <- lapply(
X = 2:ncol(cars_rand),
FUN = function(k) {
# Apply the 'r2_ols' function to columns 2:k
r2_df <- r2_ols(
data = cars_rand,
y_var = "price",
X_vars = names(cars_rand)[2:k]) %>%
matrix(ncol = 3) %>% as.data.frame()
# Set names
names(r2_df) <- c("r2_uc", "r2", "r2_adj")
# Add a column for k
r2_df$k <- k
# Return r2_df
return(r2_df)
}) %>% bind_rows()
# Plot unadjusted r-squared
plot(x = result_df$k, y = result_df$r2,
type = "l", lwd = 3, col = "lightblue",
xlab = expression(paste("k: number of columns in ", bold("X"))),
ylab = expression('R' ^ 2))
# Plot adjusted r-squared
lines(x = result_df$k, y = result_df$r2_adj,
type = "l", lwd = 2, col = "purple4")
# The legend
legend(x = 0, y = max(result_df$r2),
legend = c(expression('R' ^ 2),
expression('Adjusted R' ^ 2)),
# Line widths
lwd = c(3, 2),
# Colors
col = c("lightblue", "purple4"),
# No box around the legend
bty = "n")
x <- data.frame(a = c(1:4, NA), b = c("..", 1, 2, 3, "..")) %>% as_tibble()
x
x[x == ".."] <- NA
x
dir_data = file.path(root, 'section05')
cars <- file.path(dir_data, "auto.csv") %>% read_csv()
t_stat <- function(data, y_var, X_vars, gamma, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept if requested
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(data, y_var, X_vars, intercept)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Force s2 to numeric
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - gamma) / se
# Return the _t_ statistics
return(t_stats)
}
t_stat(cars,
y_var = "price", X_vars = c("mpg", "weight"),
gamma = 0, intercept = T)
felm(price ~ mpg + weight, cars) %>%
summary() %$% (coefficients)[,3]
The %$% operator is magrittr's exposition pipe: it exposes the columns of the left-hand side to the expression on the right. For example:
cars %$% cor(price, weight)
# The default: lower.tail = TRUE
pt(q = 2, df = 15)
# Setting lower.tail = TRUE
pt(q = 2, df = 15, lower.tail = T)
# Setting lower.tail = FALSE
pt(q = 2, df = 15, lower.tail = F)
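Connecting this to the two-sided p-values used in the ols() function below: the p-value for a t statistic of 2 with 15 degrees of freedom is twice the upper-tail probability.
2 * pt(q = 2, df = 15, lower.tail = F)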
ols <- function(data, y_var, X_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept if requested
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(data, y_var, X_vars, intercept)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Update s2 to numeric
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - 0) / se
# Calculate the p-values
p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
# Nice table (data.frame) of results
results <- data.frame(
# The rows have the coef. names
effect = rownames(b),
# Estimated coefficients
coef = as.vector(b) %>% round(3),
# Standard errors
std_error = as.vector(se) %>% round(3),
# t statistics
t_stat = as.vector(t_stats) %>% round(3),
# p-values
p_value = as.vector(p_values) %>% round(4)
)
# Return the results
return(results)
}
ols(data = cars,
y_var = "price",
X_vars = c("mpg", "weight"),
intercept = T)
felm(price ~ mpg + weight, cars) %>%
summary() %$% coefficients
joint_test <- function(data, y_var, X_vars) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
X <- cbind(1, X)
# Name the new column "intercept"
colnames(X) <- c("intercept", X_vars)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# J is k-1
J <- k - 1
# Create the R matrix: bind a column of zeros
# onto a J-by-J identity matrix
R <- cbind(0, diag(J))
# Estimate coefficients
b <- b_ols(data, y_var, X_vars)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Force s2 to numeric
s2 %<>% as.numeric()
# Create the inner matrix R(X'X)^(-1)R'
RXXR <- R %*% solve(t(X) %*% X) %*% t(R)
# Calculate the F stat
f_stat <- t(R %*% b) %*% solve(RXXR) %*% (R %*% b) / J / s2
# Calculate the p-value
p_value <- pf(q = f_stat, df1 = J, df2 = n-k, lower.tail = F)
# Create a data.frame of the f stat. and p-value
results <- data.frame(
f_stat = f_stat %>% as.vector(),
p_value = p_value %>% as.vector())
return(results)
}
joint_test(data = cars,
y_var = "price", X_vars = c("mpg", "weight"))
felm(price ~ mpg + weight, cars) %>%
summary() %$% F.fstat
# Function to generate data
gen_data <- function(sample_size) {
# Create data.frame with random x and error
data_df <- data.frame(
x = rnorm(sample_size),
e = rnorm(sample_size))
# Calculate y = 7 + 0.5 x + e; drop 'e'
data_df %<>% mutate(y = 7 + 0.5 * x + e) %>%
select(-e)
# Return data_df
return(data_df)
}
one_sim <- function(sample_size) {
# Estimate via OLS
ols_est <- ols(data = gen_data(sample_size),
y_var = "y", X_vars = "x")
# Grab the estimated coefficient on x
# (the second element of 'coef')
b1 <- ols_est %$% coef[2]
# Grab the second p-value
# (the first p-value is for the intercept)
p_value <- ols_est %$% p_value[2]
# Return a data.frame with b1 and p_value
return(data.frame(b1, p_value))
}
ols_sim <- function(n_sims, sample_size, seed = 12345) {
# Set the seed
set.seed(seed)
# Run one_sim n_sims times; convert results to data.frame
sim_df <- replicate(
n = n_sims,
expr = one_sim(sample_size),
simplify = F
) %>% bind_rows()
# Return sim_df
return(sim_df)
}
# Run ols_sim for sample size of 10
sim10 <- ols_sim(n_sims = 1e3, sample_size = 10)
# Run ols_sim for sample size of 100
sim100 <- ols_sim(n_sims = 1e3, sample_size = 100)
p1 = ggplot(data = sim10, aes(x = p_value)) +
stat_density(fill = "grey20", alpha = 0.9) +
geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
theme_bw() +
xlab(expression(paste(italic("p"), "-Value"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", italic(p),
"-Values from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = p_value)) +
stat_density(fill = "grey20", alpha = 0.9) +
geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
xlim(0, 0.1) +
theme_bw() +
xlab(expression(paste(italic("p"), "-Value"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", italic(p),
"-Values from 1,000 simulations with sample size 100")))
(p1 / p2)
p1 = ggplot(data = sim10, aes(x = b1)) +
stat_density(fill = "grey70") +
geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) +
xlab(expression(paste(beta[1], " estimate"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", beta[1],
" estimates from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = b1)) +
stat_density(fill = "grey70") +
geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) +
xlab(expression(paste(beta[1], " estimate"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", beta[1],
" estimates from 1,000 simulations with sample size 100")))
(p1 / p2)
ols_sim_mc <- function(n_sims, sample_size, seed = 12345) {
# Require the parallel package
require(parallel)
# Set the seed
set.seed(seed)
# Run one_sim n_sims times; convert results to data.frame
sim_df <- mclapply(
X = rep(x = sample_size, times = n_sims),
FUN = one_sim,
# Use all but two of the available cores
mc.cores = detectCores() - 2
) %>% bind_rows()
# Return sim_df
return(sim_df)
}
Timing comparison of the non-parallelised and parallelised versions (the start/stop timing code, whose differences were stop1 - start1, ..., stop4 - start4, is omitted here).
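A minimal timing sketch (the parameter values are illustrative) comparing the serial and parallelised simulation functions defined above:
system.time(ols_sim(n_sims = 1e3, sample_size = 100)) # non-parallelised
system.time(ols_sim_mc(n_sims = 1e3, sample_size = 100)) # parallelised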
libreq(dplyr, magrittr, ggplot2, ggthemes)
# Reformat Anscombe's dataset
a_df <- bind_rows(
dplyr::select(anscombe, x = x1, y = y1),
dplyr::select(anscombe, x = x2, y = y2),
dplyr::select(anscombe, x = x3, y = y3),
dplyr::select(anscombe, x = x4, y = y4))
# Add group identifier
a_df %<>% mutate(group = rep(paste0("Dataset ", 1:4), each = 11))
# The plot
ggplot(data = a_df, aes(x, y)) +
# Plot the points
geom_point() +
# Add the regression line (without S.E. shading)
geom_smooth(method = "lm", formula = y ~ x, se = F) +
# Change the theme
theme_pander() +
theme(panel.border = element_rect(color = "grey90", fill=NA, size=1)) +
# Plot by group
facet_wrap(~group, nrow = 2, ncol = 2) +
ggtitle("Illustrating Anscombe's Quartet",
subtitle = "Four very different datasets with the same regression line")
# Regress price on weight (with an intercept)
b <- ols(data = cars, y_var = "price", X_vars = "weight") %$% coef
price_hat <- function(weight) b[1] + b[2] * weight
p1 = ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
stat_function(fun = price_hat, colour = 'blue')
p2 = ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
geom_smooth(method = "lm")
(p1 / p2)
ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x + I(x^2))
cars %<>% mutate(foreign = foreign == 1)
ggplot(data = cars, aes(x = weight, y = price, color = foreign)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x + I(x^2))
ggplot(data = cars, aes(x = weight, y = price)) +
geom_point(aes(color = foreign, size = mpg), alpha = 0.5) +
xlab("Weight (lbs)") +
ylab("Price (USD)") +
ggtitle("Trends in cars sold on the US market",
subtitle = "From the world-famous autos dataset") +
theme_pander()
ggplot(data = cars, aes(x = weight, y = price)) +
geom_point(aes(color = foreign, size = mpg), alpha = 0.5) +
xlab("Weight (lbs)") +
ylab("Price (USD)") +
ggtitle("Trends in cars sold on the US market",
subtitle = "From the world-famous autos dataset") +
scale_color_manual("Origin",
values = c("grey70", "midnightblue"),
labels = c("Domestic", "Foreign"))
# Define the inverse demand and supply functions
inv_demand <- function(q) 10 - q
inv_supply <- function(q) 1.5 * q
ggplot(data = data.frame(x = c(0, 10)), aes(x)) +
# The inverse demand
stat_function(fun = inv_demand, geom = "line") +
# The inverse supply
stat_function(fun = inv_supply, geom = "line") +
# The shifted inverse supply curve
stat_function(fun = function(x) inv_supply(x) + 3, geom = "line",
linetype = 2) +
# Labels and themes
xlab("Quantity") +
ylab("Price") +
ggtitle("Classic economics figure")
ggplot(data = data.frame(x = c(-4,4)), aes(x)) +
# Plot the pdf
stat_function(fun = function(x) dt(x, df = 29),
color = "grey75") +
ggtitle(expression(paste("The beautiful ", italic(t),
" distribution"))) +
xlab(expression(italic(t))) +
ylab("Density")
my_theme <- theme(
legend.position = "bottom",
panel.background = element_rect(fill = NA),
panel.border = element_rect(fill = NA, color = "grey75"),
axis.ticks = element_line(color = "grey85"),
panel.grid.major = element_line(color = "grey95", size = 0.2),
panel.grid.minor = element_line(color = "grey95", size = 0.2),
legend.key = element_blank())
libreq(dplyr, lfe, readr, magrittr, parallel, ggplot2, ggthemes, viridis)
# Functions ----
# Function to convert tibble, data.frame, or tbl_df to matrix
to_matrix <- function(the_df, vars) {
# Create a matrix from the variables in 'vars'
new_mat <- the_df %>%
# Select the columns given in 'vars'
select(all_of(vars)) %>%
# Convert to matrix
as.matrix()
# Return 'new_mat'
return(new_mat)
}
# Function for OLS coefficient estimates
b_ols <- function(y, X) {
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# Return beta_hat
return(beta_hat)
}
Spherical errors assumption in OLS:
$$ \boldsymbol{E}\left[\varepsilon \varepsilon^{\prime} | \mathbf{X}\right]=\sigma^{2} \mathbf{I}_{n} $$
Replace it with
$$ \boldsymbol{E}\left[\boldsymbol{\varepsilon} \boldsymbol{\varepsilon}^{\prime} | \mathbf{X}\right]=\sigma^{2} \mathbf{\Omega}(\mathbf{X}) $$
where $\Omega$ is known and positive definite, and
$$ \hat{\beta}_{GLS} = \left(\mathbf{X}^{\prime} \mathbf{\Omega}^{-1} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{\Omega}^{-1} \mathbf{y} $$
Impose structure on $\Omega(\mathbf{X})$ s.t.
$$ \boldsymbol{E}\left[\varepsilon \varepsilon^{\prime} | \mathbf{X}\right]=\sigma^{2} \mathbf{\Omega}(\mathbf{X})=\sigma^{2} \operatorname{diag}\left\{\omega_{i}(\mathbf{X})\right\} $$
or
$$ \boldsymbol{E}\left[\varepsilon_{i}^{2}\right]=\sigma^{2} \omega_{i}(\mathbf{X}) $$
Define the transformed (weighted) variables
$$ \tilde{y}_{i}=\frac{y_{i}}{\sqrt{\omega_{i}(\mathbf{X})}} \quad \text { and } \quad \tilde{x}_{i}=\frac{x_{i}}{\sqrt{\omega_{i}(\mathbf{X})}} $$
so that, with $\mathbf{W} = \mathbf{\Omega}^{-1} = \mathbf{C}^{\prime}\mathbf{C}$,
$$ \begin{aligned} \mathbf{b}_{\mathrm{gls}} &=\left(\mathbf{X}^{\prime} \mathbf{\Omega}^{-1} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{\Omega}^{-1} \mathbf{y} \\ &=\left(\mathbf{X}^{\prime} \mathbf{W} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{W} \mathbf{y} \\ &=\left(\mathbf{X}^{\prime} \mathbf{C}^{\prime} \mathbf{C} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{C}^{\prime} \mathbf{C} \mathbf{y} \\ &=\left((\mathbf{C} \mathbf{X})^{\prime} \mathbf{C X}\right)^{-1}(\mathbf{C} \mathbf{X})^{\prime} \mathbf{C} \mathbf{y} \\ &=\left(\widetilde{\mathbf{X}}^{\prime} \widetilde{\mathbf{X}}\right)^{-1} \widetilde{\mathbf{X}}^{\prime} \tilde{\mathbf{y}} \end{aligned} $$
# Set the seed
set.seed(12345)
# Set population size, N
N <- 1e5
# Set alpha and beta
alpha <- 0.5
beta <- 1.5
# Create the population data: intercept (i) and X
pop_df <- data.frame(
i = 1,
x = runif(n = N, min = 0, max = 2000)
) %>% tbl_df()
# Generate error term, e
pop_df %<>% mutate(
e = rnorm(N, mean = 0, sd = sqrt(4 * x^2)))
# Calculate y
pop_df %<>% mutate(y = alpha + beta * x + e)
# Add weights
pop_df %<>% mutate(w = 10/x)
pop_df %<>% mutate(
y_w = y * w,
i_w = i * w,
x_w = x * w)
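As a quick check of the derivation above, a sketch with hypothetical names (chk_df, W, b_gls, b_tilde), reusing to_matrix() and b_ols(): on a single sample, OLS on the weighted variables should match the explicit weighting-matrix formula with W = diag(w^2).
chk_df <- sample_n(pop_df, 1000) # a sample just for this check
X <- to_matrix(chk_df, c("i", "x"))
y <- to_matrix(chk_df, "y")
W <- diag(chk_df$w^2) # W = C'C with C = diag(w)
b_gls <- solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% y
b_tilde <- b_ols(y = to_matrix(chk_df, "y_w"),
X = to_matrix(chk_df, c("i_w", "x_w")))
cbind(b_gls, b_tilde) # the two columns should match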
# Function for a single iteration of the simulation
one_run <- function(iter, population) {
# Sample 1000 rows from the population
sample_df <- sample_n(tbl = population, size = 1000)
# Calculate the OLS coef. (using unweighted variables)
coef_ols <- b_ols(
y = to_matrix(sample_df, "y"),
X = to_matrix(sample_df, c("i", "x")))
# Calculate the WLS coef. (using weighted variables)
coef_wls <- b_ols(
y = to_matrix(sample_df, "y_w"),
X = to_matrix(sample_df, c("i_w", "x_w")))
# Create a data.frame to return
coef_df <- data.frame(
est = as.vector(c(coef_ols, coef_wls)),
param = rep(c("int", "coef"), 2),
method = rep(c("ols", "wls"), each = 2),
iter = iter
)
# Return the data.frame
return(coef_df)
}
# Run the simulation 1,000 times, seeding the parallel RNG via furrr
sim_df <- future_map(
1:1e3,
.f = one_run,
population = pop_df,
.options = furrr_options(seed = 12345)) %>% bind_rows() %>% tbl_df()
ggplot(data = filter(sim_df, param == "coef"), aes(x = est)) +
geom_vline(xintercept = 1.5, color = "grey70", size = 0.75) +
geom_density(aes(fill = method, color = method), alpha = 0.7) +
xlab(expression(paste("Estimate for ", beta))) +
ylab("Density") +
ggtitle("Simulation comparing coefficients from OLS and WLS") +
scale_fill_viridis("Method", labels = c("OLS", "WLS"),
discrete = T, end = 0.95) +
scale_color_viridis("Method", labels = c("OLS", "WLS"),
discrete = T, end = 0.95)
sim_df %>%
group_by(param, method) %>%
summarize(mean(est), sd(est)) %>%
knitr::kable(digits = 4,
col.names = c("Parameter", "Method", "Mean", "Std. Dev."))
# Add the bad weights
pop_df %<>% mutate(v = 10/x^2)
# Weight the observations with the bad weights
pop_df %<>% mutate(
y_v = y * v,
i_v = i * v,
x_v = x * v)
# Function for a single iteration of the simulation
one_run <- function(iter, population) {
# Sample 1000 rows from the population
sample_df <- sample_n(tbl = population, size = 1000)
# Calculate the OLS coef. (using unweighted variables)
coef_ols <- b_ols(
y = to_matrix(sample_df, "y"),
X = to_matrix(sample_df, c("i", "x")))
# Calculate the WLS coef. (using correctly weighted variables)
coef_wls <- b_ols(
y = to_matrix(sample_df, "y_w"),
X = to_matrix(sample_df, c("i_w", "x_w")))
# Calculate the WLS coef. (using incorrectly weighted variables)
coef_wls_bad <- b_ols(
y = to_matrix(sample_df, "y_v"),
X = to_matrix(sample_df, c("i_v", "x_v")))
# Create a data.frame to return
coef_df <- data.frame(
est = as.vector(c(coef_ols, coef_wls, coef_wls_bad)),
param = rep(c("int", "coef"), 3),
method = rep(c("ols", "wls", "wls bad"), each = 2),
iter = iter
)
# Return the data.frame
return(coef_df)
}
# Run the simulation 1,000 times, seeding the parallel RNG via furrr
miss_df <- future_map(
1:1e3,
.f = one_run,
population = pop_df,
.options = furrr_options(seed = 12345)) %>% bind_rows() %>% tbl_df()
ggplot(data = filter(miss_df, param == "coef"), aes(x = est)) +
geom_vline(xintercept = 1.5, color = "grey70", size = 0.75) +
geom_density(aes(fill = method, color = method), alpha = 0.7) +
xlab(expression(paste("Estimate for ", beta))) +
ylab("Density") +
ggtitle("Simulation comparing coefficients from OLS and WLS",
subtitle = "Allowing for misspecification in WLS") +
scale_fill_viridis("Method",
labels = c("OLS", "WLS", "WLS misspecified"),
discrete = T, end = 0.95, direction = -1) +
scale_color_viridis("Method",
labels = c("OLS", "WLS", "WLS misspecified"),
discrete = T, end = 0.95, direction = -1)
felm(y ~ x, data = pop_df) %>% summary
felm(y ~ x, data = pop_df, weights = (10/pop_df$x)^2) %>% summary
libreq(dplyr, lfe, readr, magrittr, parallel, ggplot2, ggthemes, viridis, VGAM, gridExtra)
When the OLS assumptions hold but the disturbances are not normally distributed, $\hat{\beta}$ is still unbiased, and by the CLT its sampling distribution is approximately normal in large samples; we check this below by simulating with several disturbance distributions.
rbigamma <- function(n, prob = c(0.5, 0.5)) {
# Draw n samples of T or F (our 'coin flip') with replacement
flips <- base::sample(x = c(T, F), size = n,
replace = T, prob = prob)
# Draw n samples from Gamma with shape=7 and scale=1 (mean=7)
gamma_7 <- rgamma(n = n, shape = 7, scale = 1)
# Draw n samples from Gamma with shape=1 and scale=1 (mean=1)
gamma_1 <- rgamma(n = n, shape = 1, scale = 1)
# Combine the flips and the two gammas' draws
bi_gamma <- flips * gamma_7 + (!flips) * gamma_1
# Demean the bimodal variable: subtract the mixture mean, weighting by 'prob'
bi_gamma <- bi_gamma - 7 * prob[1] - 1 * prob[2]
# Return the demeaned draws
return(bi_gamma)
}
qplot(rbigamma(1e5), geom = "histogram")
# Set the seed
set.seed(12345)
# Define the population size
N <- 1e5
# Define the true beta
beta <- 5
# Start the population
pop_df <- data.frame(
ones = 1,
x = runif(N, min = 0, max = 100),
e_norm = rnorm(n = N),
e_unif = runif(n = N, min = -5, max = 5),
e_pois = rpois(n = N, lambda = 1) - 1,
e_bg = rbigamma(n = N)
) %>% tbl_df()
# Calculate the outcome variable: y = 1 + beta * x + error
pop_df %<>% mutate(
y_norm = 1 + beta * x + e_norm,
y_unif = 1 + beta * x + e_unif,
y_pois = 1 + beta * x + e_pois,
y_bg = 1 + beta * x + e_bg
)
pop_df %>% head
# Generate 4 colors from 'viridis'
c4 <- viridis(n = 4, begin = 0.1, end = 0.8)
# Generate 4 slightly lighter colors from 'viridis'
c4_l <- viridis(n = 4, begin = 0.4, end = 1)
# Make a plot for the normal disturbances
gg_norm <- ggplot(data = pop_df, aes(x = e_norm)) +
geom_histogram(fill = c4[1], color = c4_l[1],
size = 0.1, bins = 30) +
xlab("Normal") +
ylab("Count")
# Make a plot for the uniform disturbances
gg_unif <- ggplot(data = pop_df, aes(x = e_unif)) +
geom_histogram(fill = c4[2], color = c4_l[2],
size = 0.1, bins = 30) +
xlab("Uniform") +
ylab("Count")
# Make a plot for the poisson disturbances
gg_pois <- ggplot(data = pop_df, aes(x = e_pois)) +
geom_histogram(fill = c4[3], color = c4_l[3],
size = 0.1, bins = 30) +
xlab("Poisson") +
ylab("Count")
# Make a plot for the bimodal gamma disturbances
gg_bg <- ggplot(data = pop_df, aes(x = e_bg)) +
geom_histogram(fill = c4[4], color = c4_l[4],
size = 0.1, bins = 30) +
xlab("Bernoulli-Gamma") +
ylab("Count")
# Combine the plots
(gg_norm | gg_unif) / (gg_pois | gg_bg)
pop_df %>% select(starts_with("e_")) %>% summarize(across(everything(), mean))
one_iter <- function(n, data) {
# Draw 'n' observations from 'data'
tmp_df <- sample_n(tbl = data, size = n)
# Define the X matrix (same across regressions)
x_mat <- to_matrix(tmp_df, c("ones", "x"))
# Estimate OLS for each 'y'
b_norm = b_ols(
y = to_matrix(tmp_df, "y_norm"),
X = x_mat)[2]
b_unif = b_ols(
y = to_matrix(tmp_df, "y_unif"),
X = x_mat)[2]
b_pois = b_ols(
y = to_matrix(tmp_df, "y_pois"),
X = x_mat)[2]
b_bg = b_ols(
y = to_matrix(tmp_df, "y_bg"),
X = x_mat)[2]
# Create a data.frame to return
coef_df <- data.frame(
# The estimates
est = c(b_norm, b_unif, b_pois, b_bg),
# The distributions
dist = c("normal", "uniform", "poisson", "bi-gamma"),
# The sample size
n = n
)
# Return coef_df
return(coef_df)
}
run_sim <- function(n, n_iter, data, n_cores = 4) {
# Required packages
require(dplyr)
require(parallel)
require(magrittr)
# Run 'one_iter' 'n_iter' times with sample size 'n'
run_df <- mclapply(
X = rep(n, each = n_iter),
FUN = one_iter,
data = data,
mc.cores = n_cores) %>% bind_rows() %>% tbl_df()
# Return run_df
return(run_df)
}
# Set the seed (again)
set.seed(12345)
# Run the simulation with n = 25
sim_25 <- run_sim(
n = 25,
n_iter = 1e4,
data = select(pop_df, -starts_with("e")))
# Run the simulation with n = 100
sim_100 <- run_sim(
n = 100,
n_iter = 1e4,
data = select(pop_df, -starts_with("e")))
# Run the simulation with n = 1,000
sim_1k <- run_sim(
n = 1e3,
n_iter = 1e4,
data = select(pop_df, -starts_with("e")))
# Run the simulation with n = 10,000
sim_10k <- run_sim(
n = 1e4,
n_iter = 1e4,
data = select(pop_df, -starts_with("e")))
# Combine simulations
sim_df <- bind_rows(sim_25, sim_100, sim_1k, sim_10k)
# Clean up
rm(sim_25, sim_100, sim_1k, sim_10k); gc()
# Set the seed (again)
set.seed(12345)
# Define our desired sample sizes
sizes <- c(25, 100, 1e3, 1e4)
# Run the simulation function 'run_sim()'
sim_df <- run_sim(n = sizes, n_iter = 1e4,
data = select(pop_df, -starts_with("e")))
ggplot(data = filter(sim_df, dist == "normal"),
aes(x = est, fill = as.character(n))) +
geom_density(stat = "density", alpha = 0.6, size = 0.05) +
xlab(expression(b[OLS])) +
ylab("Density") +
ggtitle(paste("Distribution of OLS coefficients",
"with normal disturbances")) +
scale_fill_viridis("Sample size:", discrete = T,
direction = -1)
sim_df %<>% mutate(dist_fac = factor(
x = dist,
levels = c("bi-gamma", "normal", "poisson", "uniform"),
labels = c("Bernoulli-Gamma", "Normal", "Poisson", "Uniform")
))
# Add a factor version of 'n'
sim_df %<>% mutate(
n_fac = factor(
x = n,
levels = sizes,
labels = prettyNum(sizes, big.mark=",", scientific = F),
ordered = T)
)
# Check our new variable
sim_df$n_fac %>% head()
ggplot(data = filter(sim_df, dist == "normal"),
aes(x = est, fill = n_fac)) +
geom_density(stat = "density", alpha = 0.6, size = 0.05) +
xlab(expression(b[OLS])) +
ylab("Density") +
ggtitle(paste("Distribution of OLS coefficients",
"with normal disturbances")) +
scale_fill_viridis("Sample size:", discrete = T,
direction = -1)
ggplot(data = sim_df,
aes(x = est, fill = n_fac)) +
geom_density(stat = "density", alpha = 0.6, size = 0.05) +
xlab(expression(b[OLS])) +
ylab("Density") +
ggtitle(paste("Distribution of OLS coefficients",
"with normal disturbances")) +
scale_fill_viridis("Sample size:", discrete = T,
direction = -1) +
facet_grid(dist_fac ~ ., scales = "free_y")
sim_df %>% group_by(dist_fac, n_fac) %>%
summarize(mean = mean(est), std_dev = sd(est)) %>%
knitr::kable(digits = 4,
col.names = c("Distribution", "N", "Mean", "Std. Dev."))
# Summaries of our point estimates
est_df <- sim_df %>%
filter(n == 1e4) %>%
group_by(dist) %>%
summarize(mean_est = mean(est), sd_est = sd(est))
# Normal disturbances
sim_norm <- ggplot(aes(x = est),
data = filter(sim_df, dist == "normal", n == 1e4)) +
geom_histogram(aes(y = ..density..),
fill = "grey90", color = "grey65",
size = 0.1, bins = 50) +
geom_line(stat = "density",
color = "violetred2", size = 0.6) +
stat_function(
geom = "line", fun = dnorm,
color = "slateblue4", linetype = "dashed", size = 0.8,
args = list(
mean = filter(est_df, dist == "normal")$mean_est,
sd = filter(est_df, dist == "normal")$sd_est)
) +
ggtitle("Normal") +
xlab("") + ylab("")
# Uniform disturbances
sim_unif <- ggplot(aes(x = est),
data = filter(sim_df, dist == "uniform", n == 1e4)) +
geom_histogram(aes(y = ..density..),
fill = "grey90", color = "grey65",
size = 0.1, bins = 50) +
geom_line(stat = "density",
color = "violetred2", size = 0.6) +
stat_function(
geom = "line", fun = dnorm,
color = "slateblue4", linetype = "dashed", size = 0.8,
args = list(
mean = filter(est_df, dist == "uniform")$mean_est,
sd = filter(est_df, dist == "uniform")$sd_est)
) +
ggtitle("Uniform") +
xlab("") + ylab("")
# Poisson disturbances
sim_pois <- ggplot(aes(x = est),
data = filter(sim_df, dist == "poisson", n == 1e4)) +
geom_histogram(aes(y = ..density..),
fill = "grey90", color = "grey65",
size = 0.1, bins = 50) +
geom_line(stat = "density",
color = "violetred2", size = 0.6) +
stat_function(
geom = "line", fun = dnorm,
color = "slateblue4", linetype = "dashed", size = 0.8,
args = list(
mean = filter(est_df, dist == "poisson")$mean_est,
sd = filter(est_df, dist == "poisson")$sd_est)
) +
ggtitle("Poisson") +
xlab("") + ylab("")
# Bernoulli-Gamma disturbances
sim_bg <- ggplot(aes(x = est),
data = filter(sim_df, dist == "bi-gamma", n == 1e4)) +
geom_histogram(aes(y = ..density..),
fill = "grey90", color = "grey65",
size = 0.1, bins = 50) +
geom_line(stat = "density",
color = "violetred2", size = 0.6) +
stat_function(
geom = "line", fun = dnorm,
color = "slateblue4", linetype = "dashed", size = 0.8,
args = list(
mean = filter(est_df, dist == "bi-gamma")$mean_est,
sd = filter(est_df, dist == "bi-gamma")$sd_est)
) +
ggtitle("Bernoulli-Gamma") +
xlab("") + ylab("")
# Combine the plots
(sim_norm | sim_unif) / (sim_pois | sim_bg)
# Set the seed
set.seed(12345)
# The Shapiro-Wilk test
sim_df %>%
# Subset for Poisson and n = 10,000
filter(dist == "poisson", n == 1e4) %>%
# Sample 5,000 of the rows
sample_n(size = 5e3) %$%
# Grab the variable 'est'
est %>%
# The actual Shaprio-Wilk test function
shapiro.test()
ks.test(
x = filter(sim_df, dist == "poisson", n == 1e4)$est,
y = "pnorm",
mean = filter(est_df, dist == "poisson")$mean_est,
sd = filter(est_df, dist == "poisson")$sd_est)
TL;DR: for the Pareto distribution with shape parameter 1 (used below), $E(x)$ and $E(x^2)$ are infinite, so the CLT does not apply.
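A small illustration of the problem: sample means of shape-1 Pareto draws do not settle down as the sample grows (this uses VGAM's rpareto(), as in the code below).
set.seed(12345)
sapply(c(1e2, 1e4, 1e6), function(n) mean(rpareto(n, scale = 2, shape = 1)))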
N_par <- 1e6
# Create a population
par_df <- data.frame(
e = rpareto(N_par, scale = 2, shape = 1))
# Demean our disturbances
par_df %<>% mutate(e = e - mean(e))
# Add x and ones
par_df %<>% mutate(
ones = 1,
x = runif(N_par, 0, 100),
y = 1 + 5 * x + e
) %>% tbl_df()
quantile(par_df$e, probs = seq(0, 1, 0.1))
one_par <- function(n, data) {
# Draw 'n' observations from 'data'
tmp_df <- sample_n(tbl = data, size = n)
# Estimate OLS
b_par = b_ols(
y = to_matrix(tmp_df, "y"),
X = to_matrix(tmp_df, c("ones", "x")))[2]
# Create a data.frame to return
coef_df <- data.frame(
# The estimates
est = b_par,
# The sample size
n = n
)
# Return coef_df
return(coef_df)
}
run_par <- function(n, n_iter, data, n_cores = 4) {
# Required packages
require(dplyr)
require(parallel)
require(magrittr)
# Run 'one_par' 'n_iter' times with sample size 'n'
run_df <- mclapply(
X = rep(n, each = n_iter),
FUN = one_par,
data = data,
mc.cores = n_cores) %>% bind_rows() %>% tbl_df()
# Return run_df
return(run_df)
}
par_10k <- run_par(
n = 1e4,
n_iter = 1e4,
data = par_df)
# Pareto disturbances
ggplot(data = par_10k, aes(x = est)) +
geom_histogram(aes(y = ..density..),
fill = "grey90", color = "grey65",
size = 0.1, bins = 50) +
geom_line(stat = "density",
color = "violetred2", size = 0.6) +
stat_function(
geom = "line", fun = dnorm,
color = "slateblue4", linetype = "dashed", size = 0.8,
args = list(
mean = mean(par_10k$est),
sd = sd(par_10k$est))
) +
ggtitle("Distribution of coefficients from Pareto disturbances",
subtitle = "Sample size of 10,000") +
xlab(expression(b[OLS])) +
ylab("Density")
libreq(gmodels, msm, dplyr, readr, magrittr, lfe, ggplot2, ggthemes)
dir_data = file.path(root, 'section09')
cars <- file.path(dir_data, "auto.csv") %>% read_csv()
# Functions ----
# Function to convert tibble, data.frame, or tbl_df to matrix
to_matrix <- function(the_df, vars) {
# Create a matrix from the variables in 'vars'
new_mat <- the_df %>%
# Select the columns given in 'vars'
select(all_of(vars)) %>%
# Convert to matrix
as.matrix()
# Return 'new_mat'
return(new_mat)
}
# Function for OLS coefficient estimates
b_ols <- function(y, X) {
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# Return beta_hat
return(beta_hat)
}
# Function for OLS coef., SE, t-stat, and p-value
ols <- function(data, y_var, X_vars) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Update names
rownames(b)[1] <- "Intercept"
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Convert s2 to numeric
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - 0) / se
# Calculate the p-values
p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
# Nice table (data.frame) of results
results <- data.frame(
# The rows have the coef. names
effect = rownames(b),
# Estimated coefficients
coef = as.vector(b),
# Standard errors
std_error = as.vector(se),
# t statistics
t_stat = as.vector(t_stats),
# p-values
p_value = as.vector(p_values)
)
# Return the results
return(results)
}
# Regress price on MPG and weight
ols(cars, "price", c("mpg", "weight")) %>%
knitr::kable()
# Regress price on mpg and weight
reg1 <- ols(cars, "price", c("mpg", "weight"))
# lc = 20 * b1 + 3000 * b2
(lc <- 20 * reg1[2,2] + 3000 * reg1[3,2])
Since
$$ \begin{array}{c}{\operatorname{Var}(a X)=a^{2} \operatorname{Var}(X)} \\ {\operatorname{Var}(X+Y)=\operatorname{Var}(X)+\operatorname{Var}(Y)+2 \operatorname{Cov}(X, Y)} \\ {\operatorname{Cov}(a X, b Y)=a b \operatorname{Cov}(X, Y)}\end{array} $$
we have
$$ \operatorname{Var}(a X+b Y)=a^{2} \operatorname{Var}(X)+b^{2} \operatorname{Var}(Y)+2 a b \operatorname{Cov}(X, Y) $$
which implies
$$ \operatorname{Var}(\widehat{L C})=20^{2} \operatorname{Var}\left(b_{1}\right)+3000^{2} \operatorname{Var}\left(b_{2}\right)+2 \times 20 \times 3000 \operatorname{Cov}\left(b_{1}, b_{2}\right) $$
# Variance-covariance function for OLS beta hat
vcov_ols <- function(data, y_var, X_vars) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
X <- cbind(1, X)
# Label intercept
colnames(X)[1] <- "intercept"
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Calculate residuals
e <- y - X %*% b
# Calculate s2 and convert to scalar
s2 <- (t(e) %*% e / (n - k)) %>% as.vector()
# Convert s2 to numeric
s2 %<>% as.numeric()
# Calculate the variance-covariance matrix
vcov_mat <- s2 * solve(t(X) %*% X)
# Return the variance-covariance matrix
return(vcov_mat)
}
# Run the vcov_ols() function
vcov_ols(cars, "price", c("mpg", "weight"))
# The variance-covariance matrix
vcov1 <- vcov_ols(cars, "price", c("mpg", "weight"))
# The standard error for 'lc'
(lc_se <- sqrt(20^2 * vcov1[2,2] + 3000^2 * vcov1[3,3] +
2 * 20 * 3000 * vcov1[2,3]))
# Estimate the model with 'lm'
lm_est <- lm(price ~ mpg + weight, data = cars)
# Estimate the linear combination
estimable(obj = lm_est, cm = c(0, 20, 3000))
For an arbitrary differentiable $\mathbf{a}(\cdot): \mathbb{R}^{K} \rightarrow \mathbb{R}^{r}$,
$$ \mathbf{A}(\beta)=\frac{\partial \mathbf{a}(\beta)}{\partial \beta^{\prime}} $$
Take a sequence of $k$-dimensional random vectors $\{x_N : N = 1, 2, \dots \}$ where $$ \mathbf{x}_{N} \stackrel{p}{\rightarrow} \boldsymbol{\beta} $$ and $$ \sqrt{N}\left(\mathbf{x}_{N}-\boldsymbol{\beta}\right) \stackrel{d}{\longrightarrow} N(\mathbf{0}, \mathbf{\Sigma}) $$
For OLS or maximum likelihood, these conditions hold.
Then, $$ \sqrt{N}\left(\mathbf{a}\left(\mathbf{x}_{N}\right)-\mathbf{a}(\boldsymbol{\beta})\right) \stackrel{d}{\rightarrow} N\left(\mathbf{0}, \mathbf{A}(\boldsymbol{\beta}) \boldsymbol{\Sigma} \mathbf{A}(\boldsymbol{\beta})^{\prime}\right) $$
In this toy example,
$$ \mathbf{a}(\boldsymbol{\beta})=L C=20 \beta_{1}+3000 \beta_{2} $$
$$ \mathbf{A}(\boldsymbol{\beta})=\frac{\partial \mathbf{a}(\boldsymbol{\beta})}{\partial \boldsymbol{\beta}^{\prime}}=\frac{\partial\left(20 \beta_{1}+3000 \beta_{2}\right)}{\partial\left[\beta_{0}, \beta_{1}, \beta_{2}\right]}=\left[\begin{array}{lll}{0} & {20} & {3000}\end{array}\right] $$
# Remind ourselves of LC and its var-cov matrix
lc <- 20 * reg1[2,2] + 3000 * reg1[3,2]
vcov1 <- vcov_ols(cars, "price", c("mpg", "weight"))
# Define our derivative matrix
deriv_mat <- matrix(c(0, 20, 3000), nrow = 1)
# Calculate the standard error of 'lc' via delta method
lc_dm <- sqrt(deriv_mat %*% vcov1 %*% t(deriv_mat))
lc_se
lc_dm
Maximum y is reached when $$ a(\beta) = x^{\mathrm{M}}=-\frac{\beta_{1}}{2 \beta_{2}} $$
$$ \mathbf{A}(\beta)=\frac{\partial x^{\mathrm{M}}}{\partial \beta^{\prime}}=\left[\begin{array}{lll}{\frac{\partial x^{\mathrm{M}}}{\partial \beta_{0}}} & {\frac{\partial x^{\mathrm{M}}}{\partial \beta_{1}}} & {\frac{\partial x^{\mathrm{M}}}{\partial \beta_{2}}}\end{array}\right]=\left[\begin{array}{ccc}{0} & {-\frac{1}{2 \beta_{2}}} & {\frac{\beta_{1}}{2 \beta_{2}^{2}}}\end{array}\right] $$
# Generate data
fake_df <- data.frame(
x = runif(n = n, min = -2, max = 3),
e = rnorm(n = n, mean = 0, sd = sqrt(10))
) %>% tbl_df()
# Calculate y = 4 + 4x - 2x^2 + e
fake_df %<>% mutate(
x2 = x^2,
y = 4 + 4 * x - 2 * x^2 + e)
# Estimate coefficients
(b_fake <- ols(fake_df, "y", c("x", "x2")) %$% coef)
# Estimate var-cov matrix
v_fake <- vcov_ols(fake_df, "y", c("x", "x2"))
A_fake <- matrix(data = c(
# The first entry of A()
0,
# The second entry of A()
-1/(2 * b_fake[3]),
# The third entry of A()
b_fake[2]/(2 * b_fake[3]^2)),
nrow = 1)
# Our estimate for the x that maximizes y
(x_m <- - b_fake[2] / (2 * b_fake[3]))
# Our estimate for the standard error
(se_m <- sqrt(A_fake %*% v_fake %*% t(A_fake)))
# Estimate the equation
felm_fake <- felm(y ~ x + x2, data = fake_df)
# Use the 'deltamethod' function
deltamethod(g = ~ - x2 / (2 * x3),
mean = coef(felm_fake),
cov = vcov(felm_fake))
# Force se_m to numeric
se_m <- as.numeric(se_m)
# Our plot
ggplot() +
# 95% confidence interval for maximal x
geom_rect(aes(ymin = -Inf, ymax = Inf,
xmin = x_m - 1.96 * se_m, xmax = x_m + 1.96 * se_m),
fill = "grey90", alpha = 0.5) +
# Plot the points
geom_point(data = fake_df, aes(x = x, y = y)) +
# Plot the true function
stat_function(data = data.frame(x = c(-Inf, Inf)), aes(x = x),
fun = function(x) {
4 + 4 * x - 2 * x^2
}, color = "blue", alpha = 0.25) +
# Plot the predicted function
stat_function(data = data.frame(x = c(-Inf, Inf)), aes(x = x),
fun = function(x) {
b_fake[1] + b_fake[2] * x + b_fake[3] * x^2
}, color = "grey65", linetype = 2, alpha = 0.25) +
# Vertical line at the predicted max.
geom_vline(xintercept = x_m, color = "grey65", linetype = 2) +
# Vertical line at the true max.
geom_vline(xintercept = 1, color = "blue") +
# Title
ggtitle("Estimating the maximum of a quadratic")
libreq(sandwich, HistData, robustbase, tidyr, dplyr, lfe, magrittr, ggplot2, viridis)
For linear regressions of the form
$$ \mathbf{y}=\mathbf{X} \boldsymbol{\beta}+\boldsymbol{\varepsilon} $$
the OLS estimator is
$$ \mathbf{b}=\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{y} $$
with conditional variance
$$ \operatorname{Var}(\mathbf{b} | \mathbf{X})=\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{\Sigma} \mathbf{X}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} $$
where
$$ \boldsymbol{\Sigma}=\left[\begin{array}{cccc}{\varepsilon_{1}^{2}} & {\varepsilon_{1} \varepsilon_{2}} & {\cdots} & {\varepsilon_{1} \varepsilon_{N}} \\ {\varepsilon_{2} \varepsilon_{1}} & {\varepsilon_{2}^{2}} & {\cdots} & {\varepsilon_{2} \varepsilon_{N}} \\ {\vdots} & {\vdots} & {\ddots} & {\vdots} \\ {\varepsilon_{N} \varepsilon_{1}} & {\varepsilon_{N} \varepsilon_{2}} & {\cdots} & {\varepsilon_{N}^{2}}\end{array}\right] $$
Assume spherical errors,
$$ \boldsymbol{\Sigma}=\left[\begin{array}{cccc}{\sigma^{2}} & {0} & {\cdots} & {0} \\ {0} & {\sigma^{2}} & {\cdots} & {0} \\ {\vdots} & {\vdots} & {\ddots} & {\vdots} \\ {0} & {0} & {\cdots} & {\sigma^{2}}\end{array}\right] $$
so that
$$ \begin{aligned} \operatorname{Var}_{\mathrm{Sph}}(\mathbf{b} | \mathbf{X}) &=\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{\Sigma} \mathbf{X}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \\ &=\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \sigma^{2} \mathbf{I}_{N} \mathbf{X}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \\ &=\sigma^{2}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{X}^{\prime} \mathbf{X}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \\ &=\sigma^{2}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \end{aligned} $$
# Function that returns the N-by-N demeaning matrix
demeaner <- function(N) {
# Create an N-by-1 column of 1s
i <- matrix(data = 1, nrow = N)
# Create the demeaning matrix
A <- diag(N) - (1/N) * i %*% t(i)
# Return A
return(A)
}
# Function to return OLS residuals
resid_ols <- function(data, y_var, X_vars, intercept = T) {
# Require the 'dplyr' package
require(dplyr)
# Create the y matrix
y <- to_matrix(the_df = data, vars = y_var)
# Create the X matrix
X <- to_matrix(the_df = data, vars = X_vars)
# Bind a column of ones to X
if (intercept == T) X <- cbind(1, X)
# Calculate the sample size, n
n <- nrow(X)
# Calculate the residuals
resids <- y - X %*% b_ols(y, X)
# Return 'resids'
return(resids)
}
# Function for the OLS variance-covariance matrix (under spherical errors)
vcov_ols <- function(data, y_var, X_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Return the results
return(as.numeric(s2) * XX_inv)
}
# The estimated coefficients
ols(data = diamonds,
y_var = "price",
X_vars = c("carat", "depth"))[,1:2]
# The estimated variance-covariance matrix
vcov_spherical <- vcov_ols(data = diamonds,
y_var = "price",
X_vars = c("carat", "depth"))
# Get the standard errors
vcov_spherical %>% diag() %>% sqrt()
# Add OLS residuals to the diamonds dataset
diamonds %<>% mutate(e_ols =
resid_ols(data = diamonds,
y_var = "price",
X_vars = c("carat", "depth")) %>% as.numeric()
)
# Residuals against carat
p1 = ggplot(data = diamonds, aes(x = carat, y = e_ols)) +
geom_point(color = viridis(1), shape = 21, alpha = 0.3) +
xlab("Carat (covariate #1)") +
ylab("OLS residual") +
ggtitle("Residuals and carat")
p2 = ggplot(data = diamonds, aes(x = depth, y = e_ols)) +
geom_point(color = viridis(1), shape = 21, alpha = 0.3) +
xlab("Depth (covariate #2)") +
ylab("OLS residual") +
ggtitle("Residuals and depth")
(p1 | p2)
vcov_white <- function(data, y_var, X_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Calculate OLS residuals
e <- y - X %*% b
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# For each row, calculate x_i' x_i e_i^2; then sum
sigma_hat <- lapply(X = 1:n, FUN = function(i) {
# Define x_i
x_i <- matrix(as.vector(X[i,]), nrow = 1)
# Return x_i' x_i e_i^2
return(t(x_i) %*% x_i * e[i]^2)
}) %>% Reduce(f = "+", x = .)
# Return the results
return(XX_inv %*% sigma_hat %*% XX_inv)
}
# Spherical:
vcov_ols(data = diamonds,
y_var = "price",
X_vars = c("carat", "depth")) %>%
diag() %>% sqrt()
# Het. robust:
vcov_white(data = diamonds,
y_var = "price",
X_vars = c("carat", "depth")) %>%
diag() %>% sqrt()
felm(price ~ carat + depth, data = diamonds) %>%
summary(robust = T) %>% coef() %>% extract(., 1:3, 2)
# Load Playfair's wheat data
wheat_df <- Wheat %>% tbl_df()
# Drop the rows missing a value
wheat_df %<>% na.omit()
# Wide to long table
wheat_gg <- wheat_df %>% tidyr::gather(Series, Price, Wheat:Wages)
# Playfair's graph
ggplot(data = wheat_gg, aes(x = Year, y = Price, color = Series, linetype = Series)) +
geom_point() +
geom_line() +
xlab("Year") +
ylab("Wage/Price (Shillings)") +
ggtitle("Playfair's wheat and wages time series") +
scale_linetype_manual("Series:",
values = c(1, 3),
labels = c("Wages", "Price of wheat")
) +
scale_color_viridis("Series:",
option = "B",
discrete = T, end = .8, direction = -1,
labels = c("Wages", "Price of wheat")
)
vcov_hac <- function(data, y_var, X_vars, L, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Calculate OLS residuals
e <- y - X %*% b
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# The first term
S_o <- lapply(X = 1:n, FUN = function(i) {
# Define x_i
x_i <- matrix(as.vector(X[i,]), nrow = 1)
# Return x_i' x_i e_i^2
return(t(x_i) %*% x_i * e[i]^2)
}) %>% Reduce(f = "+", x = .)
S_o <- S_o / n
# The second term
S_more <- lapply(X = 1:L, FUN = function(j) {
lapply(X = (j+1):n, FUN = function(t) {
# Grab the rows of X that we need
x_t <- matrix(X[t,], nrow = 1)
x_tj <- matrix(X[t-j,], nrow = 1)
# The calculation
(1 - j / (L + 1)) * e[t] * e[t-j] * (
t(x_t) %*% x_tj + t(x_tj) %*% x_t)
}) %>% Reduce(f = "+", x = .)
}) %>% Reduce(f = "+", x = .)
S_more <- S_more / n
# The full sandwich
S_star <- S_o + S_more
# Return the results
return(n * XX_inv %*% S_star %*% XX_inv)
}
# Choose a lag
the_lag <- ceiling(nrow(wheat_df) / 4)
# The spherical standard errors
vcov_ols(wheat_df, "Wheat", "Wages") %>%
diag() %>% sqrt()
# The standard errors from our HAC robust function
vcov_hac(wheat_df, "Wheat", "Wages", the_lag) %>%
diag() %>% sqrt()
# Estimate the model using 'lm'
wheat_reg <- lm(Wheat ~ Wages, data = wheat_df)
# Use the NeweyWest function
NeweyWest(wheat_reg, lag = the_lag, prewhite = F) %>%
diag() %>% sqrt()
Assume the DGP is
$$ y_{ig} = x_{ig} \beta + \epsilon_{ig} $$
where $i = 1, \dots, N$ indexes individuals and $g = 1, \dots, G$ indexes groups. We continue to assume that $E[\epsilon_{ig} | x_{ig} ] = 0$ and add the assumption $E [\epsilon_{ig} \epsilon_{ih} | x_{ig}, x_{ih}] = 0$ when $g \neq h$.
The estimator for the 'meat' of the sandwich with clusters sums over the clusters $g$ (as implemented below):
$$ \hat{\mathbf{S}} = \frac{1}{n} \sum_{g=1}^{G} \mathbf{X}_g^{\prime} \mathbf{e}_g \mathbf{e}_g^{\prime} \mathbf{X}_g $$
vcov_cluster <- function(data, y_var, X_vars,
cluster_var, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(y, X)
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Calculate OLS residuals
e <- y - X %*% b
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Find the levels of the variable on which we are clustering
cl_levels <- data[, cluster_var] %>% unique() %>% unlist()
# Calculate the meat, iterating over the clusters
meat_hat <- lapply(X = cl_levels, FUN = function(g) {
# Find the row indices for the current cluster
indices <- which(unlist(data[, cluster_var]) == g)
# Grab the current cluster's rows from X and e
X_g <- X[indices,]
e_g <- e[indices] %>% matrix(ncol = 1)
# Calculate this cluster's part of the meat estimate
return(t(X_g) %*% e_g %*% t(e_g) %*% X_g)
}) %>% Reduce(f = "+", x = .) / n
# Find the number of clusters
G <- length(cl_levels)
# Degrees-of-freedom correction
df_c <- G/(G-1) * (n-1)/(n-k)
# Return the results
return(df_c * n * XX_inv %*% meat_hat %*% XX_inv)
}
# Load the 'robustbase' package
library(robustbase)
# Load the 'NOxEmissions' dataset
nox_df <- NOxEmissions %>% tbl_df()
# Change names
names(nox_df) <- c("date", "log_nox", "log_nox_cars", "wind")
vcov_cluster(
data = nox_df,
y_var = "log_nox",
X_vars = "wind",
cluster_var = "date") %>%
diag() %>% sqrt()
# The results under spherical errors
our_table <- ols(nox_df, "log_nox", "wind")[, 1:3]
our_table$effect <- c("Intercept", "Wind")
# Heteroskedasticity-robust
se_white <- vcov_white(nox_df, "log_nox", "wind") %>%
diag() %>% sqrt()
# Cluster-robust
se_cluster <- vcov_cluster(nox_df, "log_nox", "wind", "date") %>%
diag() %>% sqrt()
# Add new columns to the table
our_table %<>% mutate(se_white, se_cluster)
# Print results table
knitr::kable(our_table,
col.names = c("Effect", "Coef.", "S.E. (Sph. Errors)",
"S.E. (Het. Robust)", "S.E. (Cluster Robust)"),
digits = c(3, 3, 4, 4, 4),
align = c("l", rep("r", 4)),
caption = "Comparing standard errors")
libreq(dplyr, lfe, magrittr)
Violations of the exogeneity assumption can be caused by, e.g., omitted variables. Suppose the true DGP is
$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \epsilon$
but the estimated regression is
$y = \theta_0 + \theta_1 x_1 + \nu$
Then
\begin{align*} Cov(x_1, \nu) &= Cov(x_1, \beta_2 x_2 + \epsilon) \\ & = \beta_2 Cov(x_1, x_2) \\ &\neq 0 \end{align*}
An IV is correlated with the 'good' / exogenous variation in $x_1$ and uncorrelated with the 'bad' / correlated-with-$x_2$ variation in $x_1$.
# Define our sample size
n <- 1e4
# Define beta
beta <- c(5, 2, -3)
# Define the means of x1, x2, and z
mean_vec <- c(5, 10, -5)
# Define the var-cov matrix
vcov_mat <- matrix(data =
c(1, 0.75, 0.25, 0.75, 1, 0, 0.25, 0, 1),
nrow = 3)
# Generate the data for x1, x2, and z
gen_df <- mvrnorm(n = n, mu = mean_vec, Sigma = vcov_mat,
empirical = T) %>% as.data.frame()
# Change names
names(gen_df) <- c("x1", "x2", "z")
# Generate the error term and calculate y
gen_df %<>% mutate(
e = rnorm(n),
y = beta[1] + beta[2] * x1 + beta[3] * x2 + e)
select(gen_df, x1, x2, z, e) %>% cor()
ols(data = gen_df, y_var = "y", X_vars = c("x1", "x2"))
ols(data = gen_df, y_var = "y", X_vars = "x1")
ols(data = gen_df, y_var = "y", X_vars = "x2")
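A numeric check of the omitted-variable algebra above, using the simulated data: the short-regression error $\nu = \beta_2 x_2 + \epsilon$ is correlated with $x_1$, with covariance approximately $\beta_2 Cov(x_1, x_2)$ (in the code, $\beta_2$ is beta[3]).
with(gen_df, cov(x1, beta[3] * x2 + e)) # Cov(x1, nu)
beta[3] * with(gen_df, cov(x1, x2)) # beta_2 * Cov(x1, x2)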
# Function for IV coefficient estimates
b_iv <- function(data, y_var, X_vars, Z_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
Z <- to_matrix(data, Z_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
if (intercept == T) Z <- cbind(1, Z)
# Calculate beta hat
beta_hat <- solve(t(Z) %*% X) %*% t(Z) %*% y
# Update names
if (intercept == T) rownames(beta_hat)[1] <- "Intercept"
# Return beta_hat
return(beta_hat)
}
b_iv(data = gen_df, y_var = "y", X_vars = "x1", Z_vars = "z")
# Checking our work with 'felm'
felm(y ~ 1 | 0 | (x1 ~ z) | 0, data = gen_df) %>%
summary() %>% coef()
In the one-instrument case this is identical to the Wald/IV estimator, but it permits the use of multiple instruments.
$$ \hat{\beta}_{2SLS} = (X' P_Z X)^{-1} X' P_{Z} y $$where $P_Z = Z(Z'Z)^{-1} Z'$ is the projection matrix from the first stage.
# Function for IV coefficient estimates
b_2sls <- function(data, y_var, X_vars, Z_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
Z <- to_matrix(data, Z_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
if (intercept == T) Z <- cbind(1, Z)
# Calculate P_Z
P_Z <- Z %*% solve(t(Z) %*% Z) %*% t(Z)
# Calculate b_2sls
b <- solve(t(X) %*% P_Z %*% X) %*% t(X) %*% P_Z %*% y
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Return b
return(b)
}
b_2sls(data = gen_df, y_var = "y", X_vars = "x1", Z_vars = "z")
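A sketch of the 'two stages' interpretation (the column name x1_hat is hypothetical; it reuses to_matrix() and the matrix version of b_ols()): regressing y on the first-stage fitted values reproduces the 2SLS estimate above.
Z1 <- cbind(1, to_matrix(gen_df, "z")) # instrument matrix with an intercept
gen_df$x1_hat <- as.vector(Z1 %*% b_ols(y = to_matrix(gen_df, "x1"), X = Z1))
b_ols(y = to_matrix(gen_df, "y"), X = cbind(1, to_matrix(gen_df, "x1_hat")))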
2SLS yields weakly larger standard errors than OLS, since IV uses only a subset of the variation in $X$ (the 'good' variation), while OLS uses all of it and is therefore more precise:
$$ Var(b_{2SLS}) = \sigma^2 (X' P_Z X)^{-1} \geq \sigma^2(X' X)^{-1} = Var(b_{OLS}) $$
# Function for 2SLS coefficients, standard errors, t-stats, and p-values
b_2sls <- function(data, y_var, X_vars, Z_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
Z <- to_matrix(data, Z_vars)
# Add intercept
if (intercept == T) X <- cbind(1, X)
if (intercept == T) Z <- cbind(1, Z)
# Calculate n and k for degrees of freedom (after adding the intercept)
n <- nrow(X)
k <- ncol(X)
# Calculate P_Z
P_Z <- Z %*% solve(t(Z) %*% Z) %*% t(Z)
# Calculate b_2sls
b <- solve(t(X) %*% P_Z %*% X) %*% t(X) %*% P_Z %*% y
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n - k)
s2 %<>% as.numeric()
# Inverse of X' Pz X
XX_inv <- solve(t(X) %*% P_Z %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - 0) / se
# Calculate the p-values
p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
# Update names
if (intercept == T) rownames(b)[1] <- "Intercept"
# Nice table (data.frame) of results
results <- data.frame(
# The rows have the coef. names
effect = rownames(b),
# Estimated coefficients
coef = as.vector(b),
# Standard errors
std_error = as.vector(se),
# t statistics
t_stat = as.vector(t_stats),
# p-values
p_value = as.vector(p_values)
)
# Return the results
return(results)
}
# Our function
b_2sls(gen_df, "y", "x1", "z")