rm(list = ls())
library(LalRUtils)
libreq(lfe, rio, MASS, data.table, tidyverse, magrittr, patchwork, parallel,
furrr)
theme_set(lal_plot_theme())
set.seed(42)
root = '~/ddr/tut/ARE212'
setwd(root)
setwd(file.path(root, 'Section01/'))
list.files()
auto = fread('auto.csv')
auto %>% glimpse
tail(auto)
auto[, .(price, mpg, weight, length)] %>% head
auto[order(price, -weight)] %>% head
auto[, .(mean(price), sd(price))]
auto[, .(means = lapply(.SD, mean), sds = lapply(.SD, sd)), .SDcols = c('price', 'mpg', 'weight')]
# The histogram function
with(auto, hist(
# The variable for the histogram
x = mpg,
# The main title
main = "Distribution of fuel economy",
# The x-axis label
xlab = "MPG (miles per gallon)")
)
# Add a blue vertical line at the median MPG (lwd is line width)
abline(v = median(auto$mpg), col = "blue", lwd = 3)
ggplot(auto, aes(mpg)) + geom_histogram() + geom_vline(xintercept = median(auto$mpg), col = "blue") +
labs(title= "Distribution of fuel economy", x = "MPG (miles per gallon)")
v1 = 1:5
v2 = c(1,2,3,4,5)
all.equal(v1, v2)
A <- matrix(data = 1:6, ncol = 2)
Z = matrix(c(
c(1, 3, 4),
c(6, 7, 2),
1:3),
3, byrow = T
)
t(Z) # transpose
Z %*% A
solve(Z)
crossprod(v1, v2)
eigen(Z)
The QR decomposition factors $Z = QR$, where $Q$ is an orthogonal matrix ($Q'Q = QQ' = I$, so $Q' = Q^{-1}$) and $R$ is upper-triangular.
qrd = qr(Z)
qr.Q(qrd)
qr.R(qrd)
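As a quick check of the claim above (a sketch): $Q$ should be orthogonal and, since qr() only pivots columns for (near) rank-deficient inputs, $QR$ should reconstruct $Z$.
Q <- qr.Q(qrd)
R <- qr.R(qrd)
round(t(Q) %*% Q, 10) # numerically the identity matrix
all.equal(Q %*% R, Z) # TRUE up to floating-point error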
cars = import(file.path(root, 'Section03/auto.dta'))
b_ols = function(df, y, X){
d = setDT(df)
y_d = d[[y]]
X_d = cbind(1, as.matrix(d[, ..X]))
β_hat = solve(crossprod(X_d, X_d)) %*% crossprod(X_d, y_d)
β_hat
}
b_ols(cars, 'price', c('mpg', 'weight'))
lm(price ~ mpg + weight, cars)
target_vars <- c("price", "headroom", "trunk", "length", "turn")
results_list <- lapply(
X = target_vars,
FUN = function(i) b_ols(df = cars, y = i, X = c("mpg", "weight"))
)
results_list
# A function to calculate bias
data_baker <- function(sample_n, true_beta) {
# First generate x from N(0,1)
x <- rnorm(sample_n)
# Now the error from N(0,1)
e <- rnorm(sample_n)
# Now combine true_beta, x, and e to get y
y <- true_beta[1] + true_beta[2] * x + e
# Define the data matrix of independent vars.
X <- cbind(1, x)
# Force y to be a matrix
y <- matrix(y, ncol = 1)
# Calculate the OLS estimates
b_ols <- solve(t(X) %*% X) %*% t(X) %*% y
# Convert b_ols to vector
b_ols <- b_ols %>% as.vector()
# Calculate bias, force to 2x1 data.frame()
the_bias <- (true_beta - b_ols) %>%
matrix(ncol = 2) %>% data.frame()
# Set names
names(the_bias) <- c("bias_intercept", "bias_x")
# Return the bias
return(the_bias)
}
# Set seed
set.seed(12345)
# Run once
data_baker(sample_n = 100, true_beta = c(1, 3))
# A function to run the simulation
bias_simulator <- function(n_sims, sample_n, true_beta) {
# Run data_baker() n_sims times with given parameters
sims_dt <- lapply(
X = 1:n_sims,
FUN = function(i) data_baker(sample_n, true_beta)) %>%
# Bind the rows together to output a nice data.frame
bind_rows()
# Return sims_dt
return(sims_dt)
}
# Set seed
set.seed(12345)
# Run it
sim_dt <- bias_simulator(n_sims = 1e4, sample_n = 100, true_beta = c(1,3))
# Check the results with a histogram
hist(sim_dt[,2],
breaks = 30,
main = "Is OLS unbiased?",
xlab = "Bias")
# Emphasize the zero line
abline(v = 0, col = "blue", lwd = 3)
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
# Require the 'dplyr' package
require(dplyr)
# Create the y matrix
y <- data %>%
# Select y variable data from 'data'
select({{y_var}}) %>%
# Convert y_data to matrices
as.matrix()
# Create the X matrix
X <- data %>%
# Select X variable data from 'data'
select({{X_vars}})
# If 'intercept' is TRUE, then add a column of ones
# and move the column of ones to the front of the matrix
if (intercept == T) {
# Bind on a column of ones
X <- cbind(1, X)
# Name the column of ones
names(X) <- c("ones", X_vars)
}
# Convert X_data to a matrix
X <- as.matrix(X)
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# If 'intercept' is TRUE:
# change the name of 'ones' to 'intercept'
if (intercept == T) rownames(beta_hat) <- c("intercept", X_vars)
# Return beta_hat
return(beta_hat)
}
n = 100
the_data <- tibble(
x = rnorm(n),
e = rnorm(n))
# Calculate y = 3 + 1.5 x + e
the_data <- mutate(the_data, y = 3 + 1.5 * x + e)
# Plot to make sure things are going well.
plot(
# The variables for the plot
x = the_data$x, y = the_data$y,
# Labels and title
xlab = "x", ylab = "y", main = "Our generated data")
# Run b_ols with an intercept
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
felm(y ~ x, data = the_data)
# The estimates
b_w <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_wo <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)
# Plot the points
plot(
# The variables for the plot
x = the_data$x, y = the_data$y,
# Labels and title
xlab = "x", ylab = "y", main = "Our generated data")
# Plot the line from the 'with intercept' regression in light blue
abline(a = b_w[1], b = b_w[2], col = "lightblue", lwd = 3)
# Plot the line from the 'without intercept' regression in purple
abline(a = 0, b = b_wo[1], col = "purple4", lwd = 2)
# Add a legend
legend(x = min(the_data$x), y = max(the_data$y),
legend = c("with intercept", "w/o intercept"),
# Line widths
lwd = c(3, 2),
# Colors
col = c("lightblue", "purple4"),
# No box around the legend
bty = "n")
For an arbitrary regression
$$ \mathbf{y}=\mathbf{X}_{1} \beta_{1}+\mathbf{X}_{2} \beta_{2}+\varepsilon $$
FWL states that we can estimate $\beta_1$ with $b_1$ using
$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$
Now, define the annihilator matrix $M_2 := I - X_2 (X_2' X_2)^{-1} X_2'$, and define $X_1^* := M_2 X_1$ and $y^* := M_2 y$. These are the data matrices residualised on $X_2$. Then we can also write
$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{* \prime} \mathbf{X}_{1}^{*}\right)^{-1} \mathbf{X}_{1}^{* \prime} \mathbf{y}^{*} $$
# Helper: convert selected columns of a data.frame to a matrix
to_matrix <- function(the_df, vars) the_df %>% dplyr::select({{vars}}) %>% as.matrix()
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
# Create the y matrix
y <- to_matrix(the_df = data, vars = y_var)
X <- to_matrix(the_df = data, vars = X_vars)
# If 'intercept' is TRUE, then add a column of ones
if (intercept == T) {
# Bind a column of ones to X
X <- cbind(1, X)
# Name the new column "intercept"
colnames(X) <- c("intercept", X_vars)
}
# Calculate beta hat
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# Return beta_hat
return(beta_hat)
}
b_ols(the_data, y_var = "y", X_vars = "x", intercept = T)
resid_ols <- function(data, y_var, X_vars, intercept = TRUE) {
y <- to_matrix(the_df = data, vars = y_var)
X <- to_matrix(the_df = data, vars = X_vars)
if (intercept == T) {
X <- cbind(1, X)
colnames(X) <- c("intercept", X_vars)
}
n <- nrow(X)
resids <- (diag(n) - X %*% solve(t(X) %*% X) %*% t(X)) %*% y
return(resids)
}
We now apply FWL to estimate the coefficient on mpg in the following regression, residualising both price and mpg on weight:
$$ y = X_1 \beta_1 + X_2 \beta_2 + \epsilon $$
# Steps 1 and 2: Residualize 'price' on 'weight' and an intercept
e_yx <- resid_ols(data = cars, y_var = "price",
X_vars = "weight", intercept = T)
# Steps 3 and 4: Residualize 'mpg' on 'weight' and an intercept
e_xx <- resid_ols(data = cars, y_var = "mpg",
X_vars = "weight", intercept = T)
# Combine the two sets of residuals into a data.frame
e_df <- data.frame(e_yx = e_yx[,1], e_xx = e_xx[,1])
# Step 5: Regress e_yx on e_xx without an intercept
b_ols(data = e_df, y_var = "e_yx",
X_vars = "e_xx", intercept = F)
b_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
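We can also check the first FWL expression directly (a sketch; the object names below are mine, not from the original notes): the mpg coefficient equals the short-regression estimate minus the normalizing term.
# X1 = mpg; X2 = an intercept and weight; b2 = full-regression coefficients on X2
y <- to_matrix(cars, "price")
X1 <- to_matrix(cars, "mpg")
X2 <- cbind(1, to_matrix(cars, "weight"))
b_full <- b_ols(cars, y_var = "price", X_vars = c("mpg", "weight"))
b2 <- b_full[c("intercept", "weight"), , drop = FALSE]
# b1 = (X1'X1)^-1 X1'y - (X1'X1)^-1 X1'X2 b2
solve(t(X1) %*% X1) %*% t(X1) %*% y -
solve(t(X1) %*% X1) %*% t(X1) %*% X2 %*% b2
# This matches the 'mpg' row of b_full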
We can recover the full OLS estimate from the univariate (short) regression by differencing away a normalizing term; this term is zero if $X_1$ and $X_2$ are uncorrelated / orthogonal (see the sketch after the regressions below).
library(MASS)
v_cov <- matrix(data = c(1, 0.95, 0.95, 1), nrow = 2)
# Create the means vector
means <- c(5, 10)
# Define our sample size
n <- 1e5
# Set the seed
set.seed(12345)
# Generate x1 and x2
X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T)
# Create a data.frame for our data and generate the error from N(0,1)
the_data <- as.data.frame(X) %>% mutate(e = rnorm(n))
# Set the names
names(the_data) <- c("x1", "x2", "e")
# The data-generating process
the_data <- the_data %>% mutate(y = 1 + 2 * x1 + 3 * x2 + e)
# Regression 1: y on int, x1, and x2
b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T)
# Regression 2: y on int and x1
b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T)
# Regression 3: y on int and x2
b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T)
To compute the centred $R^2$, use the demeaning matrix
$$ \mathbf{A}=\mathbf{I}_{n}-\left(\frac{1}{n}\right) \mathbf{i} \mathbf{i}^{\prime} $$
# Function that returns the N-by-N demeaning matrix A
demeaner <- function(N) {
# Create an N-by-1 column of 1s
i <- matrix(data = 1, nrow = N)
# Create the demeaning matrix
A <- diag(N) - (1/N) * i %*% t(i)
# Return A
return(A)
}
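A quick sanity check (a sketch): applying the demeaning matrix to a vector should simply subtract its mean.
z <- rnorm(10)
all.equal(as.vector(demeaner(10) %*% z), z - mean(z)) # TRUE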
# Start the function
r2_ols <- function(data, y_var, X_vars) {
# Create y and X matrices
y <- to_matrix(data, vars = y_var)
X <- to_matrix(data, vars = X_vars)
# Add intercept column to X
X <- cbind(1, X)
# Find N and K (dimensions of X)
N <- nrow(X)
K <- ncol(X)
# Calculate the OLS residuals
e <- resid_ols(data, y_var, X_vars)
# Calculate the y_star (demeaned y)
y_star <- demeaner(N) %*% y
# Calculate r-squared values
r2_uc <- 1 - t(e) %*% e / (t(y) %*% y)
r2 <- 1 - t(e) %*% e / (t(y_star) %*% y_star)
r2_adj <- 1 - (N-1) / (N-K) * (1 - r2)
# Return a vector of the r-squared measures
return(c("r2_uc" = r2_uc, "r2" = r2, "r2_adj" = r2_adj))
}
# Our r-squared function
r2_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
felm(price ~ mpg + weight, cars) %>% summary()
# Find the number of observations in cars
n <- nrow(cars)
# Fill an n-by-50 matrix with random draws from N(0, 10^2)
set.seed(12345)
rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50)
# Convert rand_mat to tbl_df
rand_df <- as.data.frame(rand_mat)
# Change names to x1 to x50
names(rand_df) <- paste0("x", 1:50)
# Bind cars and rand_df; convert to a tibble; call it 'cars_rand'
cars_rand <- cbind(cars, rand_df) %>% as_tibble()
# Drop 'rep78' (it has NAs) and 'make' (a string identifier)
cars_rand <- dplyr::select(cars_rand, -rep78, -make)
result_df <- lapply(
X = 2:ncol(cars_rand),
FUN = function(k) {
# Apply the 'r2_ols' function to columns 2:k
r2_df <- r2_ols(
data = cars_rand,
y_var = "price",
X_vars = names(cars_rand)[2:k]) %>%
matrix(ncol = 3) %>% as.data.frame()
# Set names
names(r2_df) <- c("r2_uc", "r2", "r2_adj")
# Add a column for k
r2_df$k <- k
# Return r2_df
return(r2_df)
}) %>% bind_rows()
# Plot unadjusted r-squared
plot(x = result_df$k, y = result_df$r2,
type = "l", lwd = 3, col = "lightblue",
xlab = expression(paste("k: number of columns in ", bold("X"))),
ylab = expression('R' ^ 2))
# Plot adjusted r-squared
lines(x = result_df$k, y = result_df$r2_adj,
type = "l", lwd = 2, col = "purple4")
# The legend
legend(x = 0, y = max(result_df$r2),
legend = c(expression('R' ^ 2),
expression('Adjusted R' ^ 2)),
# Line widths
lwd = c(3, 2),
# Colors
col = c("lightblue", "purple4"),
# No box around the legend
bty = "n")
x <- data.frame(a = c(1:4, NA), b = c("..", 1, 2, 3, "..")) %>% as_tibble()
x
x[x == ".."] <- NA
x
dir_data = file.path(root, 'section05')
cars <- file.path(dir_data, "auto.csv") %>% read_csv()
t_stat <- function(data, y_var, X_vars, gamma, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept if requested
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(data, y_var, X_vars, intercept)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Force s2 to numeric
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - gamma) / se
# Return the _t_ statistics
return(t_stats)
}
t_stat(cars,
y_var = "price", X_vars = c("mpg", "weight"),
gamma = 0, intercept = T)
felm(price ~ mpg + weight, cars) %>%
summary() %$% (coefficients)[,3]
The %$% exposition pipe (from magrittr) exposes the columns of its left-hand side to the expression on its right:
cars %$% cor(price, weight)
# The default: lower.tail = TRUE
pt(q = 2, df = 15)
# Setting lower.tail = TRUE
pt(q = 2, df = 15, lower.tail = T)
# Setting lower.tail = FALSE
pt(q = 2, df = 15, lower.tail = F)
ols <- function(data, y_var, X_vars, intercept = T) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept if requested
if (intercept == T) X <- cbind(1, X)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# Estimate coefficients
b <- b_ols(data, y_var, X_vars, intercept)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Update s2 to numeric
s2 %<>% as.numeric()
# Inverse of X'X
XX_inv <- solve(t(X) %*% X)
# Standard error
se <- sqrt(s2 * diag(XX_inv))
# Vector of _t_ statistics
t_stats <- (b - 0) / se
# Calculate the p-values
p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
# Nice table (data.frame) of results
results <- data.frame(
# The rows have the coef. names
effect = rownames(b),
# Estimated coefficients
coef = as.vector(b) %>% round(3),
# Standard errors
std_error = as.vector(se) %>% round(3),
# t statistics
t_stat = as.vector(t_stats) %>% round(3),
# p-values
p_value = as.vector(p_values) %>% round(4)
)
# Return the results
return(results)
}
ols(data = cars,
y_var = "price",
X_vars = c("mpg", "weight"),
intercept = T)
felm(price ~ mpg + weight, cars) %>%
summary() %$% coefficients
joint_test <- function(data, y_var, X_vars) {
# Turn data into matrices
y <- to_matrix(data, y_var)
X <- to_matrix(data, X_vars)
# Add intercept
X <- cbind(1, X)
# Name the new column "intercept"
colnames(X) <- c("intercept", X_vars)
# Calculate n and k for degrees of freedom
n <- nrow(X)
k <- ncol(X)
# J is k-1
J <- k - 1
# Create the R matrix: bind a column of zeros
# onto a J-by-J identity matrix
R <- cbind(0, diag(J))
# Estimate coefficients
b <- b_ols(data, y_var, X_vars)
# Calculate OLS residuals
e <- y - X %*% b
# Calculate s^2
s2 <- (t(e) %*% e) / (n-k)
# Force s2 to numeric
s2 %<>% as.numeric()
# Create the inner matrix R(X'X)^(-1)R'
RXXR <- R %*% solve(t(X) %*% X) %*% t(R)
# Calculate the F stat
f_stat <- t(R %*% b) %*% solve(RXXR) %*% (R %*% b) / J / s2
# Calculate the p-value
p_value <- pf(q = f_stat, df1 = J, df2 = n-k, lower.tail = F)
# Create a data.frame of the f stat. and p-value
results <- data.frame(
f_stat = f_stat %>% as.vector(),
p_value = p_value %>% as.vector())
return(results)
}
joint_test(data = cars,
y_var = "price", X_vars = c("mpg", "weight"))
felm(price ~ mpg + weight, cars) %>%
summary() %$% F.fstat
# Function to generate data
gen_data <- function(sample_size) {
# Create data.frame with random x and error
data_df <- data.frame(
x = rnorm(sample_size),
e = rnorm(sample_size))
# Calculate y = 7 + 0.5 x + e; drop 'e'
data_df %<>% mutate(y = 7 + 0.5 * x + e) %>%
select(-e)
# Return data_df
return(data_df)
}
one_sim <- function(sample_size) {
# Estimate via OLS
ols_est <- ols(data = gen_data(sample_size),
y_var = "y", X_vars = "x")
# Grab the estimated coefficient on x
# (the second element of 'coef')
b1 <- ols_est %$% coef[2]
# Grab the second p-value
# (the first p-value is for the intercept)
p_value <- ols_est %$% p_value[2]
# Return a data.frame with b1 and p_value
return(data.frame(b1, p_value))
}
ols_sim <- function(n_sims, sample_size, seed = 12345) {
# Set the seed
set.seed(seed)
# Run one_sim n_sims times; convert results to data.frame
sim_df <- replicate(
n = n_sims,
expr = one_sim(sample_size),
simplify = F
) %>% bind_rows()
# Return sim_df
return(sim_df)
}
# Run ols_sim for sample size of 10
sim10 <- ols_sim(n_sims = 1e3, sample_size = 10)
# Run ols_sim for sample size of 100
sim100 <- ols_sim(n_sims = 1e3, sample_size = 100)
p1 = ggplot(data = sim10, aes(x = p_value)) +
stat_density(fill = "grey20", alpha = 0.9) +
geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
theme_bw() +
xlab(expression(paste(italic("p"), "-Value"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", italic(p),
"-Values from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = p_value)) +
stat_density(fill = "grey20", alpha = 0.9) +
geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
xlim(0, 0.1) +
theme_bw() +
xlab(expression(paste(italic("p"), "-Value"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", italic(p),
"-Values from 1,000 simulations with sample size 100")))
(p1 / p2)
p1 = ggplot(data = sim10, aes(x = b1)) +
stat_density(fill = "grey70") +
geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) +
xlab(expression(paste(beta[1], " estimate"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", beta[1],
" estimates from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = b1)) +
stat_density(fill = "grey70") +
geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) +
xlab(expression(paste(beta[1], " estimate"))) +
ylab("Density") +
ggtitle(expression(paste("Distribution of ", beta[1],
" estimates from 1,000 simulations with sample size 100")))
(p1 / p2)
ols_sim_mc <- function(n_sims, sample_size, seed = 12345) {
# Require the parallel package
require(parallel)
# Set the seed
set.seed(seed)
# Run one_sim n_sims times; convert results to data.frame
sim_df <- mclapply(
X = rep(x = sample_size, times = n_sims),
FUN = one_sim,
# Use all but two of the available cores
mc.cores = detectCores() - 2
) %>% bind_rows()
# Return sim_df
return(sim_df)
}
Timing the non-parallelised ols_sim() against the parallelised ols_sim_mc() shows the gain from mclapply(): record a start and a stop timestamp around each run and print the differences, as in the sketch below.
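A minimal sketch of the timing comparison, assuming Sys.time() timers (the original start/stop objects were not included in these notes):
# Non-parallelised run
start1 <- Sys.time()
sim_serial <- ols_sim(n_sims = 1e3, sample_size = 100)
stop1 <- Sys.time()
# Parallelised run
start2 <- Sys.time()
sim_parallel <- ols_sim_mc(n_sims = 1e3, sample_size = 100)
stop2 <- Sys.time()
# Elapsed times
stop1 - start1
stop2 - start2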
libreq(dplyr, magrittr, ggplot2, ggthemes)
# Reformat Anscombe's dataset
a_df <- bind_rows(
dplyr::select(anscombe, x = x1, y = y1),
dplyr::select(anscombe, x = x2, y = y2),
dplyr::select(anscombe, x = x3, y = y3),
dplyr::select(anscombe, x = x4, y = y4))
# Add group identifier
a_df %<>% mutate(group = rep(paste0("Dataset ", 1:4), each = 11))
# The plot
ggplot(data = a_df, aes(x, y)) +
# Plot the points
geom_point() +
# Add the regression line (without S.E. shading)
geom_smooth(method = "lm", formula = y ~ x, se = F) +
# Change the theme
theme_pander() +
theme(panel.border = element_rect(color = "grey90", fill=NA, size=1)) +
# Plot by group
facet_wrap(~group, nrow = 2, ncol = 2) +
ggtitle("Illustrating Anscombe's Quartet",
subtitle = "Four very different datasets with the same regression line")
# Regress price on weight (with an intercept)
b <- ols(data = cars, y_var = "price", X_vars = "weight") %$% coef
price_hat <- function(weight) b[1] + b[2] * weight
p1 = ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
stat_function(fun = price_hat, colour = 'blue')
p2 = ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
geom_smooth(method = "lm")
(p1 / p2)
ggplot(data = cars, aes(x = weight, y = price)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x + I(x^2))