Graduate Regression notes in R

Worked examples from Ed Rubin's excellent ARE212 repo

In [1]:
rm(list = ls())
library(LalRUtils)
In [2]:
libreq(lfe, rio, MASS, data.table, tidyverse, magrittr, patchwork, parallel,
        furrr)
theme_set(lal_plot_theme())
set.seed(42)
      wants        loaded
 [1,] "lfe"        TRUE  
 [2,] "rio"        TRUE  
 [3,] "MASS"       TRUE  
 [4,] "data.table" TRUE  
 [5,] "tidyverse"  TRUE  
 [6,] "magrittr"   TRUE  
 [7,] "patchwork"  TRUE  
 [8,] "parallel"   TRUE  
 [9,] "furrr"      TRUE  
In [3]:
root = '~/ddr/tut/ARE212'
setwd(root)

Section 1: R Basics

In [4]:
setwd(file.path(root, 'Section01/'))
list.files()
  1. 'auto.csv'
  2. 'auto.dta'
  3. 'README.md'
  4. 'section01.pdf'
  5. 'section01.R'
In [5]:
auto = fread('auto.csv')
auto %>% glimpse
Observations: 74
Variables: 12
$ make         <chr> "AMC Concord", "AMC Pacer", "AMC Spirit", "Buick Century…
$ price        <dbl> 4099, 4749, 3799, 4816, 7827, 5788, 4453, 5189, 10372, 4…
$ mpg          <dbl> 22, 17, 22, 20, 15, 18, 26, 20, 16, 19, 14, 14, 21, 29, …
$ rep78        <dbl> 3, 3, NaN, 3, 4, 3, NaN, 3, 3, 3, 3, 2, 3, 3, 4, 3, 2, 2…
$ headroom     <dbl> 2.5, 3.0, 3.0, 4.5, 4.0, 4.0, 3.0, 2.0, 3.5, 3.5, 4.0, 3…
$ trunk        <dbl> 11, 11, 12, 16, 20, 21, 10, 16, 17, 13, 20, 16, 13, 9, 2…
$ weight       <dbl> 2930, 3350, 2640, 3250, 4080, 3670, 2230, 3280, 3880, 34…
$ length       <dbl> 186, 173, 168, 196, 222, 218, 170, 200, 207, 200, 221, 2…
$ turn         <dbl> 40, 40, 35, 40, 43, 43, 34, 42, 43, 42, 44, 43, 45, 34, …
$ displacement <dbl> 121, 258, 121, 196, 350, 231, 304, 196, 231, 231, 425, 3…
$ gear_ratio   <dbl> 3.58, 2.53, 3.08, 2.93, 2.41, 2.73, 2.87, 2.93, 2.93, 3.…
$ foreign      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
In [6]:
tail(auto)
A data.table: 6 × 12
make            price  mpg  rep78  headroom  trunk  weight  length  turn  displacement  gear_ratio  foreign
Toyota Corona    5719   18      5       2.0     11    2670     175    36           134        3.05        1
VW Dasher        7140   23      4       2.5     12    2160     172    36            97        3.74        1
VW Diesel        5397   41      5       3.0     15    2040     155    35            90        3.78        1
VW Rabbit        4697   25      4       3.0     15    1930     155    35            89        3.78        1
VW Scirocco      6850   25      4       2.0     16    1990     156    36            97        3.78        1
Volvo 260       11995   17      5       2.5     14    3170     193    37           163        2.98        1

Select columns

In [7]:
auto[, .(price, mpg, weight, length)] %>% head
A data.table: 6 × 4
price  mpg  weight  length
 4099   22    2930     186
 4749   17    3350     173
 3799   22    2640     168
 4816   20    3250     196
 7827   15    4080     222
 5788   18    3670     218

Sorting

In [8]:
auto[order(price, -weight)] %>% head
A data.table: 6 × 12
make            price  mpg  rep78  headroom  trunk  weight  length  turn  displacement  gear_ratio  foreign
Merc. Zephyr     3291   20      3       3.5     17    2830     195    43           140        3.08        0
Chev. Chevette   3299   29      3       2.5      9    2110     163    34           231        2.93        0
Chev. Monza      3667   24      2       2.0      7    2750     179    40           151        2.73        0
Toyota Corolla   3748   31      5       3.0      9    2200     165    35            97        3.21        1
Subaru           3798   35      5       2.5     11    2050     164    36            97        3.81        1
AMC Spirit       3799   22    NaN       3.0     12    2640     168    35           121        3.08        0

Summary stats

Single Variable

In [9]:
auto[, .(mean(price), sd(price))]
A data.table: 1 × 2
      V1       V2
6165.257 2949.496

Multiple Variables

In [10]:
auto[, .(means = lapply(.SD, mean), sds = lapply(.SD, sd)), .SDcols = c('price', 'mpg', 'weight')]
A data.table: 3 × 2
   means       sds
6165.257  2949.496
 21.2973  5.785503
3019.459  777.1936

Figures

In [11]:
# The histogram function
with(auto, hist(
  # The variable for the histogram
  x = mpg,
  # The main title
  main = "Distribution of fuel economy",
  # The x-axis label
  xlab = "MPG (miles per gallon)")
# The blue vertical line at the median MPG (lwd is line width)
)
abline(v = median(auto$mpg), col = "blue", lwd = 3)
In [12]:
ggplot(auto, aes(mpg)) + geom_histogram() + geom_vline(xintercept = median(auto$mpg), col = "blue") + 
   labs(title= "Distribution of fuel economy", x = "MPG (miles per gallon)")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Section 2 : Linear Algebra

In [13]:
v1 = 1:5
v2 = c(1,2,3,4,5)
all.equal(v1, v2)
TRUE
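Note that v1 is an integer vector while v2 is a double vector, so the two objects hold the same values but have different storage types; all.equal() compares values up to numerical tolerance, while identical() also requires matching types. A quick check:

typeof(v1)          # "integer"
typeof(v2)          # "double"
identical(v1, v2)   # FALSE: same values, different storage types
all.equal(v1, v2)   # TRUE: values agree up to tolerance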
In [14]:
A <- matrix(data = 1:6, ncol = 2)
Z = matrix(c(
    c(1, 3, 4),
    c(6, 7, 2),
    1:3), 
    3, byrow = T
    )
In [15]:
t(Z) # transpose
A matrix: 3 × 3 of type dbl
1  6  1
3  7  2
4  2  3

Matrix Multiplication

In [16]:
Z %*% A
A matrix: 3 × 2 of type dbl
19  43
26  71
14  32

Inverse

In [17]:
solve(Z)
A matrix: 3 × 3 of type dbl
-1.5454545   0.09090909   2
 1.4545455   0.09090909  -2
-0.4545455  -0.09090909   1

Crossproduct

In [18]:
crossprod(v1, v2)
A matrix: 1 × 1 of type dbl
55
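crossprod(v1, v2) is shorthand for t(v1) %*% v2, so the 1 × 1 result above is just the inner product of the two vectors:

t(v1) %*% v2    # the same 1 x 1 matrix: 55
sum(v1 * v2)    # the same inner product as a scalar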

Decompositions

Eigenvalues

In [19]:
eigen(Z)
eigen() decomposition
$values
[1] 10.4187613  1.3584430 -0.7772043

$vectors
           [,1]       [,2]       [,3]
[1,] -0.3996373  0.4619135 -0.7690618
[2,] -0.8701109 -0.6891769  0.6262312
[3,] -0.2884389  0.5582751 -0.1279784

QR Decomposition

$$ A = QR $$

s.t. $Q$ is an orthogonal matrix (i.e., $Q'Q = QQ' = I$, so $Q' = Q^{-1}$) and $R$ is upper-triangular.

In [20]:
qrd = qr(Z)
qr.Q(qrd)
qr.R(qrd)
A matrix: 3 × 3 of type dbl
-0.1622214   0.8964464  -0.41239305
-0.9733285  -0.2140768  -0.08247861
-0.1622214   0.3880141   0.90726471
A matrix: 3 × 3 of type dbl
-6.164414  -7.624407  -3.0822070
 0.000000   1.966830   4.3216745
 0.000000   0.000000   0.9072647
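As a quick sanity check (using Z and qrd from the cell above): Q should be orthogonal, and the product QR should reproduce Z up to floating-point error.

round(crossprod(qr.Q(qrd)), 12)        # Q'Q is (numerically) the identity
all.equal(qr.Q(qrd) %*% qr.R(qrd), Z)  # Q R recovers Z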

Section 3: Functions

In [21]:
cars = import(file.path(root, 'Section03/auto.dta'))
In [22]:
b_ols = function(df, y, X){
    d = setDT(df)
    y_d = d[[y]] 
    X_d = cbind(1, as.matrix(d[, ..X]))
    β_hat = solve(crossprod(X_d, X_d)) %*% crossprod(X_d, y_d)
    β_hat
}
In [23]:
b_ols(cars, 'price', c('mpg', 'weight'))
A matrix: 3 × 1 of type dbl
1946.068668
mpg -49.512221
weight 1.746559
In [24]:
lm(price ~ mpg + weight, cars)
Call:
lm(formula = price ~ mpg + weight, data = cars)

Coefficients:
(Intercept)          mpg       weight  
   1946.069      -49.512        1.747  

Lapply

In [25]:
target_vars <- c("price", "headroom", "trunk", "length", "turn")
In [26]:
results_list <- lapply(
  X = target_vars,
  FUN = function(i) b_ols(df = cars, y = i, X = c("mpg", "weight"))
  )
results_list
  1. A matrix: 3 × 1 of type dbl
     1946.068668
     mpg      -49.512221
     weight     1.746559
  2. A matrix: 3 × 1 of type dbl
     1.7943225731
     mpg      -0.0098904309
     weight    0.0004668253
  3. A matrix: 3 × 1 of type dbl
     5.849262628
     mpg      -0.082739270
     weight    0.003202433
  4. A matrix: 3 × 1 of type dbl
     120.11619444
     mpg      -0.35546594
     weight    0.02496695
  5. A matrix: 3 × 1 of type dbl
     27.323996368
     mpg      -0.059092537
     weight    0.004498541
In [27]:
# A function to calculate bias
data_baker <- function(sample_n, true_beta) {
  # First generate x from N(0,1)
  x <- rnorm(sample_n)
  # Now the error from N(0,1)
  e <- rnorm(sample_n)
  # Now combine true_beta, x, and e to get y
  y <- true_beta[1] + true_beta[2] * x + e
  # Define the data matrix of independent vars.
  X <- cbind(1, x)
  # Force y to be a matrix
  y <- matrix(y, ncol = 1)
  # Calculate the OLS estimates
  b_ols <- solve(t(X) %*% X) %*% t(X) %*% y
  # Convert b_ols to vector
  b_ols <- b_ols %>% as.vector()
  # Calculate bias, force to 2x1 data.frame()
  the_bias <- (true_beta - b_ols) %>%
    matrix(ncol = 2) %>% data.frame()
  # Set names
  names(the_bias) <- c("bias_intercept", "bias_x")
  # Return the bias
  return(the_bias)
}
In [28]:
# Set seed
set.seed(12345)
# Run once
data_baker(sample_n = 100, true_beta = c(1, 3))
A data.frame: 1 × 2
bias_intercept       bias_x
   -0.02205339  -0.09453503
In [29]:
# A function to run the simulation
bias_simulator <- function(n_sims, sample_n, true_beta) {
  # Run data_baker() n_sims times with given parameters
  sims_dt <- lapply(
    X = 1:n_sims,
    FUN = function(i) data_baker(sample_n, true_beta)) %>%
    # Bind the rows together to output a nice data.frame
    bind_rows()

  # Return sim_dt
  return(sims_dt)
}
In [30]:
# Set seed
set.seed(12345)
# Run it
sim_dt <- bias_simulator(n_sims = 1e4, sample_n = 100, true_beta = c(1,3))
# Check the results with a histogram
hist(sim_dt[,2],
  breaks = 30,
  main = "Is OLS unbiased?",
  xlab = "Bias")
# Emphasize the zero line
abline(v = 0, col = "blue", lwd = 3)
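As a numerical check on the same simulation (using the sim_dt object created above), the average bias in both coefficients should be very close to zero:

# Mean bias across the 10,000 simulated samples
colMeans(sim_dt)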

Section 4: FWL and model fit

In [31]:
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Require the 'dplyr' package
  require(dplyr)

  # Create the y matrix
  y <- data %>%
    # Select y variable data from 'data'
    select({{y_var}}) %>%
    # Convert y_data to matrices
    as.matrix()

  # Create the X matrix
  X <- data %>%
    # Select X variable data from 'data'
    select({{X_vars}})

  # If 'intercept' is TRUE, then add a column of ones
  # and move the column of ones to the front of the matrix
  if (intercept == T) {
    # Bind on a column of ones
    X <- cbind(1, X)
    # Name the column of ones
    names(X) <- c("ones", X_vars)
    }

  # Convert X_data to a matrix
  X <- as.matrix(X)

  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y

  # If 'intercept' is TRUE:
  # change the name of 'ones' to 'intercept'
  if (intercept == T) rownames(beta_hat) <- c("intercept", X_vars)

  # Return beta_hat
  return(beta_hat)
}
In [32]:
n = 100
the_data <- tibble(
  x = rnorm(n),
  e = rnorm(n))
# Calculate y = 3 + 1.5 x + e
the_data <- mutate(the_data, y = 3 + 1.5 * x + e)
# Plot to make sure things are going well.
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")
In [33]:
# Run b_ols with an intercept
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
felm(y ~ x, data = the_data)
A matrix: 2 × 1 of type dbl
                 y
intercept 3.120907
x         1.366898
(Intercept)           x 
      3.121       1.367 
In [34]:
# The estimates
b_w <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_wo <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)
# Plot the points
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")
# Plot the line from the 'with intercept' regression in yellow
abline(a = b_w[1], b = b_w[2], col = "lightblue", lwd = 3)
# Plot the line from the 'without intercept' regression in purple
abline(a = 0, b = b_wo[1], col = "purple4", lwd = 2)
# Add a legend
legend(x = min(the_data$x), y = max(the_data$y),
  legend = c("with intercept", "w/o intercept"),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")

FWL Thm

For an arbitrary regression

$$ \mathbf{y}=\mathbf{X}_{1} \beta_{1}+\mathbf{X}_{2} \beta_{2}+\varepsilon $$

FWL states that we can estimate $\beta_1$ with $b_1$ using

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$

Now, define $M_2$ as the annihilator matrix, and define $X_1^* := M_2 X_1$ and $y^* := M_2 y$. These are the residualised data matrices on $X_2$. Then, we can also write

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{* \prime} \mathbf{X}_{1}^{*}\right)^{-1} \mathbf{X}_{1}^{* \prime} \mathbf{y}^{*} $$
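A minimal sketch of this residualised form (with hypothetical matrices y, X1, and X2; in practice the intercept column lives in X2) builds the annihilator matrix $M_2$ and applies the formula directly:

# A sketch of FWL via the annihilator matrix (hypothetical inputs y, X1, X2)
fwl_b1 <- function(y, X1, X2) {
  # M2 = I - X2 (X2'X2)^(-1) X2' residualises whatever it multiplies on X2
  M2 <- diag(nrow(X2)) - X2 %*% solve(crossprod(X2)) %*% t(X2)
  X1_star <- M2 %*% X1   # X1 residualised on X2
  y_star  <- M2 %*% y    # y residualised on X2
  # b1 = (X1*' X1*)^(-1) X1*' y*
  solve(crossprod(X1_star)) %*% crossprod(X1_star, y_star)
}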
In [35]:
to_matrix <- function(the_df, vars) the_df %>% dplyr::select({{vars}}) %>% as.matrix()
In [36]:
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Create the y matrix
  y <- to_matrix(the_df = data, vars = y_var)
  X <- to_matrix(the_df = data, vars = X_vars)
  # If 'intercept' is TRUE, then add a column of ones
  if (intercept == T) {
    # Bind a column of ones to X
    X <- cbind(1, X)
    # Name the new column "intercept"
    colnames(X) <- c("intercept", X_vars)
  }
  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
  # Return beta_hat
  return(beta_hat)
}
In [37]:
b_ols(the_data, y_var = "y", X_vars = "x", intercept = T)
A matrix: 2 × 1 of type dbl
                 y
intercept 3.120907
x         1.366898
In [38]:
resid_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  y <- to_matrix(the_df = data, vars = y_var)
  X <- to_matrix(the_df = data, vars = X_vars)
  if (intercept == T) {
    X <- cbind(1, X)
    colnames(X) <- c("intercept", X_vars)
  }
  n <- nrow(X)
  resids <- (diag(n) - X %*% solve(t(X) %*% X) %*% t(X)) %*% y
  return(resids)
}

To estimate $\beta_1$ in the regression

$$ y = X_1 \beta_1 + X_2 \beta_2 + \epsilon $$

via FWL:
  • Regress $y$ on $X_2$
  • Save the residuals; call them $e_{yX_2}$
  • Regress $X_1$ on $X_2$
  • Save the residuals; call them $e_{X_1 X_2}$
  • Regress $e_{yX_2}$ on $e_{X_1 X_2}$ (without an intercept)
In [39]:
# Steps 1 and 2: Residualize 'price' on 'weight' and an intercept
e_yx <- resid_ols(data = cars, y_var = "price",
  X_vars = "weight", intercept = T)
# Steps 3 and 4: Residualize 'mpg' on 'weight' and an intercept
e_xx <- resid_ols(data = cars, y_var = "mpg",
  X_vars = "weight", intercept = T)
# Combine the two sets of residuals into a data.frame
e_df <- data.frame(e_yx = e_yx[,1], e_xx = e_xx[,1])
# Step 5: Regress e_yx on e_xx without an intercept
b_ols(data = e_df, y_var = "e_yx",
  X_vars = "e_xx", intercept = F)
A matrix: 1 × 1 of type dbl
          e_yx
e_xx -49.51222
In [40]:
b_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
A matrix: 3 × 1 of type dbl
                price
intercept 1946.068668
mpg         -49.512221
weight        1.746559

Omitted Variables Bias

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$

We can recover the multivariate OLS estimate of $\beta_1$ by running the univariate regression of $y$ on $X_1$ and subtracting a correction term. This correction term is zero when $X_1$ and $X_2$ are uncorrelated / orthogonal.

In [41]:
library(MASS)
In [42]:
v_cov <- matrix(data = c(1, 0.95, 0.95, 1), nrow = 2)
# Create the means vector
means <- c(5, 10)
# Define our sample size
n <- 1e5
# Set the seed
set.seed(12345)
# Generate x1 and x2
X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T)
In [43]:
# Create a data.frame for our data and add an error term drawn from N(0,1)
the_data <- as.data.frame(X) %>% mutate(e = rnorm(n))
# Set the names
names(the_data) <- c("x1", "x2", "e")
# The data-generating process
the_data <- the_data %>% mutate(y = 1 + 2 * x1 + 3 * x2 + e)
In [44]:
# Regression 1: y on int, x1, and x2
b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T)
A matrix: 3 × 1 of type dbl
                 y
intercept 1.012006
x1        1.995968
x2        3.000975
In [45]:
# Regression 2: y on int and x1
b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T)
# Regression 3: y on int and x2
b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T)
A matrix: 2 × 1 of type dbl
                  y
intercept 16.767124
x1         4.846894
A matrix: 2 × 1 of type dbl
                  y
intercept -7.969850
x2         4.897144
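To tie these regressions back to the omitted-variable formula above, the following sketch (reusing the_data, to_matrix, and b_ols from earlier cells) recovers the long-regression coefficients on the intercept and x1 by subtracting the correction term from the short regression of y on x1:

# X1 holds the intercept and x1; X2 holds the omitted regressor x2
X1 <- cbind(1, to_matrix(the_data, "x1"))
X2 <- to_matrix(the_data, "x2")
y  <- to_matrix(the_data, "y")
# b2: coefficient on x2 from the full (long) regression
b2 <- b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"))["x2", ]
# Short-regression estimate minus the correction term = long-regression b1
solve(crossprod(X1)) %*% crossprod(X1, y) -
  solve(crossprod(X1)) %*% crossprod(X1, X2) * b2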

$\text{R}^2$ and variations

Uncentered $R^2$

$$ R_{\mathrm{uc}}^{2}=\frac{\hat{\mathbf{y}}^{\prime} \hat{\mathbf{y}}}{\mathbf{y}^{\prime} \mathbf{y}}=1-\frac{\mathbf{e}^{\prime} \mathbf{e}}{\mathbf{y}^{\prime} \mathbf{y}}=1-\frac{\sum_{i}\left(y_{i}-\hat{y}_{i}\right)^{2}}{\sum_{i}\left(y_{i}-0\right)^{2}}=1-\frac{\mathrm{SSR}}{\mathrm{SST}_{0}} $$

Centered $R^2$

$$ R^{2}=1-\frac{\mathbf{e}^{\prime} \mathbf{e}}{\mathbf{y}^{* \prime} \mathbf{y}^{*}}=1-\frac{\sum_{i}\left(y_{i}-\hat{y}_{i}\right)^{2}}{\sum_{i}\left(y_{i}-\bar{y}\right)^{2}}=1-\frac{\mathrm{SSR}}{\mathrm{SST}}=\frac{\mathrm{SSM}}{\mathrm{SST}} $$

Adjusted $R^2$

$$ \bar{R}^{2}=1-\frac{n-1}{n-k}\left(1-R^{2}\right) $$

Use demeaning matrix $$ \mathbf{A}=\mathbf{I}_{n}-\left(\frac{1}{n}\right) \mathbf{i} \mathbf{i}^{\prime} $$

In [46]:
# Function that builds the N-by-N demeaning (centering) matrix
demeaner <- function(N) {
  # Create an N-by-1 column of 1s
  i <- matrix(data = 1, nrow = N)
  # Create the demeaning matrix
  A <- diag(N) - (1/N) * i %*% t(i)
  # Return A
  return(A)
}
In [47]:
# Start the function
r2_ols <- function(data, y_var, X_vars) {
  # Create y and X matrices
  y <- to_matrix(data, vars = y_var)
  X <- to_matrix(data, vars = X_vars)
  # Add intercept column to X
  X <- cbind(1, X)
  # Find N and K (dimensions of X)
  N <- nrow(X)
  K <- ncol(X)
  # Calculate the OLS residuals
  e <- resid_ols(data, y_var, X_vars)
  # Calculate the y_star (demeaned y)
  y_star <- demeaner(N) %*% y
  # Calculate r-squared values
  r2_uc  <- 1 - t(e) %*% e / (t(y) %*% y)
  r2     <- 1 - t(e) %*% e / (t(y_star) %*% y_star)
  r2_adj <- 1 - (N-1) / (N-K) * (1 - r2)
  # Return a vector of the r-squared measures
  return(c("r2_uc" = r2_uc, "r2" = r2, "r2_adj" = r2_adj))
}
In [48]:
# Our r-squared function
r2_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
r2_uc   0.86984754062892
r2      0.293389123194756
r2_adj  0.273484591453764
In [49]:
felm(price ~ mpg + weight, cars) %>% summary()
Call:
   felm(formula = price ~ mpg + weight, data = cars) 

Residuals:
   Min     1Q Median     3Q    Max 
 -3332  -1858   -504   1256   7507 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1946.0687  3597.0496   0.541  0.59019   
mpg          -49.5122    86.1560  -0.575  0.56732   
weight         1.7466     0.6414   2.723  0.00813 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2514 on 71 degrees of freedom
Multiple R-squared(full model): 0.2934   Adjusted R-squared: 0.2735 
Multiple R-squared(proj model): 0.2934   Adjusted R-squared: 0.2735 
F-statistic(full model):14.74 on 2 and 71 DF, p-value: 4.425e-06 
F-statistic(proj model): 14.74 on 2 and 71 DF, p-value: 4.425e-06 

Overfitting

In [50]:
# Find the number of observations in cars
n <- nrow(cars)
# Fill an n-by-50 matrix with random numbers from N(0,1)
set.seed(12345)
rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50)
# Convert rand_mat to a data.frame
rand_df <- as.data.frame(rand_mat)
# Change names to x1 to x50
names(rand_df) <- paste0("x", 1:50)
# Bind cars and rand_df; convert to a tibble; call it 'cars_rand'
cars_rand <- cbind(cars, rand_df) %>% as_tibble()
# Drop the variable 'rep78' (it has NAs)
cars_rand <- dplyr::select(cars_rand, -rep78, -make)
In [51]:
result_df <- lapply(
  X = 2:ncol(cars_rand),
  FUN = function(k) {
    # Apply the 'r2_ols' function to columns 2:k
    r2_df <- r2_ols(
      data = cars_rand,
      y_var = "price",
      X_vars = names(cars_rand)[2:k]) %>%
      matrix(ncol = 3) %>% as.data.frame()
    # Set names
    names(r2_df) <- c("r2_uc", "r2", "r2_adj")
    # Add a column for k
    r2_df$k <- k
    # Return r2_df
    return(r2_df)
  }) %>% bind_rows()
In [52]:
# Plot unadjusted r-squared
plot(x = result_df$k, y = result_df$r2,
  type = "l", lwd = 3, col = "lightblue",
  xlab = expression(paste("k: number of columns in ", bold("X"))),
  ylab = expression('R' ^ 2))
# Plot adjusted r-squared
lines(x = result_df$k, y = result_df$r2_adj,
  type = "l", lwd = 2, col = "purple4")
# The legend
legend(x = 0, y = max(result_df$r2),
  legend = c(expression('R' ^ 2),
    expression('Adjusted R' ^ 2)),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")

Section 5: Inference and Parallelisation

In [53]:
x <- data.frame(a = c(1:4, NA), b = c("..", 1, 2, 3, ".."))  %>% as_tibble()
x
A tibble: 5 × 2
  a     b
  <int> <fct>
  1     ..
  2     1
  3     2
  4     3
  NA    ..
In [54]:
x[x == ".."] <- NA
x
A tibble: 5 × 2
  a     b
  <int> <fct>
  1     NA
  2     1
  3     2
  4     3
  NA    NA
In [55]:
dir_data = file.path(root, 'section05')
cars <- file.path(dir_data, "auto.csv") %>% read_csv()
Parsed with column specification:
cols(
  make = col_character(),
  price = col_double(),
  mpg = col_double(),
  rep78 = col_double(),
  headroom = col_double(),
  trunk = col_double(),
  weight = col_double(),
  length = col_double(),
  turn = col_double(),
  displacement = col_double(),
  gear_ratio = col_double(),
  foreign = col_double()
)

t-tests

$$ t_{j}=\frac{b_{j}-\bar{\gamma}}{\sqrt{s^{2} \cdot\left\{\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1}\right\}_{jj}}}=\frac{b_{j}-\bar{\gamma}}{\operatorname{se}\left(b_{j}\right)} $$
In [56]:
t_stat <- function(data, y_var, X_vars, gamma, intercept = T) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept if requested
  if (intercept == T) X <- cbind(1, X)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars, intercept)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Force s2 to numeric
  s2 %<>% as.numeric()
  # Inverse of X'X
  XX_inv <- solve(t(X) %*% X)
  # Standard error
  se <- sqrt(s2 * diag(XX_inv))
  # Vector of _t_ statistics
  t_stats <- (b - gamma) / se
  # Return the _t_ statistics
  return(t_stats)
}
In [57]:
t_stat(cars,
  y_var = "price", X_vars = c("mpg", "weight"),
  gamma = 0, intercept = T)
A matrix: 3 × 1 of type dbl
               price
intercept  0.5410180
mpg       -0.5746808
weight     2.7232382
In [58]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% (coefficients)[,3]
(Intercept)   0.541018024504537
mpg          -0.574680792169236
weight        2.7232382300228

%$% pipe

In [59]:
cars %$% cor(price, weight)
0.538611462600479

P-value

$$ p=\operatorname{Pr}\left(t_{\mathrm{df}}>\left|t_{j}\right|\right) \times 2 $$
In [60]:
# The default: lower.tail = TRUE
pt(q = 2, df = 15)
# Setting lower.tail = TRUE
pt(q = 2, df = 15, lower.tail = T)
# Setting lower.tail = FALSE
pt(q = 2, df = 15, lower.tail = F)
0.96802749635764
0.96802749635764
0.0319725036423601
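Combining pt() with the t statistics computed earlier gives the two-sided p-values from the formula above (reusing t_stat and cars from the cells above; here df = n - k = 71):

t_stats <- t_stat(cars, y_var = "price", X_vars = c("mpg", "weight"), gamma = 0)
# Two-sided p-values: Pr(t_df > |t|) * 2
2 * pt(q = abs(t_stats), df = nrow(cars) - 3, lower.tail = FALSE)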
In [61]:
ols <- function(data, y_var, X_vars, intercept = T) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept if requested
  if (intercept == T) X <- cbind(1, X)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars, intercept)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Update s2 to numeric
  s2 %<>% as.numeric()
  # Inverse of X'X
  XX_inv <- solve(t(X) %*% X)
  # Standard error
  se <- sqrt(s2 * diag(XX_inv))
  # Vector of _t_ statistics
  t_stats <- (b - 0) / se
  # Calculate the p-values
  p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
  # Nice table (data.frame) of results
  results <- data.frame(
    # The rows have the coef. names
    effect = rownames(b),
    # Estimated coefficients
    coef = as.vector(b) %>% round(3),
    # Standard errors
    std_error = as.vector(se) %>% round(3),
    # t statistics
    t_stat = as.vector(t_stats) %>% round(3),
    # p-values
    p_value = as.vector(p_values) %>% round(4)
    )
  # Return the results
  return(results)
}
In [62]:
ols(data = cars,
  y_var = "price",
  X_vars = c("mpg", "weight"),
  intercept = T)
A data.frame: 3 × 5
     effect      coef std_error t_stat p_value
  intercept  1946.069  3597.050  0.541  0.5902
        mpg   -49.512    86.156 -0.575  0.5673
     weight     1.747     0.641  2.723  0.0081
In [63]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% coefficients
A matrix: 3 × 4 of type dbl
               Estimate   Std. Error    t value     Pr(>|t|)
(Intercept) 1946.068668 3597.0495988  0.5410180  0.590188628
mpg          -49.512221   86.1560389 -0.5746808  0.567323727
weight         1.746559    0.6413538  2.7232382  0.008129813

F tests

$$ F=\frac{(\mathbf{R} \mathbf{b}-\mathbf{r})^{\prime}\left[\mathbf{R}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{R}^{\prime}\right]^{-1}(\mathbf{R} \mathbf{b}-\mathbf{r}) / J}{s^{2}}=\frac{(\mathbf{R} \mathbf{b})^{\prime}\left[\mathbf{R}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{R}^{\prime}\right]^{-1}(\mathbf{R} \mathbf{b}) / J}{s^{2}} $$
In [64]:
joint_test <- function(data, y_var, X_vars) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept
  X <- cbind(1, X)
  # Name the new column "intercept"
  colnames(X) <- c("intercept", X_vars)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # J is k-1
  J <- k - 1
  # Create the R matrix: bind a column of zeros
  # onto a J-by-J identity matrix
  R <- cbind(0, diag(J))

  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Force s2 to numeric
  s2 %<>% as.numeric()

  # Create the inner matrix R(X'X)^(-1)R'
  RXXR <- R %*% solve(t(X) %*% X) %*% t(R)
  # Calculate the F stat
  f_stat <- t(R %*% b) %*% solve(RXXR) %*% (R %*% b) / J / s2
  # Calculate the p-value
  p_value <- pf(q = f_stat, df1 = J, df2 = n-k, lower.tail = F)
  # Create a data.frame of the f stat. and p-value
  results <- data.frame(
    f_stat = f_stat %>% as.vector(),
    p_value = p_value %>% as.vector())
  return(results)
}
In [65]:
joint_test(data = cars,
  y_var = "price", X_vars = c("mpg", "weight"))
A data.frame: 1 × 2
  f_stat      p_value
14.73982 4.424878e-06
In [66]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% F.fstat
F    14.7398153853841
df1  2
df2  71
p    4.42487795297466e-06

Simulations

In [67]:
# Function to generate data
gen_data <- function(sample_size) {
  # Create data.frame with random x and error
  data_df <- data.frame(
    x = rnorm(sample_size),
    e = rnorm(sample_size))
  # Calculate y = 7 + 0.5 x + e; drop 'e'
  data_df %<>% mutate(y = 7 + 0.5 * x + e) %>%
    select(-e)
  # Return data_df
  return(data_df)
}

one_sim <- function(sample_size) {
  # Estimate via OLS
  ols_est <- ols(data = gen_data(sample_size),
    y_var = "y", X_vars = "x")
  # Grab the estimated coefficient on x
  # (the second element of 'coef')
  b1 <- ols_est %$% coef[2]
  # Grab the second p-value
  # (the first p-value is for the intercept)
  p_value <- ols_est %$% p_value[2]
  # Return a data.frame with b1 and p_value
  return(data.frame(b1, p_value))
}

ols_sim <- function(n_sims, sample_size, seed = 12345) {
  # Set the seed
  set.seed(seed)
  # Run one_sim n_sims times; convert results to data.frame
  sim_df <- replicate(
    n = n_sims,
    expr = one_sim(sample_size),
    simplify = F
    ) %>% bind_rows()
  # Return sim_df
  return(sim_df)
}
In [68]:
# Run ols_sim for sample size of 10
sim10 <- ols_sim(n_sims = 1e3, sample_size = 10)
# Run ols_sim for sample size of 100
sim100 <- ols_sim(n_sims = 1e3, sample_size = 100)

Power visualisations

In [69]:
p1 = ggplot(data = sim10, aes(x = p_value)) +
  stat_density(fill = "grey20", alpha = 0.9) +
  geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
  theme_bw() +
  xlab(expression(paste(italic("p"), "-Value"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", italic(p),
    "-Values from 1,000 simulations with sample size 10")))

p2 = ggplot(data = sim100, aes(x = p_value)) +
  stat_density(fill = "grey20", alpha = 0.9) +
  geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
  xlim(0, 0.1) +
  theme_bw() +
  xlab(expression(paste(italic("p"), "-Value"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", italic(p),
    "-Values from 1,000 simulations with sample size 100")))

(p1 / p2)
In [70]:
p1 = ggplot(data = sim10, aes(x = b1)) +
  stat_density(fill = "grey70") +
  geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
  theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) + 
  xlab(expression(paste(beta[1], " estimate"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", beta[1],
    " estimates from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = b1)) +
  stat_density(fill = "grey70") +
  geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
  theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) + 
  xlab(expression(paste(beta[1], " estimate"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", beta[1],
    " estimates from 1,000 simulations with sample size 100")))
(p1 / p2)

Parallelisation

In [71]:
ols_sim_mc <- function(n_sims, sample_size, seed = 12345) {
  # Require the parallel package
  require(parallel)
  # Set the seed
  set.seed(seed)
  # Run one_sim n_sims times; convert results to data.frame
  sim_df <- mclapply(
    X = rep(x = sample_size, times = n_sims),
    FUN = one_sim,
    # Use all but two of the available cores
    mc.cores = detectCores() - 2
    ) %>% bind_rows()
  # Return sim_df
  return(sim_df)
}

Non-Parallelised

# Run ols_sim for sample size of 10
start1 <- proc.time()
sim10 <- ols_sim(n_sims = 1e4, sample_size = 10)
stop1 <- proc.time()
# Run ols_sim for sample size of 100
start2 <- proc.time()
sim100 <- ols_sim(n_sims = 1e4, sample_size = 100)
stop2 <- proc.time()

Parallelised

# Run ols_sim_mc for sample size of 10
start3 <- proc.time()
sim10_mc <- ols_sim_mc(n_sims = 1e4, sample_size = 10)
stop3 <- proc.time()
# Run ols_sim_mc for sample size of 100
start4 <- proc.time()
sim100_mc <- ols_sim_mc(n_sims = 1e4, sample_size = 100)
stop4 <- proc.time()
In [72]:
# stop1 - start1
# stop2 - start2
# stop3 - start3
# stop4 - start4
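furrr was loaded at the top but never used; as an alternative to mclapply (and one that also works on Windows, where forking is unavailable), here is a sketch of the same simulation using furrr::future_map, assuming one_sim from above:

# A furrr-based version of ols_sim_mc (a sketch)
ols_sim_furrr <- function(n_sims, sample_size, seed = 12345) {
  # Set up parallel workers, leaving two cores free
  plan(multisession, workers = parallel::detectCores() - 2)
  # Run one_sim n_sims times in parallel with parallel-safe RNG
  sim_df <- future_map(
    .x = rep(sample_size, times = n_sims),
    .f = one_sim,
    .options = furrr_options(seed = seed)
    ) %>% bind_rows()
  return(sim_df)
}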

Section 6: Figures

In [73]:
libreq(dplyr, magrittr, ggplot2, ggthemes)
     wants      loaded
[1,] "dplyr"    TRUE  
[2,] "magrittr" TRUE  
[3,] "ggplot2"  TRUE  
[4,] "ggthemes" TRUE  
In [74]:
# Reformat Anscombe's dataset
a_df <- bind_rows(
  dplyr::select(anscombe, x = x1, y = y1),
  dplyr::select(anscombe, x = x2, y = y2),
  dplyr::select(anscombe, x = x3, y = y3),
  dplyr::select(anscombe, x = x4, y = y4))
# Add group identifier
a_df %<>% mutate(group = rep(paste0("Dataset ", 1:4), each = 11))
# The plot
ggplot(data = a_df, aes(x, y)) +
  # Plot the points
  geom_point() +
  # Add the regression line (without S.E. shading)
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  # Change the theme
  theme_pander() +
  theme(panel.border = element_rect(color = "grey90", fill=NA, size=1)) +
  # Plot by group
  facet_wrap(~group, nrow = 2, ncol = 2) +
  ggtitle("Illustrating Anscombe's Quartet",
    subtitle = "Four very different datasets with the same regression line")
In [75]:
# Regress price on weight (with an intercept)
b <- ols(data = cars, y_var = "price", X_vars = "weight") %$% coef
price_hat <- function(weight) b[1] + b[2] * weight

p1 = ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  stat_function(fun = price_hat, colour = 'blue')
p2 = ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")

(p1 / p2)
In [76]:
ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))