Graduate Regression notes in R

Worked examples from Ed Rubin's excellent ARE212 repo

In [1]:
rm(list = ls())
library(LalRUtils)
In [2]:
libreq(lfe, rio, MASS, data.table, tidyverse, magrittr, patchwork, parallel,
        furrr)
theme_set(lal_plot_theme())
set.seed(42)
      wants        loaded
 [1,] "lfe"        TRUE  
 [2,] "rio"        TRUE  
 [3,] "MASS"       TRUE  
 [4,] "data.table" TRUE  
 [5,] "tidyverse"  TRUE  
 [6,] "magrittr"   TRUE  
 [7,] "patchwork"  TRUE  
 [8,] "parallel"   TRUE  
 [9,] "furrr"      TRUE  
In [3]:
root = '~/ddr/tut/ARE212'
setwd(root)

Section 1: R Basics

In [4]:
setwd(file.path(root, 'Section01/'))
list.files()
  1. 'auto.csv'
  2. 'auto.dta'
  3. 'README.md'
  4. 'section01.pdf'
  5. 'section01.R'
In [5]:
auto = fread('auto.csv')
auto %>% glimpse
Observations: 74
Variables: 12
$ make         <chr> "AMC Concord", "AMC Pacer", "AMC Spirit", "Buick Century…
$ price        <dbl> 4099, 4749, 3799, 4816, 7827, 5788, 4453, 5189, 10372, 4…
$ mpg          <dbl> 22, 17, 22, 20, 15, 18, 26, 20, 16, 19, 14, 14, 21, 29, …
$ rep78        <dbl> 3, 3, NaN, 3, 4, 3, NaN, 3, 3, 3, 3, 2, 3, 3, 4, 3, 2, 2…
$ headroom     <dbl> 2.5, 3.0, 3.0, 4.5, 4.0, 4.0, 3.0, 2.0, 3.5, 3.5, 4.0, 3…
$ trunk        <dbl> 11, 11, 12, 16, 20, 21, 10, 16, 17, 13, 20, 16, 13, 9, 2…
$ weight       <dbl> 2930, 3350, 2640, 3250, 4080, 3670, 2230, 3280, 3880, 34…
$ length       <dbl> 186, 173, 168, 196, 222, 218, 170, 200, 207, 200, 221, 2…
$ turn         <dbl> 40, 40, 35, 40, 43, 43, 34, 42, 43, 42, 44, 43, 45, 34, …
$ displacement <dbl> 121, 258, 121, 196, 350, 231, 304, 196, 231, 231, 425, 3…
$ gear_ratio   <dbl> 3.58, 2.53, 3.08, 2.93, 2.41, 2.73, 2.87, 2.93, 2.93, 3.…
$ foreign      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
In [6]:
tail(auto)
A data.table: 6 × 12
make            price  mpg  rep78  headroom  trunk  weight  length  turn  displacement  gear_ratio  foreign
Toyota Corona    5719   18      5       2.0     11    2670     175    36           134        3.05        1
VW Dasher        7140   23      4       2.5     12    2160     172    36            97        3.74        1
VW Diesel        5397   41      5       3.0     15    2040     155    35            90        3.78        1
VW Rabbit        4697   25      4       3.0     15    1930     155    35            89        3.78        1
VW Scirocco      6850   25      4       2.0     16    1990     156    36            97        3.78        1
Volvo 260       11995   17      5       2.5     14    3170     193    37           163        2.98        1

Select columns

In [7]:
auto[, .(price, mpg, weight, length)] %>% head
A data.table: 6 × 4
price  mpg  weight  length
 4099   22    2930     186
 4749   17    3350     173
 3799   22    2640     168
 4816   20    3250     196
 7827   15    4080     222
 5788   18    3670     218

Sorting

In [8]:
auto[order(price, -weight)] %>% head
A data.table: 6 × 12
make            price  mpg  rep78  headroom  trunk  weight  length  turn  displacement  gear_ratio  foreign
Merc. Zephyr     3291   20      3       3.5     17    2830     195    43           140        3.08        0
Chev. Chevette   3299   29      3       2.5      9    2110     163    34           231        2.93        0
Chev. Monza      3667   24      2       2.0      7    2750     179    40           151        2.73        0
Toyota Corolla   3748   31      5       3.0      9    2200     165    35            97        3.21        1
Subaru           3798   35      5       2.5     11    2050     164    36            97        3.81        1
AMC Spirit       3799   22    NaN       3.0     12    2640     168    35           121        3.08        0

Summary stats

Single Variable

In [9]:
auto[, .(mean(price), sd(price))]
A data.table: 1 × 2
      V1       V2
6165.257 2949.496

Multiple Variables

In [10]:
auto[, .(means = lapply(.SD, mean), sds = lapply(.SD, sd)), .SDcols = c('price', 'mpg', 'weight')]
A data.table: 3 × 2
   means       sds
6165.257  2949.496
 21.2973  5.785503
3019.459  777.1936

Figures

In [11]:
# The histogram function
with(auto, hist(
  # The variable for the histogram
  x = mpg,
  # The main title
  main = "Distribution of fuel economy",
  # The x-axis label
  xlab = "MPG (miles per gallon)")
# The blue vertical line at the median MPG (lwd is line width)
)
abline(v = median(auto$mpg), col = "blue", lwd = 3)
In [12]:
ggplot(auto, aes(mpg)) + geom_histogram() + geom_vline(xintercept = median(auto$mpg), col = "blue") + 
   labs(title= "Distribution of fuel economy", x = "MPG (miles per gallon)")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Section 2 : Linear Algebra

In [13]:
v1 = 1:5
v2 = c(1,2,3,4,5)
all.equal(v1, v2)
TRUE
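Note that v1 is an integer vector while v2 is a double vector, so the two objects hold the same values but have different storage types; all.equal() compares values up to numerical tolerance, while identical() also requires matching types. A quick check:

typeof(v1)          # "integer"
typeof(v2)          # "double"
identical(v1, v2)   # FALSE: same values, different storage types
all.equal(v1, v2)   # TRUE: values agree up to tolerance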
In [14]:
A <- matrix(data = 1:6, ncol = 2)
Z = matrix(c(
    c(1, 3, 4),
    c(6, 7, 2),
    1:3), 
    3, byrow = T
    )
In [15]:
t(Z) # transpose
A matrix: 3 × 3 of type dbl
1  6  1
3  7  2
4  2  3

Matrix Multiplication

In [16]:
Z %*% A
A matrix: 3 × 2 of type dbl
19  43
26  71
14  32

Inverse

In [17]:
solve(Z)
A matrix: 3 × 3 of type dbl
-1.5454545   0.09090909   2
 1.4545455   0.09090909  -2
-0.4545455  -0.09090909   1

Crossproduct

In [18]:
crossprod(v1, v2)
A matrix: 1 × 1 of type dbl
55
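crossprod(v1, v2) is shorthand for t(v1) %*% v2, so the 1 × 1 result above is just the inner product of the two vectors:

t(v1) %*% v2    # the same 1 x 1 matrix: 55
sum(v1 * v2)    # the same inner product as a scalar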

Decompositions

Eigenvalues

In [19]:
eigen(Z)
eigen() decomposition
$values
[1] 10.4187613  1.3584430 -0.7772043

$vectors
           [,1]       [,2]       [,3]
[1,] -0.3996373  0.4619135 -0.7690618
[2,] -0.8701109 -0.6891769  0.6262312
[3,] -0.2884389  0.5582751 -0.1279784

QR Decomposition

$$ A = QR $$

s.t. $Q$ is an orthogonal matrix (i.e., $Q'Q = QQ' = I$, so $Q' = Q^{-1}$) and $R$ is upper-triangular.

In [20]:
qrd = qr(Z)
qr.Q(qrd)
qr.R(qrd)
A matrix: 3 × 3 of type dbl
-0.1622214   0.8964464  -0.41239305
-0.9733285  -0.2140768  -0.08247861
-0.1622214   0.3880141   0.90726471
A matrix: 3 × 3 of type dbl
-6.164414  -7.624407  -3.0822070
 0.000000   1.966830   4.3216745
 0.000000   0.000000   0.9072647
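As a quick sanity check (using Z and qrd from the cell above): Q should be orthogonal, and the product QR should reproduce Z up to floating-point error.

round(crossprod(qr.Q(qrd)), 12)        # Q'Q is (numerically) the identity
all.equal(qr.Q(qrd) %*% qr.R(qrd), Z)  # Q R recovers Z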

Section 3: Functions

In [21]:
cars = import(file.path(root, 'Section03/auto.dta'))
In [22]:
b_ols = function(df, y, X){
    d = setDT(df)
    y_d = d[[y]] 
    X_d = cbind(1, as.matrix(d[, ..X]))
    β_hat = solve(crossprod(X_d, X_d)) %*% crossprod(X_d, y_d)
    β_hat
}
In [23]:
b_ols(cars, 'price', c('mpg', 'weight'))
A matrix: 3 × 1 of type dbl
1946.068668
mpg -49.512221
weight 1.746559
In [24]:
lm(price ~ mpg + weight, cars)
Call:
lm(formula = price ~ mpg + weight, data = cars)

Coefficients:
(Intercept)          mpg       weight  
   1946.069      -49.512        1.747  

Lapply

In [25]:
target_vars <- c("price", "headroom", "trunk", "length", "turn")
In [26]:
results_list <- lapply(
  X = target_vars,
  FUN = function(i) b_ols(df = cars, y = i, X = c("mpg", "weight"))
  )
results_list
  1. A matrix: 3 × 1 of type dbl
     1946.068668
     mpg      -49.512221
     weight     1.746559
  2. A matrix: 3 × 1 of type dbl
     1.7943225731
     mpg      -0.0098904309
     weight    0.0004668253
  3. A matrix: 3 × 1 of type dbl
     5.849262628
     mpg      -0.082739270
     weight    0.003202433
  4. A matrix: 3 × 1 of type dbl
     120.11619444
     mpg      -0.35546594
     weight    0.02496695
  5. A matrix: 3 × 1 of type dbl
     27.323996368
     mpg      -0.059092537
     weight    0.004498541
In [27]:
# A function to calculate bias
data_baker <- function(sample_n, true_beta) {
  # First generate x from N(0,1)
  x <- rnorm(sample_n)
  # Now the error from N(0,1)
  e <- rnorm(sample_n)
  # Now combine true_beta, x, and e to get y
  y <- true_beta[1] + true_beta[2] * x + e
  # Define the data matrix of independent vars.
  X <- cbind(1, x)
  # Force y to be a matrix
  y <- matrix(y, ncol = 1)
  # Calculate the OLS estimates
  b_ols <- solve(t(X) %*% X) %*% t(X) %*% y
  # Convert b_ols to vector
  b_ols <- b_ols %>% as.vector()
  # Calculate bias, force to 2x1 data.frame()
  the_bias <- (true_beta - b_ols) %>%
    matrix(ncol = 2) %>% data.frame()
  # Set names
  names(the_bias) <- c("bias_intercept", "bias_x")
  # Return the bias
  return(the_bias)
}
In [28]:
# Set seed
set.seed(12345)
# Run once
data_baker(sample_n = 100, true_beta = c(1, 3))
A data.frame: 1 × 2
bias_intercept       bias_x
   -0.02205339  -0.09453503
In [29]:
# A function to run the simulation
bias_simulator <- function(n_sims, sample_n, true_beta) {
  # Run data_baker() n_sims times with given parameters
  sims_dt <- lapply(
    X = 1:n_sims,
    FUN = function(i) data_baker(sample_n, true_beta)) %>%
    # Bind the rows together to output a nice data.frame
    bind_rows()

  # Return sim_dt
  return(sims_dt)
}
In [30]:
# Set seed
set.seed(12345)
# Run it
sim_dt <- bias_simulator(n_sims = 1e4, sample_n = 100, true_beta = c(1,3))
# Check the results with a histogram
hist(sim_dt[,2],
  breaks = 30,
  main = "Is OLS unbiased?",
  xlab = "Bias")
# Emphasize the zero line
abline(v = 0, col = "blue", lwd = 3)
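As a numerical check on the same simulation (using the sim_dt object created above), the average bias in both coefficients should be very close to zero:

# Mean bias across the 10,000 simulated samples
colMeans(sim_dt)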

Section 4: FWL and model fit

In [31]:
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Require the 'dplyr' package
  require(dplyr)

  # Create the y matrix
  y <- data %>%
    # Select y variable data from 'data'
    select({{y_var}}) %>%
    # Convert y_data to matrices
    as.matrix()

  # Create the X matrix
  X <- data %>%
    # Select X variable data from 'data'
    select({{X_vars}})

  # If 'intercept' is TRUE, then add a column of ones
  # and move the column of ones to the front of the matrix
  if (intercept == T) {
    # Bind on a column of ones
    X <- cbind(1, X)
    # Name the column of ones
    names(X) <- c("ones", X_vars)
    }

  # Convert X_data to a matrix
  X <- as.matrix(X)

  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y

  # If 'intercept' is TRUE:
  # change the name of 'ones' to 'intercept'
  if (intercept == T) rownames(beta_hat) <- c("intercept", X_vars)

  # Return beta_hat
  return(beta_hat)
}
In [32]:
n = 100
the_data <- tibble(
  x = rnorm(n),
  e = rnorm(n))
# Calculate y = 3 + 1.5 x + e
the_data <- mutate(the_data, y = 3 + 1.5 * x + e)
# Plot to make sure things are going well.
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")
In [33]:
# Run b_ols with an intercept
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
felm(y ~ x, data = the_data)
A matrix: 2 × 1 of type dbl
                 y
intercept 3.120907
x         1.366898
(Intercept)           x 
      3.121       1.367 
In [34]:
# The estimates
b_w <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_wo <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)
# Plot the points
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")
# Plot the line from the 'with intercept' regression in yellow
abline(a = b_w[1], b = b_w[2], col = "lightblue", lwd = 3)
# Plot the line from the 'without intercept' regression in purple
abline(a = 0, b = b_wo[1], col = "purple4", lwd = 2)
# Add a legend
legend(x = min(the_data$x), y = max(the_data$y),
  legend = c("with intercept", "w/o intercept"),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")

FWL Thm

For an arbitrary regression

$$ \mathbf{y}=\mathbf{X}_{1} \beta_{1}+\mathbf{X}_{2} \beta_{2}+\varepsilon $$

FWL states that we can estimate $\beta_1$ with $b_1$ using

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$

Now, define $M_2$ as the annihilator matrix, and define $X_1^* := M_2 X_1$ and $y^* := M_2 y$. These are the residualised data matrices on $X_2$. Then, we can also write

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{* \prime} \mathbf{X}_{1}^{*}\right)^{-1} \mathbf{X}_{1}^{* \prime} \mathbf{y}^{*} $$
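A minimal sketch of this residualised form (with hypothetical matrices y, X1, and X2; in practice the intercept column lives in X2) builds the annihilator matrix $M_2$ and applies the formula directly:

# A sketch of FWL via the annihilator matrix (hypothetical inputs y, X1, X2)
fwl_b1 <- function(y, X1, X2) {
  # M2 = I - X2 (X2'X2)^(-1) X2' residualises whatever it multiplies on X2
  M2 <- diag(nrow(X2)) - X2 %*% solve(crossprod(X2)) %*% t(X2)
  X1_star <- M2 %*% X1   # X1 residualised on X2
  y_star  <- M2 %*% y    # y residualised on X2
  # b1 = (X1*' X1*)^(-1) X1*' y*
  solve(crossprod(X1_star)) %*% crossprod(X1_star, y_star)
}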
In [35]:
to_matrix <- function(the_df, vars) the_df %>% dplyr::select({{vars}}) %>% as.matrix()
In [36]:
b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Create the y matrix
  y <- to_matrix(the_df = data, vars = y_var)
  X <- to_matrix(the_df = data, vars = X_vars)
  # If 'intercept' is TRUE, then add a column of ones
  if (intercept == T) {
    # Bind a column of ones to X
    X <- cbind(1, X)
    # Name the new column "intercept"
    colnames(X) <- c("intercept", X_vars)
  }
  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
  # Return beta_hat
  return(beta_hat)
}
In [37]:
b_ols(the_data, y_var = "y", X_vars = "x", intercept = T)
A matrix: 2 × 1 of type dbl
                 y
intercept 3.120907
x         1.366898
In [38]:
resid_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  y <- to_matrix(the_df = data, vars = y_var)
  X <- to_matrix(the_df = data, vars = X_vars)
  if (intercept == T) {
    X <- cbind(1, X)
    colnames(X) <- c("intercept", X_vars)
  }
  n <- nrow(X)
  resids <- (diag(n) - X %*% solve(t(X) %*% X) %*% t(X)) %*% y
  return(resids)
}

To estimate $\beta_1$ in the regression

$$ y = X_1 \beta_1 + X_2 \beta_2 + \epsilon $$

via FWL:
  • Regress $y$ on $X_2$
  • Save the residuals; call them $e_{yX_2}$
  • Regress $X_1$ on $X_2$
  • Save the residuals; call them $e_{X_1 X_2}$
  • Regress $e_{yX_2}$ on $e_{X_1 X_2}$ (without an intercept)
In [39]:
# Steps 1 and 2: Residualize 'price' on 'weight' and an intercept
e_yx <- resid_ols(data = cars, y_var = "price",
  X_vars = "weight", intercept = T)
# Steps 3 and 4: Residualize 'mpg' on 'weight' and an intercept
e_xx <- resid_ols(data = cars, y_var = "mpg",
  X_vars = "weight", intercept = T)
# Combine the two sets of residuals into a data.frame
e_df <- data.frame(e_yx = e_yx[,1], e_xx = e_xx[,1])
# Step 5: Regress e_yx on e_xx without an intercept
b_ols(data = e_df, y_var = "e_yx",
  X_vars = "e_xx", intercept = F)
A matrix: 1 × 1 of type dbl
          e_yx
e_xx -49.51222
In [40]:
b_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
A matrix: 3 × 1 of type dbl
                price
intercept 1946.068668
mpg         -49.512221
weight        1.746559

Omitted Variables Bias

$$ \mathbf{b}_{1}=\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{y}-\left(\mathbf{X}_{1}^{\prime} \mathbf{X}_{1}\right)^{-1} \mathbf{X}_{1}^{\prime} \mathbf{X}_{2} \mathbf{b}_{2} $$

We can recover the multivariate OLS estimate of $\beta_1$ by running the univariate regression of $y$ on $X_1$ and subtracting a correction term. This correction term is zero when $X_1$ and $X_2$ are uncorrelated / orthogonal.

In [41]:
library(MASS)
In [42]:
v_cov <- matrix(data = c(1, 0.95, 0.95, 1), nrow = 2)
# Create the means vector
means <- c(5, 10)
# Define our sample size
n <- 1e5
# Set the seed
set.seed(12345)
# Generate x1 and x2
X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T)
In [43]:
# Create a data.frame for our data and add an error term drawn from N(0,1)
the_data <- as.data.frame(X) %>% mutate(e = rnorm(n))
# Set the names
names(the_data) <- c("x1", "x2", "e")
# The data-generating process
the_data <- the_data %>% mutate(y = 1 + 2 * x1 + 3 * x2 + e)
In [44]:
# Regression 1: y on int, x1, and x2
b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T)
A matrix: 3 × 1 of type dbl
                 y
intercept 1.012006
x1        1.995968
x2        3.000975
In [45]:
# Regression 2: y on int and x1
b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T)
# Regression 3: y on int and x2
b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T)
A matrix: 2 × 1 of type dbl
                  y
intercept 16.767124
x1         4.846894
A matrix: 2 × 1 of type dbl
                  y
intercept -7.969850
x2         4.897144
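To tie these regressions back to the omitted-variable formula above, the following sketch (reusing the_data, to_matrix, and b_ols from earlier cells) recovers the long-regression coefficients on the intercept and x1 by subtracting the correction term from the short regression of y on x1:

# X1 holds the intercept and x1; X2 holds the omitted regressor x2
X1 <- cbind(1, to_matrix(the_data, "x1"))
X2 <- to_matrix(the_data, "x2")
y  <- to_matrix(the_data, "y")
# b2: coefficient on x2 from the full (long) regression
b2 <- b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"))["x2", ]
# Short-regression estimate minus the correction term = long-regression b1
solve(crossprod(X1)) %*% crossprod(X1, y) -
  solve(crossprod(X1)) %*% crossprod(X1, X2) * b2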

$\text{R}^2$ and variations

Uncentered $R^2$

$$ R_{\mathrm{uc}}^{2}=\frac{\hat{\mathbf{y}}^{\prime} \hat{\mathbf{y}}}{\mathbf{y}^{\prime} \mathbf{y}}=1-\frac{\mathbf{e}^{\prime} \mathbf{e}}{\mathbf{y}^{\prime} \mathbf{y}}=1-\frac{\sum_{i}\left(y_{i}-\hat{y}_{i}\right)^{2}}{\sum_{i}\left(y_{i}-0\right)^{2}}=1-\frac{\mathrm{SSR}}{\mathrm{SST}_{0}} $$

Centered $R^2$

$$ R^{2}=1-\frac{\mathbf{e}^{\prime} \mathbf{e}}{\mathbf{y}^{* \prime} \mathbf{y}^{*}}=1-\frac{\sum_{i}\left(y_{i}-\hat{y}_{i}\right)^{2}}{\sum_{i}\left(y_{i}-\bar{y}\right)^{2}}=1-\frac{\mathrm{SSR}}{\mathrm{SST}}=\frac{\mathrm{SSM}}{\mathrm{SST}} $$

Adjusted $R^2$

$$ \bar{R}^{2}=1-\frac{n-1}{n-k}\left(1-R^{2}\right) $$

Use demeaning matrix $$ \mathbf{A}=\mathbf{I}_{n}-\left(\frac{1}{n}\right) \mathbf{i} \mathbf{i}^{\prime} $$

In [46]:
# Function that builds the N-by-N demeaning (centering) matrix
demeaner <- function(N) {
  # Create an N-by-1 column of 1s
  i <- matrix(data = 1, nrow = N)
  # Create the demeaning matrix
  A <- diag(N) - (1/N) * i %*% t(i)
  # Return A
  return(A)
}
In [47]:
# Start the function
r2_ols <- function(data, y_var, X_vars) {
  # Create y and X matrices
  y <- to_matrix(data, vars = y_var)
  X <- to_matrix(data, vars = X_vars)
  # Add intercept column to X
  X <- cbind(1, X)
  # Find N and K (dimensions of X)
  N <- nrow(X)
  K <- ncol(X)
  # Calculate the OLS residuals
  e <- resid_ols(data, y_var, X_vars)
  # Calculate the y_star (demeaned y)
  y_star <- demeaner(N) %*% y
  # Calculate r-squared values
  r2_uc  <- 1 - t(e) %*% e / (t(y) %*% y)
  r2     <- 1 - t(e) %*% e / (t(y_star) %*% y_star)
  r2_adj <- 1 - (N-1) / (N-K) * (1 - r2)
  # Return a vector of the r-squared measures
  return(c("r2_uc" = r2_uc, "r2" = r2, "r2_adj" = r2_adj))
}
In [48]:
# Our r-squared function
r2_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight"))
r2_uc   0.86984754062892
r2      0.293389123194756
r2_adj  0.273484591453764
In [49]:
felm(price ~ mpg + weight, cars) %>% summary()
Call:
   felm(formula = price ~ mpg + weight, data = cars) 

Residuals:
   Min     1Q Median     3Q    Max 
 -3332  -1858   -504   1256   7507 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1946.0687  3597.0496   0.541  0.59019   
mpg          -49.5122    86.1560  -0.575  0.56732   
weight         1.7466     0.6414   2.723  0.00813 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2514 on 71 degrees of freedom
Multiple R-squared(full model): 0.2934   Adjusted R-squared: 0.2735 
Multiple R-squared(proj model): 0.2934   Adjusted R-squared: 0.2735 
F-statistic(full model):14.74 on 2 and 71 DF, p-value: 4.425e-06 
F-statistic(proj model): 14.74 on 2 and 71 DF, p-value: 4.425e-06 

Overfitting

In [50]:
# Find the number of observations in cars
n <- nrow(cars)
# Fill an n-by-50 matrix with random numbers from N(0,1)
set.seed(12345)
rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50)
# Convert rand_mat to a data.frame
rand_df <- as.data.frame(rand_mat)
# Change names to x1 to x50
names(rand_df) <- paste0("x", 1:50)
# Bind cars and rand_df; convert to a tibble; call it 'cars_rand'
cars_rand <- cbind(cars, rand_df) %>% as_tibble()
# Drop the variable 'rep78' (it has NAs)
cars_rand <- dplyr::select(cars_rand, -rep78, -make)
In [51]:
result_df <- lapply(
  X = 2:ncol(cars_rand),
  FUN = function(k) {
    # Apply the 'r2_ols' function to columns 2:k
    r2_df <- r2_ols(
      data = cars_rand,
      y_var = "price",
      X_vars = names(cars_rand)[2:k]) %>%
      matrix(ncol = 3) %>% as.data.frame()
    # Set names
    names(r2_df) <- c("r2_uc", "r2", "r2_adj")
    # Add a column for k
    r2_df$k <- k
    # Return r2_df
    return(r2_df)
  }) %>% bind_rows()
In [52]:
# Plot unadjusted r-squared
plot(x = result_df$k, y = result_df$r2,
  type = "l", lwd = 3, col = "lightblue",
  xlab = expression(paste("k: number of columns in ", bold("X"))),
  ylab = expression('R' ^ 2))
# Plot adjusted r-squared
lines(x = result_df$k, y = result_df$r2_adj,
  type = "l", lwd = 2, col = "purple4")
# The legend
legend(x = 0, y = max(result_df$r2),
  legend = c(expression('R' ^ 2),
    expression('Adjusted R' ^ 2)),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")

Section 5: Inference and Parallelisation

In [53]:
x <- data.frame(a = c(1:4, NA), b = c("..", 1, 2, 3, ".."))  %>% as_tibble()
x
A tibble: 5 × 2
  a     b
  <int> <fct>
  1     ..
  2     1
  3     2
  4     3
  NA    ..
In [54]:
x[x == ".."] <- NA
x
A tibble: 5 × 2
  a     b
  <int> <fct>
  1     NA
  2     1
  3     2
  4     3
  NA    NA
In [55]:
dir_data = file.path(root, 'section05')
cars <- file.path(dir_data, "auto.csv") %>% read_csv()
Parsed with column specification:
cols(
  make = col_character(),
  price = col_double(),
  mpg = col_double(),
  rep78 = col_double(),
  headroom = col_double(),
  trunk = col_double(),
  weight = col_double(),
  length = col_double(),
  turn = col_double(),
  displacement = col_double(),
  gear_ratio = col_double(),
  foreign = col_double()
)

t-tests

$$ t_{j}=\frac{b_{j}-\bar{\gamma}}{\sqrt{s^{2} \cdot\left\{\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1}\right\}_{jj}}}=\frac{b_{j}-\bar{\gamma}}{\operatorname{se}\left(b_{j}\right)} $$
In [56]:
t_stat <- function(data, y_var, X_vars, gamma, intercept = T) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept if requested
  if (intercept == T) X <- cbind(1, X)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars, intercept)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Force s2 to numeric
  s2 %<>% as.numeric()
  # Inverse of X'X
  XX_inv <- solve(t(X) %*% X)
  # Standard error
  se <- sqrt(s2 * diag(XX_inv))
  # Vector of _t_ statistics
  t_stats <- (b - gamma) / se
  # Return the _t_ statistics
  return(t_stats)
}
In [57]:
t_stat(cars,
  y_var = "price", X_vars = c("mpg", "weight"),
  gamma = 0, intercept = T)
A matrix: 3 × 1 of type dbl
               price
intercept  0.5410180
mpg       -0.5746808
weight     2.7232382
In [58]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% (coefficients)[,3]
(Intercept)   0.541018024504537
mpg          -0.574680792169236
weight        2.7232382300228

%$% pipe

In [59]:
cars %$% cor(price, weight)
0.538611462600479

P-value

$$ p=\operatorname{Pr}\left(t_{\mathrm{df}}>\left|t_{j}\right|\right) \times 2 $$
In [60]:
# The default: lower.tail = TRUE
pt(q = 2, df = 15)
# Setting lower.tail = TRUE
pt(q = 2, df = 15, lower.tail = T)
# Setting lower.tail = FALSE
pt(q = 2, df = 15, lower.tail = F)
0.96802749635764
0.96802749635764
0.0319725036423601
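Combining pt() with the t statistics computed earlier gives the two-sided p-values from the formula above (reusing t_stat and cars from the cells above; here df = n - k = 71):

t_stats <- t_stat(cars, y_var = "price", X_vars = c("mpg", "weight"), gamma = 0)
# Two-sided p-values: Pr(t_df > |t|) * 2
2 * pt(q = abs(t_stats), df = nrow(cars) - 3, lower.tail = FALSE)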
In [61]:
ols <- function(data, y_var, X_vars, intercept = T) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept if requested
  if (intercept == T) X <- cbind(1, X)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars, intercept)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Update s2 to numeric
  s2 %<>% as.numeric()
  # Inverse of X'X
  XX_inv <- solve(t(X) %*% X)
  # Standard error
  se <- sqrt(s2 * diag(XX_inv))
  # Vector of _t_ statistics
  t_stats <- (b - 0) / se
  # Calculate the p-values
  p_values = pt(q = abs(t_stats), df = n-k, lower.tail = F) * 2
  # Nice table (data.frame) of results
  results <- data.frame(
    # The rows have the coef. names
    effect = rownames(b),
    # Estimated coefficients
    coef = as.vector(b) %>% round(3),
    # Standard errors
    std_error = as.vector(se) %>% round(3),
    # t statistics
    t_stat = as.vector(t_stats) %>% round(3),
    # p-values
    p_value = as.vector(p_values) %>% round(4)
    )
  # Return the results
  return(results)
}
In [62]:
ols(data = cars,
  y_var = "price",
  X_vars = c("mpg", "weight"),
  intercept = T)
A data.frame: 3 × 5
     effect      coef std_error t_stat p_value
  intercept  1946.069  3597.050  0.541  0.5902
        mpg   -49.512    86.156 -0.575  0.5673
     weight     1.747     0.641  2.723  0.0081
In [63]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% coefficients
A matrix: 3 × 4 of type dbl
               Estimate   Std. Error    t value     Pr(>|t|)
(Intercept) 1946.068668 3597.0495988  0.5410180  0.590188628
mpg          -49.512221   86.1560389 -0.5746808  0.567323727
weight         1.746559    0.6413538  2.7232382  0.008129813

F tests

$$ F=\frac{(\mathbf{R} \mathbf{b}-\mathbf{r})^{\prime}\left[\mathbf{R}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{R}^{\prime}\right]^{-1}(\mathbf{R} \mathbf{b}-\mathbf{r}) / J}{s^{2}}=\frac{(\mathbf{R} \mathbf{b})^{\prime}\left[\mathbf{R}\left(\mathbf{X}^{\prime} \mathbf{X}\right)^{-1} \mathbf{R}^{\prime}\right]^{-1}(\mathbf{R} \mathbf{b}) / J}{s^{2}} $$
In [64]:
joint_test <- function(data, y_var, X_vars) {
  # Turn data into matrices
  y <- to_matrix(data, y_var)
  X <- to_matrix(data, X_vars)
  # Add intercept
  X <- cbind(1, X)
  # Name the new column "intercept"
  colnames(X) <- c("intercept", X_vars)
  # Calculate n and k for degrees of freedom
  n <- nrow(X)
  k <- ncol(X)
  # J is k-1
  J <- k - 1
  # Create the R matrix: bind a column of zeros
  # onto a J-by-J identity matrix
  R <- cbind(0, diag(J))

  # Estimate coefficients
  b <- b_ols(data, y_var, X_vars)
  # Calculate OLS residuals
  e <- y - X %*% b
  # Calculate s^2
  s2 <- (t(e) %*% e) / (n-k)
  # Force s2 to numeric
  s2 %<>% as.numeric()

  # Create the inner matrix R(X'X)^(-1)R'
  RXXR <- R %*% solve(t(X) %*% X) %*% t(R)
  # Calculate the F stat
  f_stat <- t(R %*% b) %*% solve(RXXR) %*% (R %*% b) / J / s2
  # Calculate the p-value
  p_value <- pf(q = f_stat, df1 = J, df2 = n-k, lower.tail = F)
  # Create a data.frame of the f stat. and p-value
  results <- data.frame(
    f_stat = f_stat %>% as.vector(),
    p_value = p_value %>% as.vector())
  return(results)
}
In [65]:
joint_test(data = cars,
  y_var = "price", X_vars = c("mpg", "weight"))
A data.frame: 1 × 2
  f_stat      p_value
14.73982 4.424878e-06
In [66]:
felm(price ~ mpg + weight, cars) %>%
  summary() %$% F.fstat
F    14.7398153853841
df1  2
df2  71
p    4.42487795297466e-06

Simulations

In [67]:
# Function to generate data
gen_data <- function(sample_size) {
  # Create data.frame with random x and error
  data_df <- data.frame(
    x = rnorm(sample_size),
    e = rnorm(sample_size))
  # Calculate y = 7 + 0.5 x + e; drop 'e'
  data_df %<>% mutate(y = 7 + 0.5 * x + e) %>%
    select(-e)
  # Return data_df
  return(data_df)
}

one_sim <- function(sample_size) {
  # Estimate via OLS
  ols_est <- ols(data = gen_data(sample_size),
    y_var = "y", X_vars = "x")
  # Grab the estimated coefficient on x
  # (the second element of 'coef')
  b1 <- ols_est %$% coef[2]
  # Grab the second p-value
  # (the first p-value is for the intercept)
  p_value <- ols_est %$% p_value[2]
  # Return a data.frame with b1 and p_value
  return(data.frame(b1, p_value))
}

ols_sim <- function(n_sims, sample_size, seed = 12345) {
  # Set the seed
  set.seed(seed)
  # Run one_sim n_sims times; convert results to data.frame
  sim_df <- replicate(
    n = n_sims,
    expr = one_sim(sample_size),
    simplify = F
    ) %>% bind_rows()
  # Return sim_df
  return(sim_df)
}
In [68]:
# Run ols_sim for sample size of 10
sim10 <- ols_sim(n_sims = 1e3, sample_size = 10)
# Run ols_sim for sample size of 100
sim100 <- ols_sim(n_sims = 1e3, sample_size = 100)

Power visualisations

In [69]:
p1 = ggplot(data = sim10, aes(x = p_value)) +
  stat_density(fill = "grey20", alpha = 0.9) +
  geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
  theme_bw() +
  xlab(expression(paste(italic("p"), "-Value"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", italic(p),
    "-Values from 1,000 simulations with sample size 10")))

p2 = ggplot(data = sim100, aes(x = p_value)) +
  stat_density(fill = "grey20", alpha = 0.9) +
  geom_vline(xintercept = 0.05, color = "deeppink2", size = 1) +
  xlim(0, 0.1) +
  theme_bw() +
  xlab(expression(paste(italic("p"), "-Value"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", italic(p),
    "-Values from 1,000 simulations with sample size 100")))

(p1 / p2)
In [70]:
p1 = ggplot(data = sim10, aes(x = b1)) +
  stat_density(fill = "grey70") +
  geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
  theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) + 
  xlab(expression(paste(beta[1], " estimate"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", beta[1],
    " estimates from 1,000 simulations with sample size 10")))
p2 = ggplot(data = sim100, aes(x = b1)) +
  stat_density(fill = "grey70") +
  geom_vline(xintercept = 0.5, color = "darkviolet", size = 1) +
  theme_bw() + xlim(c(min(sim10$b1), max(sim10$b1))) + 
  xlab(expression(paste(beta[1], " estimate"))) +
  ylab("Density") +
  ggtitle(expression(paste("Distribution of ", beta[1],
    " estimates from 1,000 simulations with sample size 100")))
(p1 / p2)

Parallelisation

In [71]:
ols_sim_mc <- function(n_sims, sample_size, seed = 12345) {
  # Require the parallel package
  require(parallel)
  # Set the seed
  set.seed(seed)
  # Run one_sim n_sims times; convert results to data.frame
  sim_df <- mclapply(
    X = rep(x = sample_size, times = n_sims),
    FUN = one_sim,
    # Use all but two of the available cores
    mc.cores = detectCores() - 2
    ) %>% bind_rows()
  # Return sim_df
  return(sim_df)
}

Non-Parallelised

# Run ols_sim for sample size of 10
start1 <- proc.time()
sim10 <- ols_sim(n_sims = 1e4, sample_size = 10)
stop1 <- proc.time()
# Run ols_sim for sample size of 100
start2 <- proc.time()
sim100 <- ols_sim(n_sims = 1e4, sample_size = 100)
stop2 <- proc.time()

Parallelised

# Run ols_sim_mc for sample size of 10
start3 <- proc.time()
sim10_mc <- ols_sim_mc(n_sims = 1e4, sample_size = 10)
stop3 <- proc.time()
# Run ols_sim_mc for sample size of 100
start4 <- proc.time()
sim100_mc <- ols_sim_mc(n_sims = 1e4, sample_size = 100)
stop4 <- proc.time()
In [72]:
# stop1 - start1
# stop2 - start2
# stop3 - start3
# stop4 - start4
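furrr was loaded at the top but never used; as an alternative to mclapply (and one that also works on Windows, where forking is unavailable), here is a sketch of the same simulation using furrr::future_map, assuming one_sim from above:

# A furrr-based version of ols_sim_mc (a sketch)
ols_sim_furrr <- function(n_sims, sample_size, seed = 12345) {
  # Set up parallel workers, leaving two cores free
  plan(multisession, workers = parallel::detectCores() - 2)
  # Run one_sim n_sims times in parallel with parallel-safe RNG
  sim_df <- future_map(
    .x = rep(sample_size, times = n_sims),
    .f = one_sim,
    .options = furrr_options(seed = seed)
    ) %>% bind_rows()
  return(sim_df)
}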

Section 6: Figures

In [73]:
libreq(dplyr, magrittr, ggplot2, ggthemes)
     wants      loaded
[1,] "dplyr"    TRUE  
[2,] "magrittr" TRUE  
[3,] "ggplot2"  TRUE  
[4,] "ggthemes" TRUE  
In [74]:
# Reformat Anscombe's dataset
a_df <- bind_rows(
  dplyr::select(anscombe, x = x1, y = y1),
  dplyr::select(anscombe, x = x2, y = y2),
  dplyr::select(anscombe, x = x3, y = y3),
  dplyr::select(anscombe, x = x4, y = y4))
# Add group identifier
a_df %<>% mutate(group = rep(paste0("Dataset ", 1:4), each = 11))
# The plot
ggplot(data = a_df, aes(x, y)) +
  # Plot the points
  geom_point() +
  # Add the regression line (without S.E. shading)
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  # Change the theme
  theme_pander() +
  theme(panel.border = element_rect(color = "grey90", fill=NA, size=1)) +
  # Plot by group
  facet_wrap(~group, nrow = 2, ncol = 2) +
  ggtitle("Illustrating Anscombe's Quartet",
    subtitle = "Four very different datasets with the same regression line")
In [75]:
# Regress price on weight (with an intercept)
b <- ols(data = cars, y_var = "price", X_vars = "weight") %$% coef
price_hat <- function(weight) b[1] + b[2] * weight

p1 = ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  stat_function(fun = price_hat, colour = 'blue')
p2 = ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")

(p1 / p2)
In [76]:
ggplot(data = cars, aes(x = weight, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))