In [1]:
options(digits=2)
# R Imports Boilerplate
rm(list=ls())
In [23]:
library(LalRUtils)
load_or_install(c('tidyverse','data.table', 'stargazer','lfe', 'estimatr', 'systemfit', 
  'magrittr','Hmisc', 'rio', 'texreg', 'knitr', 'foreach', 'doParallel', 'gridExtra',
  'rdrobust', 'rdd', 'foreign'))
theme_set(lal_plot_theme())

exp_type = 'text'
  1. TRUE
  2. TRUE
  3. TRUE
  4. TRUE
  5. TRUE
  6. TRUE
  7. TRUE
  8. TRUE
  9. TRUE
  10. TRUE
  11. TRUE
  12. TRUE
  13. TRUE
  14. TRUE
  15. TRUE
  16. TRUE
  17. TRUE
In [3]:
root = '/home/alal/Dropbox/0_GradSchool/0_classes/450b/applied-econometrics-notes/'
data = paste0(root, 'data/')
In [4]:
#%%
# Read the data into a dataframe
pums = read.table(paste0(data, 'asciiqob.txt'),
                  header           = FALSE,
                  stringsAsFactors = FALSE)
names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob')
pums$age <- 80 - pums$yob

Instrumental Variables

Structural Equations view

Endogeneity : $Cov(X_i, \epsilon_i) \neq 0$

Instrumental Variable requirements:

  • Relevance : $Cov(X_i, Z_i) \neq 0$
  • Exclusion Restriction: $Cov(Z_i, \epsilon_i) = 0$

IV estimation

$$ \beta_{IV} = \frac{Cov(Y_i,Z_i)}{Cov(X_i, Z_i)} = \frac{\beta_{RF}}{\beta_{FS}} $$

General form:

$$ \hat{\beta}_{IV} = (X'P_z X)^{-1}X'P_z Y $$

where $P_z$ is the projection matrix of Z:= $Z(Z'Z)^{-1}Z'$

More general estimation strategies:

  • 2 Stage Least Squares:

    • Use OLS to regress X on Z and get $\hat{X} = P_z X$
    • Use OLS to regress y on $\hat{X}$ to get $\hat{\beta}_{iv}$
  • Control Function approach:

    • Use OLS (or some other estimation strategy) to regress X on Z and get estimated residuals $\hat{v} = I - P_z X = M_z X$
    • Use OLS to regress y on X and $\hat{v}$ to get $\hat{\beta}_{iv}$

IV estimation with covariates

conditional independence assumption

$$ {Y_{1i}, Y_{0i}, D_{1i}, D_{0i}} \coprod Z_i | X_i $$

Angrist and Krueger - Compulsory Schooling / Quarter of Birth Instrument

Visual test of Instrument Relevance

Z = Quarter of Birth , Y = wages, X = education

In [15]:
pums.qob.means <- pums %>% group_by(yob, qob) %>% summarise_all(funs(mean))
#%%
# Add dates
pums.qob.means$yqob <- lubridate::ymd(paste0("19",
                                  pums.qob.means$yob,
                                  pums.qob.means$qob * 3),
                           truncated = 2)
In [16]:
# Function for plotting data
plot.qob <- function(ggplot.obj, ggtitle, ylab) {
  gg.colours <- c("firebrick", rep("black", 3), "white")
  ggplot.obj + geom_line()                                              +
               geom_point(aes(colour = factor(qob)),
                              size = 5)                                 +
               geom_text(aes(label = qob, colour = "white"),
                         size  = 3,
                         hjust = 0.5, vjust = 0.5,
                         show_guide = FALSE)                            +
               scale_colour_manual(values = gg.colours, guide = FALSE)  +
               ggtitle(ggtitle)                                         +
               xlab("Year of birth")                                    +
               ylab(ylab)
}
# Plot
p.educ     <- plot.qob(ggplot(pums.qob.means, aes(x = yqob, y = educ)),
                       "A. Average education by quarter of birth (first stage)",
                       "Years of education")
p.lwklywge <- plot.qob(ggplot(pums.qob.means, aes(x = yqob, y = lwklywge)),
                       "B. Average weekly wage by quarter of birth (reduced form)",
                       "Log weekly earnings")

p.ivgraph  <- grid.arrange(p.educ, p.lwklywge)
Warning message:
“`show_guide` has been deprecated. Please use `show.legend` instead.”Warning message:
“`show_guide` has been deprecated. Please use `show.legend` instead.”
In [19]:
### Replication (Table 4.1.1 in MHE)
pums        <- fread(paste0(data, 'asciiqob.txt'),
                     header           = FALSE,
                     stringsAsFactors = FALSE)
names(pums) <- c('lwklywge', 'educ', 'yob', 'qob', 'pob')

qobs      <- unique(pums$qob)
qobs.vars <- sapply(qobs, function(x) paste0('qob', x))
pums[, (qobs.vars) := lapply(qobs, function(x) 1*(qob == x))]
#%%
# Column 1: OLS
col1 <- felm(lwklywge ~ 1|0|(educ~educ)|0, pums)
# Column 2: OLS with YOB, POB dummies
col2 <- felm(lwklywge ~ 1| factor(yob) + factor(pob)|(educ~educ)|0, pums)
#%%
col3 <- felm(lwklywge ~ 1 | 0 | (educ~qob1) |0, data=pums)
# Column 3: 2SLS with instrument QOB = 1
#%%
# Column 4: 2SLS with YOB, POB dummies and instrument QOB = 1
col4 <- felm(lwklywge ~ 1 | 0 |(educ~qob1 + qob2 + qob3)|0 , data=pums)
#%%
# Column 5: 2SLS with YOB, POB dummies and instrument (QOB = 1)
col5 <- felm(lwklywge ~ 1 | factor(yob) + factor(pob) |
                        (educ ~ qob1)|0,data=pums)
# Column 6: 2SLS with YOB, POB dummies and full QOB dummies
col6 <- felm(lwklywge ~ 1 | factor(yob) + factor(pob) |
                        (educ ~ factor(qob))|0,data=pums)
# Column 7: 2SLS with YOB, POB dummies and full QOB dummies interacted with YOB
col7 <- felm(lwklywge ~ 1 | factor(yob) + factor(pob) |
                       (educ ~ factor(qob) * factor(yob)),
              data = pums)
Warning message in chol.default(mat, pivot = TRUE, tol = tol):
“the matrix is either rank-deficient or indefinite”Warning message in chol.default(mat, pivot = TRUE, tol = tol):
“the matrix is either rank-deficient or indefinite”Warning message in chol.default(mat, pivot = TRUE, tol = tol):
“the matrix is either rank-deficient or indefinite”
In [20]:
stargazer(col1,col2,col3,col4,col5,col6,col7,
    keep.stat="n",
    covariate.labels=c('Years of education'),
    add.lines = list(
        c("YOB fixed effects", '', 'X','','','X','X','X' ),
        c("POB fixed effects", '', 'X','','','X','X','X' ),
        c("Instruments",       '', '','','','','' ),
        c("QOB=1",             '', '','X','X','X','X' ),
        c("QOB=2",             '', '','','X','','X' ),
        c("QOB=3",             '', '','','X','','X' ),
        c("QOB X YOB dummies", '', '','','','','','X' )),
    omit= "Constant",
    type=exp_type)
=================================================================================
                                        Dependent variable:                      
                   --------------------------------------------------------------
                                              lwklywge                           
                     (1)      (2)      (3)      (4)      (5)      (6)      (7)   
---------------------------------------------------------------------------------
Years of education 0.071*** 0.067*** 0.100*** 0.100*** 0.100*** 0.110*** 0.087***
                   (0.0003) (0.0003) (0.024)  (0.020)  (0.026)  (0.020)  (0.016) 
                                                                                 
---------------------------------------------------------------------------------
YOB fixed effects              X                          X        X        X    
POB fixed effects              X                          X        X        X    
Instruments                                                                      
QOB=1                                   X        X        X        X             
QOB=2                                            X                 X             
QOB=3                                            X                 X             
QOB X YOB dummies                                                           X    
Observations       329,509  329,509  329,509  329,509  329,509  329,509  329,509 
=================================================================================
Note:                                                 *p<0.1; **p<0.05; ***p<0.01

The Wald Estimator

Structural equation:

$$ Y_i = \alpha + \rho S_i + \eta_i $$

$Cov(\eta_i, s_i) \neq 0$. Let $Z_i \in {0,1}$ s.t. $P(Z_i=1) = p$, $$ Cov(Y_i, Z_i) = \{ E[Y_i|Z_i = 1] - E[Y_i|Z_i = 0]\}p(1-p) $$

Then, $$ \rho = \frac{E[Y_i|Z_i = 1] - E[Y_i|Z_i = 0]}{E[s_i|Z_i = 1] - E[s_i|Z_i=0]} $$

In [23]:
pums$z <- (pums$qob == 3 | pums$qob == 4) * 1

# Compare means (and differences)
ttest.lwklywge <- t.test(lwklywge ~ z, pums)
ttest.educ     <- t.test(educ ~ z, pums)

# Compute Wald estimate
sur  <- systemfit(list(first  = educ ~ z,
                       second = lwklywge ~ z),
                  data   = pums,
                  method = "SUR")
wald <- deltaMethod(sur, "second_z / first_z")

wald.estimate <- (mean(pums$lwklywge[pums$z == 1]) - mean(pums$lwklywge[pums$z == 0])) /
                 (mean(pums$educ[pums$z == 1]) - mean(pums$educ[pums$z == 0]))
# wald.se       <- wald.estimate^2 * ()

# OLS estimate
ols <- lm(lwklywge ~ educ, pums)

# Construct table
lwklywge.row <- c(ttest.lwklywge$estimate[1],
                  ttest.lwklywge$estimate[2],
                  ttest.lwklywge$estimate[2] - ttest.lwklywge$estimate[1])
educ.row     <- c(ttest.educ$estimate[1],
                  ttest.educ$estimate[2],
                  ttest.educ$estimate[2] - ttest.educ$estimate[1])
wald.row.est <- c(NA, NA, wald$Estimate)
wald.row.se  <- c(NA, NA, wald$SE)

ols.row.est <- c(NA, NA, summary(ols)$coef['educ' , 'Estimate'])
ols.row.se  <- c(NA, NA, summary(ols)$coef['educ' , 'Std. Error'])

table           <- rbind(lwklywge.row, educ.row,
                         wald.row.est, wald.row.se,
                         ols.row.est, ols.row.se)
colnames(table) <- c("Born in the 1st or 2nd quarter of year",
                     "Born in the 3rd or 4th quarter of year",
                     "Difference")
rownames(table) <- c("ln(weekly wage)",
                     "Years of education",
                     "Wald estimate",
                     "Wald std error",
                     "OLS estimate",
                     "OLS std error")
In [24]:
table
Born in the 1st or 2nd quarter of yearBorn in the 3rd or 4th quarter of yearDifference
ln(weekly wage) 5.9 5.9 0.01198
Years of education12.7 12.8 0.10569
Wald estimate NA NA 0.11339
Wald std error NA NA 0.02153
OLS estimate NA NA 0.07085
OLS std error NA NA 0.00034

The Local Average Treatment Effect (LATE)

Types of validity in research design:

  • Internal Validity - whether a given design successfully uncovers causal effects for the population being studied
  • External Validity - predictive value of the study's finding in a different context

Setup for LATE THM

Populations in IV

  • Compliers : Subpopulation with $D_{1i} > D_{0i} ; D_{1i} = 1 \land D_{0i} = 0$
  • Defiers: Subpopulation with $D_{1i} = 0 \land D_{0i} = 1$ ; Violates monotonicity - should be $\emptyset$
  • Always takers: Subpopulation with $D_{1i} = D_{0i} = 1$
  • Never takers: Subpopulation with $D_{1i} = D_{0i} = 0$

Define 4 potential outcomes

$$ Y(D = 1, Z = 1); Y(D = 1, Z = 0) ; Y(D = 0, Z = 1); Y(D = 0, Z = 0) $$

LATE Theorem (Angrist, Imbens, Rubin 1996)

Assumptions

  • SUTVA

  • Ignorability of Instrument - Instrument is as good as randomly assigned - it is independent of the vector of potential outcomes and potential treatment assignments $$ [\{ Y_i(d,z); \forall d, z \}, D_{1i}, D_{0i}] \bot z_i $$

  • Exclusion Restriction: $Y(D, 0 ) = Y(D, 1) \; \forall D \in \{0, 1\}$ - no direct effect of $Z$ on $Y$ except through $D$.

$Y_i(d,z) =f(d)$,i.e. only a function of $d$ ; $Y_i(d,0) = Y_i(d,1) \equiv Y_{di}$ for $d \in {0,1}$

Combining these, we can write the ignorability condition as

$$ \left( Y _ { 0 } , Y _ { 1 } , D _ { 0 } , D _ { 1 } \right) \perp Z $$
  • Nonzero Average Effect (Relevance) - First Stage - $Cov(d,z) \neq 0$ ; $E[D_{1i}- D_{0i}] \neq 0$
  • Monotonicity Either $\pi_{1i} \geq 0 \; \forall \; i \lor \pi_{1i} \leq 0 \forall i$ (everyone affected by the instrument are affected in the same way - red flag for jackknife instruments) $\iff D_1 \geq D_0$

Sufficient for a causal interpretation of the reduced form

Given the above conditions, The wald estimand can be interpreted as the causal effect of the endogenous variable of interest on the outcome on those whose treatment status can be changed by the instrument

$$ \begin{aligned} \rho & = \frac{E[Y_i|Z_i = 1] - E[Y_i|Z_i = 0]}{E[s_i|Z_i = 1] - E[s_i|Z_i=0]} \\ & = E[Y_{1i} - Y_{0i} | D_{1i} > D_{0i}] \\ & = E[\rho_i | \pi_{1i} > 0] \end{aligned} $$

LATE is the effect of the treatment on the population of compliers. Not informative about never-takers and always-takers

Treatment Effects

  • Average Treatment Effect (ATE) = $E[Y_{1i}-Y_{0i}]$
  • Intent to Treat (ITT) = $E[Y_{i}|Z_i = 1] - E[Y_{i}|Z_i = 0]$ - difference in outcomes between those who were offered treatment and those who weren't
  • Treatment on Treated (TOT) = Weighted average of effect on always- takers and effect on compliers = $E[Y_{1i} - Y_{0i}|D_i=1]$ - mean effect for those who actually participated in the program
  • Local Average Treatment Effect (LATE) : $E[Y_{1i} - Y_{0i} | D_{1i} > D_{0i}]$ for compliers

Under randomisation and perfect compliance, ITT == ATE

In a randomised trial with one-sided non-compliance: Bloom Result

$$ ToT = E[Y_{1i} - Y_{0i}|D_i = 1] = \frac{E[Y_i|z_i = 1] - E[Y_i|Z_i=0]}{E[D_i|z_i =1]} = \frac{RF}{FS} = \frac{ITT}{Compliance} $$
  • When $D$ is randomised, $Z = D$ and every individual is a complier
  • One-sided noncompliance $D_0 = 0$ - those who are assigned to control do not get treatment

Standard errors

Manually running IV using 2 stages does not give you correct standard errors.

OLS residual variance is the variance of $\eta + \rho(s_i - \hat{s_i})$ while for proper 2SLS standard errors, we want the variance of $\eta_i$ only.

Multiple Instruments

2 mutually exclusive dummy instruments ($z_{1i}, z_{2i}$). Resulting 2SLS_estimate of $\rho$ is a weighted average of causal effects for instrument-specific compliant subpopulations

$$ \rho_j = \frac{Cov(Y_i, z_{ji})}{Cov(D_i, z_{ji})} , j = 1,2 $$

First stage fitted values are $\hat{D_i} = \pi_{11}z_{1i} + \pi_{12}z_{2i}$

$$\rho_{2sls} = \psi \rho_1 + (1-\psi) \rho_2 $$

where

$$ \psi = \frac{\pi_{11} Cov(D_1, Z_{1i})}{\pi_{11}Cov(D_1, Z_{1i})+ \pi_{21}Cov(D_1, Z_{2i})} $$

$\psi$ is a number between 0 and 1 that depends on the relative strength of each instrument in the first stage

2SLS vs LIML

Only relevant When the model is overidentified. 2SLS estimation is done by fitting the first stage and reduced form separately; at each step, $\beta$ and $\pi$ are estimated to minimise errors in that equation. On the other hand, limited information maximum-likelihood jointly minimises the sum of squared errors in both stages.

Overidentification

Basic idea: when model is overidentified, we can test exclusion restriction ($Cov(z, e)=0$) by estimating the model with the two instruments separately and testing whether $Cov(z_1, e_2)=0$ (where $e_2$ is the error term from the structural equation when $x$ is instrumented by $z_2$) and vice versa.

Steps:

  • estimate the model with all available instruments, keep residual $\hat{e}$
  • regress $\hat{e}$ on $z_1$ and $z_2$, store $R^2$.
  • Under the null that overid restrictions are satisfied, $nR^2 \sim \chi^2(q)$, where $q$ is the number of overidentifying restrictions (1 in this case)

Hausman Test

IV comes at a price of less precision in estimates, so better to use OLS_unless it can be shown that IV significantly changes results. Define:

$$ H = \frac{(\hat{\beta}_{2sls} - \hat{\beta}_{OLS})^2} {\hat{V}(\hat{\beta}_{2sls}) - \hat{V}(\hat{\beta}_{OLS})} $$

Under the null that OLS is consistent, $H \sim \chi^2_1$. This is equivalent to testing $\gamma = 0$ in the equation $y = \beta x + \gamma \hat{u} + \epsilon$, where $\hat{u}$ are the residuals from the first stage of IV (regression of x on z).

Bartik

In [36]:
set.seed(6152011)
# Number of cities
L <- 1000
# Number of industries
K <- 3
# National growth rates by  industry
g.raw <- runif(K)
g.k <- g.raw/sum(g.raw)
#%%
CreateCity <- function(g.k, eta.s){
    #  Give each city some industry shares
    K <- length(g.k)
    z.raw <- runif(K)
    z <- z.raw / sum(z.raw)
    # Give each city some idiosyncratic growth by industry
    g.kl.tilde <- g.raw/sum(g.raw)
    # Change in employement is inner product of the national
    # + idiosyncratic component with industry share
    x <- (g.kl.tilde + g.k) %*% z
    # Bartik instrument
    # (national growth & city industry share inner product)
    B <- g.k %*% z
    # Realized change in wage
    y <- (1/eta.s) * x + rnorm(1,0,1/10)
    list("y" = y, "B" = B, "x" = x)
}
# Create a city data frame
city.data <- rep(CreateCity(g.k, 1), L)
df <- data.frame(matrix(unlist(city.data), ncol = 3))
colnames(df) <- c("y", "B", "x")
# Estimates
m.ols <- felm(y ~ x | 0 | 0 | 0, data = df)
m.iv <- felm(y ~ 1 | 0 | (x ~ B) | 0, data = df)

stargazer(m.ols, m.iv, type = exp_type)
===========================================================
                                   Dependent variable:     
                               ----------------------------
                                            y              
                                    (1)            (2)     
-----------------------------------------------------------
x                                -0.500***                 
                                  (0.027)                  
                                                           
`x(fit)`                                        1.000***   
                                                 (0.110)   
                                                           
Constant                          1.000***       -0.0004   
                                  (0.020)        (0.077)   
                                                           
-----------------------------------------------------------
Observations                       1,000          1,000    
R2                                 0.250         -2.000    
Adjusted R2                        0.250         -2.000    
Residual Std. Error (df = 998)     0.180          0.350    
===========================================================
Note:                           *p<0.1; **p<0.05; ***p<0.01

Weak Instruments

Given first stage $s_i = \pi z_i + \epsilon_i$ and structural equation $y_i = \rho s_i + \eta_i$, define $\sigma_{\epsilon \eta} = Cov(\epsilon, \eta)$ and $\sigma_{\epsilon}^2 = Var(\epsilon)$. Let $F$ be the F statistic of the first-stage. Then, bias in IV term is:

$$ \rho_{2sls} - \rho = \frac{\sigma_{\epsilon,\eta}}{\sigma_{\epsilon}^2} \frac{1}{F+1} $$

So, as $F \rightarrow \infty$, bias $\rightarrow 0$.

Bound, Jaeger, and Baker (1995) IV may be biased / inconsistent if the instruments are only weakly correlate with the endogenous variables

$$ \rho = \frac{Cov(Y_i, Z_i)}{Cov(D_i,Z_i)} + \frac{Cov(\eta_i, Z_i)}{Cov(D_i, Z_i)} = \alpha_D + \frac{Cov(\eta_i, Z_i)}{Cov(D_i, Z_i)} $$

The second term gets inflated if the instrument is weak the moment there is slight failures of exogeneity.

$$ plim \hat{\beta}_{iv} = \beta + \frac{plim \Delta \bar{\epsilon}}{plim \Delta \bar{x}} $$$$ \frac{plim \hat{\beta}_{iv} - \beta}{plim \hat{\beta}_{ols} - \beta} = \frac{cov(\hat{x}, \epsilon) / cov(x,\epsilon)}{R^2_{x,z}} $$

Staiger, Stock (1997) : rule of thumb - first-stage F statistic $\geq$ 10 (for 1 endogenous variable)

Montiel-Olea, Pflueger (2013) - construct Effective F Statistic

$$ \hat{F}_{eff} = \frac{1}{S} \frac{x'ZZ'x}{tr(\hat{W}_2)} $$

Inference

Power is linear in compliance ratio.

$$ S E _ { \widetilde { L A T E } } \approx \frac { S E _ { \widehat {I T T } } } { \text { Compliance Ratio } } $$

With 10% compliance, LATE SE is 10x the ITT SE

Abadie's Kappa

Likelihood that Complier has a given value of characteristic X relative to the rest of the population

$$ \frac{E[D | Z=1, X=x]-E[D | Z=0, X=x]}{E[D | Z=1]-E[D | Z=0]} $$

Abadie's $\kappa$

Let $g$ be any measurable function whose conditional expectation is defined

$$ \kappa=1-\frac{D \cdot(1-Z)}{P(Z=0 | X)}-\frac{(1-D) \cdot Z}{P(Z=1 | X)} $$

Then, $$ E[g(Y, D, X) | D_{1}>D_{0}]=\frac{E[\kappa \cdot g(Y, D, X)]}{E[\kappa]} $$

Used to calculate the average of $X$ in complier popn:

$$ E[X | D_{1}>D_{0}]=\frac{E[\kappa \cdot X]}{E[\kappa]} $$
  • Estimate $\kappa$
  • Average $X$ with $\kappa$ weights

Replication of Angrist 1991

In [36]:
draft = import(paste0(data, 'draft.txt'))
colnames(draft) = c('lwage', 'age', 'elig', 'vet')
draft %>% group_by(vet) %>% summarise(lw = mean(lwage)) %>% .$lw -> w
wdiff = w[2] - w[1]
m1 = lm(lwage ~ vet, draft)
In [37]:
stargazer(m1, m1,
  se = list(NULL, robust_se(m1)), 
  add.lines = list(c('SE', 'HC0', 'HC2')),
  type = exp_type, header = F, df = F
)
================================================
                        Dependent variable:     
                    ----------------------------
                               lwage            
                         (1)            (2)     
------------------------------------------------
vet                     -0.021        -0.021    
                       (0.017)        (0.017)   
                                                
Constant               5.400***      5.400***   
                       (0.008)        (0.008)   
                                                
------------------------------------------------
SE                       HC0            HC2     
Observations            10,100        10,100    
R2                      0.0001        0.0001    
Adjusted R2             0.0001        0.0001    
Residual Std. Error     0.700          0.700    
F Statistic             1.500          1.500    
================================================
Note:                *p<0.1; **p<0.05; ***p<0.01
In [38]:
freq_table(draft, 'elig') %>% kable(.)
freq_table(draft, 'vet') %>% kable(.)

| elig|    n| prop|
|----:|----:|----:|
|    0| 7320| 0.72|
|    1| 2780| 0.28|

| vet|    n| prop|
|---:|----:|----:|
|   0| 7863| 0.78|
|   1| 2237| 0.22|
In [39]:
draft %>% group_by(elig) %>% summarise(mu = mean(vet)) %>% .$mu -> mu
always_takers = nrow(filter(draft, elig != 1 & vet == 1)) / nrow(filter(draft, elig != 1))
never_takers = nrow(filter(draft,  elig == 1 & vet == 0)) / nrow(filter(draft, elig == 1))
compliers = mu[2] - mu[1]

shares = data.frame(subpop = c('always takers', 'never takers', 'compliers'),
           share  = c(always_takers, never_takers, compliers))
shares %>% kable()

|subpop        | share|
|:-------------|-----:|
|always takers |  0.19|
|never takers  |  0.69|
|compliers     |  0.12|
In [41]:
ivmod = felm(lwage ~ 1 | 0 | (vet ~ elig) | 0, draft)
stargazer(ivmod, header = F, model.names = F, type = exp_type, df = F,
  column.labels = c('2SLS Estimates'),
  covariate.labels = c('Veteran Status'), keep.stat = c('n'))
==========================================
                   Dependent variable:    
               ---------------------------
                          lwage           
                     2SLS Estimates       
------------------------------------------
Veteran Status           -0.230*          
                         (0.130)          
                                          
Constant                5.500***          
                         (0.029)          
                                          
------------------------------------------
Observations             10,100           
==========================================
Note:          *p<0.1; **p<0.05; ***p<0.01
In [42]:
draft %>% filter(age >= 52) -> a52
fml = as.formula('lwage ~ 1 | 0 | (vet ~ elig) | 0')
fs_all = felm(fml, draft)$coefficients[2]
fs_a52 = felm(fml, a52)$coefficients[2]

(lik = fs_a52/fs_all)
1.62564729249267
In [43]:
# logit for selection
selmod = glm(vet ~ elig, family=binomial(link=logit), draft)
draft$dhat = selmod$fitted.values

draft %<>% mutate(
  κ = 1 - ((vet * (1-elig))/(1-dhat)) - ((1-vet)*elig/dhat))
avg_complier_age = weighted.mean(draft$age, draft$κ)
avg_complier_age
51.3718645391492

Regression Discontinuity

Estimand (Hahn et al 2001)

$$ \alpha_{SRDD} = E[Y_{1}-Y_{0} | X=c] =\lim_{x \downarrow c} E[Y_{1} | X=x]- \lim_{x \uparrow c} E[Y_{0} | X=x] $$

Sharp RD

Treatment changes discontinuously at some particular value $x_0$ of x

$$ D_i = \begin{cases} 1 \; \text{ if} \; x_i \geq x_0, \\ 0 \; \text{ if} \; x_i < x_0, \end{cases} $$

This value $x$ is likely to be arbitrary in the neighbourhood of $x_0$

RD_takes advantage of a discrete change in treatment in response to a small change in the running variable x. Causal inference relies on the (local) arbitrariness of whether unit $i$ has $x_i < x_0 | x_i > x_0$

Formally,

\begin{align*} E[Y_{0i}|x_i] & = \alpha + \beta x_i \\ Y_{1i} = Y_{0i} + \rho \end{align*}

which leads to the regression:

$$ Y_i = \alpha + \beta x_i + \tau D_i + \eta_i $$

To permit nonlinearity, we estimate:

$$ Y_i = f(x_i) + \tau D_i + \eta_i $$

General Setup with normalised running/forcing variable

Define $c := x_0$. The treatment effect can be uncovered by running a pooled regression

$$ Y = \alpha_l +_\tau D + f(X - c) + \epsilon $$

It is recommended to let the regression function differ on the two sides of the cutoff by including interaction terms between D and X.

Then, the pooled regression is

$$ Y = \alpha_l + \tau D + \beta_l (X - c) + (\beta_r - \beta_l) D (X-c) + \epsilon $$

Higher order polynomials can be included to better approximate the regression functions (per Stone Weirstrass thm).

In [27]:
# Generate series
nobs      = 100
x         <- runif(nobs)
y.linear  <- x + (x > 0.5) * 0.25 + rnorm(n = nobs, mean = 0, sd = 0.1)
y.nonlin  <- 0.5 * sin(6 * (x - 0.5)) + 0.5 + (x > 0.5) * 0.25 + rnorm(n = nobs, mean = 0, sd = 0.1)
y.mistake <- 1 / (1 + exp(-25 * (x - 0.5))) + rnorm(n = nobs, mean = 0, sd = 0.1)
rd.series <- data.frame(x, y.linear, y.nonlin, y.mistake)

# Make graph with ggplot2
g.data   <- ggplot(rd.series, aes(x = x, group = x > 0.5))

p.linear <- g.data + geom_point(aes(y = y.linear))  +
                     stat_smooth(aes(y = y.linear),
                                 method = "lm",
                                 se     = FALSE)    +
                     geom_vline(xintercept = 0.5)   +
                     ylab("Outcome")                +
                     ggtitle(bquote('A. Linear E[' * Y["0i"] * '|' * X[i] * ']'))

p.nonlin <- g.data + geom_point(aes(y = y.nonlin))  +
                     stat_smooth(aes(y = y.nonlin),
                                 method  = "lm",
                                 formula = y ~ poly(x, 2),
                                 se      = FALSE)   +
                     geom_vline(xintercept = 0.5)   +
                     ylab("Outcome")                +
                     ggtitle(bquote('B. Nonlinear E[' * Y["0i"] * '|' * X[i] * ']'))

f.mistake <- function(x) {1 / (1 + exp(-25 * (x - 0.5)))}
p.mistake <- g.data + geom_point(aes(y = y.mistake))     +
                      stat_smooth(aes(y = y.mistake),
                                  method = "lm",
                                  se     = FALSE)        +
                      stat_function(fun      = f.mistake,
                                    linetype = "dashed") +
                      geom_vline(xintercept = 0.5)       +
                      ylab("Outcome")                    +
                      ggtitle('C. Nonlinearity mistaken for discontinuity')

grid.arrange(p.linear, p.nonlin, p.mistake, ncol = 1)

Lee (2008)

Regression Formulation

$$ Y_{i t+1} = \alpha + \beta_0 \tilde{X}_{it} + \tau D_{it} + \beta_1 \tilde{X}_{it} D_{it} + \epsilon_i $$
In [5]:
lee = import(paste0(data, 'Lee.dta'))

lee %<>% filter(party == 100) # subset to dems
lee %<>%
  mutate(
    win = 1 * (origvote == highestvote),
    vote_share = origvote / totvote,
    margin = ifelse(win == 1, # if win
                    (origvote - sechighestvote)/totvote, # margin if win
                    (origvote - highestvote)/totvote) # margin if loss
    ) %>%
  arrange(distid, yearel) %>%
  group_by(distid) %>%
  mutate(last_margin = dplyr::lag(margin, 1), # margin in last election
         incumbent   = 1 * (dplyr::lag(win, 1) == 1)
  ) %>%
  ungroup()

Testing Balance across discontinuity

In [31]:
ggplot(lee, aes(x = last_margin)) + geom_histogram(bins = 50) + geom_vline(xintercept = 0)

McCrary (2008)

In [32]:
DCdensity(lee$last_margin, verbose = T)
Assuming cutpoint of zero.
Using calculated bin size:  0.008 
Using calculated bandwidth:  0.200 
Log difference in heights is  0.001  with SE  0.076 
  this gives a z-stat of  0.010 
  and a p value of  0.992 
0.991765996299681

Treatment Assignment

In [6]:
ggplot(lee, aes(x = last_margin, y = incumbent, colour = as.factor(incumbent))) +
  geom_point(alpha = 0.5, size = 0.3) +
  labs(title = 'Incumbency as function of past margin',
      y = bquote(incumbent[t+1]), x = bquote(margin[t]))
Warning message:
“Removed 1664 rows containing missing values (geom_point).”
In [8]:
lee %>% filter(abs(last_margin) <= 0.25) -> cut_25
m1 = lm(vote_share ~ incumbent*last_margin, cut_25)
yhat = predict(m1, cut_25)
stargazer(m1, header = F, type = exp_type, se = list(robust_se(m1)),
  model.names = F,
  keep = c('incumbent$'),  dep.var.caption = 'Vote Share',
  dep.var.labels.include = FALSE,  multicolumn = F,
  omit.stat = c('f', 'adj.rsq', 'ser'))
========================================
                     Vote Share         
             ---------------------------
----------------------------------------
incumbent             0.086***          
                       (0.006)          
                                        
----------------------------------------
Observations            4,133           
R2                      0.410           
========================================
Note:        *p<0.1; **p<0.05; ***p<0.01
In [9]:
df = data.frame(yhat, vote_share = cut_25$vote_share,
  margin_last = cut_25$last_margin, inc = cut_25$incumbent) %>% na.omit()

ggplot(df, aes(x = margin_last)) +
  geom_vline(xintercept = 0, linetype = 'dashed') +
  geom_point(aes(y = vote_share, colour = as.factor(inc)),
    alpha = 0.4, size = 0.4) +
  geom_line(aes(y = yhat, colour = as.factor(inc))) +
  theme(legend.position = "none") +
  labs(title = 'RD estimates of the Incumbency Advantage',
    y = bquote(VoteShare[t+1]), x = bquote(margin[t]))

Different Bandwidths

In [10]:
bws = seq(.05, .95, .05)
coefs = c()
ses   = c()
for(bw in bws) {
  m = lm(vote_share ~ incumbent*last_margin,
      lee %>% filter(abs(last_margin) <= bw))
  coefs = c(coefs, m$coefficients[2])
  ses = c(ses, robust_se(m)[2])
}
ests = data.frame(bw = bws, coef = coefs, se = ses)

ggplot(ests, aes(bw, coef, group = 1)) + geom_point() +
  geom_line(linetype = "dotted") +
  geom_errorbar(aes(ymax = coef + 1.96 * se, ymin = coef - 1.96 * se),
    colour = 'blue', alpha = 0.6) +
  geom_hline(yintercept = 0) +
  scale_x_reverse() +
  labs(title = 'Incumbency advantage as function of cutoffs',
   y = "Effect on Vote Share",
   x = "Bandwidth")

Placebo

In [11]:
lee %<>% group_by(distid) %>% mutate(
    plac_inc_1   = 1 * (last_margin >= -0.15),
    plac_inc_2   = 1 * (last_margin >= -0.075),
    plac_inc_3   = 1 * (last_margin >=  0.075),
    plac_inc_4   = 1 * (last_margin >=  0.15),
  ) %>% ungroup()
# estimate negative placebo effects on losers
m2 = lm(vote_share ~ plac_inc_1 * last_margin,
  lee %>% filter(incumbent == 0)
)

m3 = lm(vote_share ~ plac_inc_2 * last_margin,
  lee %>% filter(incumbent == 0)
)
# estimate positive placebo effects on winners
m4 = lm(vote_share ~ plac_inc_3 * last_margin,
  lee %>% filter(incumbent == 1)
)
m5 = lm(vote_share ~ plac_inc_4 * last_margin,
  lee %>% filter(incumbent == 1)
)

mod_slicer <- function(m) {
  coef = m$coefficients[2]
  se = robust_se(m)[2]
  t  = coef/se
  c(coef, se, t)
}
# stack estimates

plac_res = data.frame(rbind(
  mod_slicer(m2),
  mod_slicer(m3),
  mod_slicer(m4),
  mod_slicer(m5)))
plac_res = cbind(
  c(-.15, -0.075, 0.075, 0.15),
  plac_res
)
colnames(plac_res) = c('cutoff', 'coef', 'se', 't-stat')
plac_res
cutoffcoefset-stat
-0.150 0.000620.0077 0.081
-0.075 -0.006370.0078 -0.818
0.075 0.016860.0093 1.811
0.150 0.000800.0080 0.100
In [12]:
ggplot(plac_res, aes(x = as.factor(cutoff), y = coef)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_errorbar(aes(ymax = coef + 1.96 * se, ymin = coef - 1.96 * se),
    colour = 'blue', alpha = 0.6) +
  labs(
    x = 'cutoff', y = 'estimate', title = 'Placebo Estimates'
  )
In [14]:
rdrobust_senate=read.csv(paste0(data, "rdrobust_senate.csv"))
attach(rdrobust_senate)
The following object is masked from package:tidyr:

    population

RDRobust

In [15]:
### rdplot with 95% confidence intervals
rdplot(y=vote, x=margin, scale= 5, binselect="es", # scale = 2, ci=95,
         title="RD Plot: U.S. Senate Election Data",
         y.label="Vote Share in Election at time t+1",
         x.label="Vote Share in Election at time t")
In [16]:
rdplot(y=vote,x=margin,h=17.7080, subset=margin<=17.7080&margin>=-17.7080,
         binselect="esmv", kernel="triangular", p=1,
         title="RD Plot: U.S. Senate Election Data",
         y.label="Vote Share in Election at time t+1",
         x.label="Vote Share in Election at time t")
In [17]:
summary(rdbwselect(y=vote,x=margin, all = T))
Call: rdbwselect

Number of Obs.                 1297
BW type                         All
Kernel                   Triangular
VCE method                       NN

Number of Obs.                 595         702
Order est. (p)                   1           1
Order bias  (q)                  2           2

=======================================================
                  BW est. (h)    BW bias (b)
            Left of c Right of c  Left of c Right of c
=======================================================
     mserd    17.708     17.708     27.984     27.984
    msetwo    16.154     18.009     27.096     29.205
    msesum    18.326     18.326     31.280     31.280
  msecomb1    17.708     17.708     27.984     27.984
  msecomb2    17.708     18.009     27.984     29.205
     cerrd    12.374     12.374     27.984     27.984
    certwo    11.288     12.585     27.096     29.205
    cersum    12.806     12.806     31.280     31.280
  cercomb1    12.374     12.374     27.984     27.984
  cercomb2    12.374     12.585     27.984     29.205
=======================================================
In [18]:
### rdrobust default
summary(rdrobust(y=vote,x=margin, all = T))
Call: rdrobust

Number of Obs.                 1297
BW type                       mserd
Kernel                   Triangular
VCE method                       NN

Number of Obs.                 595         702
Eff. Number of Obs.            359         322
Order est. (p)                   1           1
Order bias  (p)                  2           2
BW est. (h)                 17.708      17.708
BW bias (b)                 27.984      27.984
rho (h/b)                    0.633       0.633

=============================================================================
        Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
=============================================================================
  Conventional     7.416     1.460     5.078     0.000     [4.554 , 10.278]    
Bias-Corrected     7.510     1.460     5.143     0.000     [4.648 , 10.372]    
        Robust     7.510     1.743     4.310     0.000     [4.094 , 10.925]    
=============================================================================
In [19]:
### rdrobust with covariates
summary(rdrobust(y=vote,x=margin,covs=cbind(class,termshouse,termssenate)))
Call: rdrobust

Number of Obs.                 1108
BW type                       mserd
Kernel                   Triangular
VCE method                       NN

Number of Obs.                 491         617
Eff. Number of Obs.            313         283
Order est. (p)                   1           1
Order bias  (p)                  2           2
BW est. (h)                 17.987      17.987
BW bias (b)                 28.943      28.943
rho (h/b)                    0.621       0.621

=============================================================================
        Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
=============================================================================
  Conventional     6.851     1.408     4.866     0.000     [4.091 , 9.611]     
        Robust         -         -     4.200     0.000     [3.729 , 10.254]    
=============================================================================
In [20]:
### rdrobust with  cluster-robust variance estimation, and underlying data-driven bandwidth selectors.
summary(rdrobust(y=vote,x=margin,cluster=state))
Call: rdrobust

Number of Obs.                 1297
BW type                       mserd
Kernel                   Triangular
VCE method                       NN

Number of Obs.                 595         702
Eff. Number of Obs.            359         320
Order est. (p)                   1           1
Order bias  (p)                  2           2
BW est. (h)                 17.509      17.509
BW bias (b)                 27.032      27.032
rho (h/b)                    0.648       0.648

=============================================================================
        Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
=============================================================================
  Conventional     7.422     1.522     4.875     0.000     [4.438 , 10.406]    
        Robust         -         -     4.266     0.000     [4.091 , 11.046]    
=============================================================================

Fowler (2015)

In [33]:
fowler = import(paste0(data, 'Fowler.dta'))
# subset
fowler %>% filter(rv1 >= -.20 & rv1 <= .20) -> band
band %>% mutate(
  bin = cut(rv1, breaks = seq(-.2, .2, 0.0025)) # .25 basis point increments
  ) %>% group_by(bin) %>%
  summarise(voteshare = mean(voteshare, na.rm = T),
  rv1 = mean(rv1, na.rm = T),
  treat = ifelse(rv1 > 0, 1, 0)) %>% select(bin, voteshare, rv1, treat) %>%
  ungroup() -> groupmeans

fml_4 = as.formula('voteshare ~ treat * poly(rv1, 4, raw = T)')
# fit model with polynomials
m = lm(fml_4, groupmeans)
groupmeans$yhat = predict(m, groupmeans)

# manual plot
ggplot(groupmeans, aes(x = bin, y = voteshare, group = 1, colour = as.factor(treat))) +
  geom_point(size = 0.3) +
  geom_line(data = groupmeans %>% filter(treat == 0), aes(y = yhat)) +
  geom_line(data = groupmeans %>% filter(treat == 1), aes(y = yhat)) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position = "none") +
  ggtitle('Incumbency Advantage RD - Manual')
In [34]:
# identical to canned routine
rdplot(band$voteshare, band$rv1, p = 4,
  nbins = c(80,80), title = 'Incumbency Advantage RD',
  y.label = bquote(VoteShare[t+1]),
  x.label = bquote(margin[t])
)

Fuzzy RD

Discontinuity doesn't necessarily change treatment, but only the probability of treatment. Then, we can use the discontinuity as an instrumental variable for treatment status.

$$ P[D_i = 1 | x_i] = \begin{cases} g_0 (x_i) if x_i < x_0 \\ g_1 (x_i) if x_i \geq x_0 \end{cases} $$

$g_0(x_i) \neq g_1(x_i)$. Assuming $g_1(x_0) > g_0(x_0)$, the probability of treatment relates to $x_i$ via:

$$ E[D_i|x_i] = P[D_i=1|x_i] = g_0(x_i) + [g_1(x_i) - g_0(x_i)] T_i $$

where $T_i = \mathbb{1} (x_i \geq x_0)$ := point of discontinuity

$$ \rho = lim_{\delta \rightarrow 0} \frac{E[Y_i|x_0 < x_i < x_0 + \delta] - E[Y_i|x_0 - \delta < x_i < x_0]} {E[D_i|x_0 < x_i < x_0 + \delta] - E[D_i|x_0 - \delta < x_i < x_0]} $$

Angrist and Lavy (1999)

uses Maimonides' rule:

$$ m_{sc} = \frac{e_s}{int[\frac{e_s - 1}{40}] + 1} $$
In [24]:
# Load the data
grade4 <- read.dta(paste0(data, "final4.dta"))
grade5 <- read.dta(paste0(data, "final5.dta"))
#%%
In [25]:
# Restrict sample
grade4 <- grade4[which(grade4$classize & grade4$classize < 45 & grade4$c_size > 5), ]
grade5 <- grade5[which(grade5$classize & grade5$classize < 45 & grade5$c_size > 5), ]

# Find means class size by grade size
grade4cmeans <- aggregate(grade4$classize,
                          by  = list(grade4$c_size),
                          FUN = mean,
                          na.rm = TRUE)

grade5cmeans <- aggregate(grade5$classize,
                          by  = list(grade5$c_size),
                          FUN = mean,
                          na.rm = TRUE)

# Rename aggregaed columns
colnames(grade4cmeans) <- c("c_size", "classize.mean")
colnames(grade5cmeans) <- c("c_size", "classize.mean")
#%%
In [28]:
# Create function for Maimonides Rule
maimonides.rule <- function(x) {x / (floor((x - 1)/40) + 1)}
#%%
# Plot each grade
g4 <- ggplot(data = grade4cmeans, aes(x = c_size))
p4 <- g4 + geom_line(aes(y = classize.mean))            +
           stat_function(fun      = maimonides.rule,
                         linetype = "dashed")           +
           expand_limits(y = 0)                         +
           scale_x_continuous(breaks = seq(0, 220, 20)) +
           ylab("Class size")                           +
           xlab("Enrollment count")                     +
           ggtitle("B. Fourth grade")

g5 <- ggplot(data = grade5cmeans, aes(x = c_size))
p5 <- g5 + geom_line(aes(y = classize.mean))            +
           stat_function(fun      = maimonides.rule,
                         linetype = "dashed")           +
           expand_limits(y = 0)                         +
           scale_x_continuous(breaks = seq(0, 220, 20)) +
           ylab("Class size")                           +
           xlab("Enrollment count")                     +
           ggtitle("A. Fifth grade")

grid.arrange(p5, p4, ncol = 1)