options(digits=2)
# R Imports Boilerplate
rm(list=ls())
library(LalRUtils)
invisible(load_or_install(c('tidyverse','data.table', 'stargazer','lfe', 'estimatr',
'magrittr','Hmisc', 'rio', 'texreg', 'knitr', 'foreach', 'doParallel')))
theme_set(theme_bw())
exp_type = 'text'
root = '/home/alal/Dropbox/0_GradSchool/0_classes/450b/applied-econometrics-notes/'
data = paste0(root, 'data/')
$Y_i$ is the observed outcome and $D_i$ is the treatment indicator.
$$ Y_i = \begin{cases} Y_{1i} & \text{if } D_i = 1 \\ Y_{0i} & \text{if } D_i = 0 \end{cases} $$
We observe
$$ Y_i = D_i \cdot Y_{1i} + (1 - D_i) \cdot Y_{0i} $$
In other words, $$ Y_i = Y_{0i} + (Y_{1i} - Y_{0i}) D_i $$
Fundamental Problem of Causal Inference: we never see both potential outcomes for any given unit.
Randomisation solves the selection problem by ensuring $$(Y_{1i} , Y_{0i}) \coprod D_i$$
Selection bias: $Cov(D_i, \eta_i) \; \neq 0$
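A quick simulation (purely hypothetical data-generating process, for illustration) makes both points concrete: when units select into treatment based on $Y_{0i}$, the naive difference in means is biased for the treatment effect; under random assignment it is not.
set.seed(42) # hypothetical DGP, for illustration only
n_sim = 1e4
tau   = 2                      # true constant treatment effect
y0    = rnorm(n_sim)           # potential outcome under control
y1    = y0 + tau               # potential outcome under treatment
d_sel  = as.numeric(y0 + rnorm(n_sim) > 0) # selection: high-y0 units opt in
d_rand = rbinom(n_sim, 1, 0.5)             # random assignment
obs = function(d) y0 + (y1 - y0) * d       # the switching equation above
mean(obs(d_sel)[d_sel == 1]) - mean(obs(d_sel)[d_sel == 0])     # biased upward
mean(obs(d_rand)[d_rand == 1]) - mean(obs(d_rand)[d_rand == 0]) # approx. tau
coef(lm(obs(d_rand) ~ d_rand))['d_rand'] # regression recovers the same difference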
Estimating equation:
Treatment group: $y_i = x^T \beta + \epsilon_i$; expected value of the treatment group: $E(y|T) = x^T\beta + E(\epsilon_i|T)$
Control group: $y_i = x^C \beta + \epsilon_i$; expected value of the control group: $E(y|C) = x^C\beta + E(\epsilon_i|C)$
Because of randomisation, $E(\epsilon_i|T) = E(\epsilon_i|C)$, which implies $$ E(y|T) - E(y|C) = (x^T - x^C)\beta $$
Causal parameter of interest:
$$ \beta = \frac{E(y|T)-E(y|C)}{x^T-x^C} $$
For most experiments with an A/B testing structure, $x^T = 1$ and $x^C = 0$, so the treatment effect collapses to a simple difference in means, $\beta = E(y|T)-E(y|C)$.
Olken (2007)
olken <- import(paste0(data, 'olken_data.csv'))
olken %>% dplyr::select(pct_missing, treat_invite) %>%
group_by(treat_invite) %>% summarise_all(mean, na.rm=T) ->
groupmeans
groupmeans
(diffmeans = groupmeans[2,2] - groupmeans[1,2])
The Neyman standard error of the difference in means is
$$ \widehat{SE} = \sqrt{ \frac{\widehat{\operatorname{Var}}[Y_{1i}]}{N_1} + \frac{\widehat{\operatorname{Var}}[Y_{0i}]}{N_0} } $$
where
$$ \widehat{\operatorname{Var}}[Y_{1i}] \equiv \frac{1}{N_1 - 1} \sum_{i | D_i = 1}^{N} \left( Y_{1i} - \frac{\sum_{i | D_i = 1}^{N} Y_{1i}}{N_1} \right)^2 = \widehat{\sigma}^2_{Y | D_i = 1} $$
$$ \widehat{\operatorname{Var}}[Y_{0i}] \equiv \frac{1}{N_0 - 1} \sum_{i | D_i = 0}^{N} \left( Y_{0i} - \frac{\sum_{i | D_i = 0}^{N} Y_{0i}}{N_0} \right)^2 = \widehat{\sigma}^2_{Y | D_i = 0} $$
olken %>% dplyr::select(pct_missing, treat_invite) %>%
na.omit() %>%
group_by(treat_invite) %>% summarise(
mu = mean(pct_missing),
sigma_2 = var(pct_missing),
nobs = n()
) -> sumcalc
sumcalc
se_hat = sqrt(
sumcalc$sigma_2[2] / sumcalc$nobs[2] +
sumcalc$sigma_2[1] / sumcalc$nobs[1])
print(c(diffmeans, se_hat))
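estimatr (already loaded above) ships a difference_in_means() function that implements this Neyman estimator directly; as a cross-check on the manual calculation:
# Neyman difference-in-means with the unequal-variance SE, via estimatr
dim_est = difference_in_means(pct_missing ~ treat_invite, data = olken)
dim_est # point estimate and SE should match the manual numbers above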
tt = t.test(pct_missing ~ treat_invite, data = olken, var.equal = FALSE)
tto <- broom::tidy(tt)
kable(tto %>% dplyr::select('estimate1', 'estimate2', 'statistic', 'p.value', 'conf.low', 'conf.high'), type = exp_type)
This is identical to HC2 standard errors.
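A quick check of that claim: lm_robust defaults to HC2, and the Welch standard error from the t-test above should match it (the stderr component of an htest object requires R >= 3.6).
m_hc2 = lm_robust(pct_missing ~ treat_invite, data = olken, se_type = 'HC2')
c(welch = tt$stderr, hc2 = m_hc2$std.error[['treat_invite']])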
outcome = 'pct_missing'
treatment = 'treat_invite'
covars = setdiff(colnames(olken), c(outcome, treatment))
fmla_base = formula_stitcher('pct_missing', 'treat_invite')
# without controls
m2_norob = lm(fmla_base, data = olken) # classical (homoskedastic) SEs
m2 = lm_robust(fmla_base, data = olken) # HC2
print(screenreg(list(m2_norob, m2), include.ci = F, caption="Regression Estimates of SATE"))
# base with controls
fmla_full = formula_stitcher(outcome, c(treatment, covars))
m3 = lm_robust(fmla_full, data = olken)
# base with all pairwise interactions between controls
RHS = paste0('treat_invite + ', '(', paste(covars,collapse='+'), ')^', 2)
lfml = as.formula(paste0(outcome, '~', RHS))
m4 = lm_robust(lfml, data = olken %>% na.omit())
# base with demeaned controls and interactions with treatment
# demean covariates
olken %>% na.omit() %>%
mutate_at(covars, ~ c(scale(., scale = FALSE))) -> olken2
RHS2 = paste0('treat_invite * ' , '(', paste(covars,collapse='+'), ')')
lfml2 = as.formula(paste0(outcome, '~', RHS2))
m5 = lm_robust(lfml2, data = olken2)
print(screenreg(list(m3, m4, m5), include.ci = F, caption="Regression Estimates of SATE"))
Freedman (2008) critique
Regression adjustment of the form $$ Y_i = \alpha + \tau_{reg} D_i + \beta_1 X_i + \epsilon_i $$ is biased in small samples and can hurt asymptotic precision when treatment effects are heterogeneous.
Lin (2013) Fixes
$$ Y_i = \alpha + \tau_{\text{interact}} D_i + \beta_1 \cdot (X_i - \overline{X}) + \beta_2 \cdot D_i \cdot (X_i - \overline{X}) + \epsilon_i $$
This has the same small-sample bias, but it cannot hurt asymptotic precision even if the model is incorrect, and it will likely increase precision if the covariates are predictive of the outcomes.
For $J$ blocks,
Point estimate $$ \hat { \tau } _ { B } = \sum _ { j = 1 } ^ { J } \frac { N _ { j } } { N } \hat { \tau } _ { j } $$
Variance $$ \operatorname { Var } \left( \hat { \tau } _ { B } \right) = \sum _ { j = 1 } ^ { J } \left( \frac { N _ { j } } { N } \right) ^ { 2 } \operatorname { Var } \left( \hat { \tau } _ { j } \right) $$
Regression Formulation
$$ y_i = \tau D_i + \sum_{j=2}^{J} \beta_j \cdot B_{ij} + \epsilon_i $$
where $R^2_j$ is the fit from regressing $D$ on all $B_j$ dummies.
Since $R^2_j \approx 0$ (because of randomisation),
$$ V[\hat{\tau}_{BR}] < V[\hat{\tau}_{CR}] \iff \frac{SSR_{\widehat{\varepsilon}^*}}{n-k-1} < \frac{SSR_{\widehat{\varepsilon}}}{n-2} $$
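A sketch on simulated data (block structure, sizes, and effect all hypothetical) showing that the weighted block estimator $\hat{\tau}_B$ and the block-dummy regression give the same point estimate when the treated share is equal across blocks:
set.seed(1) # hypothetical blocked experiment
J = 4; Nj = 50
blk = rep(1:J, each = Nj)
D   = unlist(lapply(1:J, function(j) sample(rep(0:1, Nj/2)))) # complete randomisation within block
y   = 1 + 0.5 * blk + 2 * D + rnorm(J * Nj)
bdat = data.frame(y, D, blk)
# weighted block estimator: sum_j (N_j / N) * tau_hat_j
tau_j = sapply(1:J, function(j)
  with(subset(bdat, blk == j), mean(y[D == 1]) - mean(y[D == 0])))
sum((Nj / (J * Nj)) * tau_j)
# identical point estimate from the regression with block dummies
coef(lm(y ~ D + factor(blk), data = bdat))['D']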
$\delta = \mu_1 - \mu_0$ (effect size)
For common variance $\sigma$,
$$ \pi = \Pr(|t| > 1.96) = \Phi\left( -1.96 - \frac{\delta \sqrt{N}}{2\sigma} \right) + \left( 1 - \Phi\left( 1.96 - \frac{\delta \sqrt{N}}{2\sigma} \right) \right) $$
This yields the minimum detectable effect:
$$ \operatorname{MDE}(\delta) = M_{n-2} \sqrt{ \frac{\sigma^2}{N p (1-p)} } $$
where $M_{n-2} = t_{1-\alpha/2} + t_{1-\psi}$, with $\psi$ the type-II error rate (so power $= 1 - \psi$)
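A sketch of the closed-form MDE on the Olken data, reusing sumcalc from above ($\alpha = 0.05$ and power $0.8$ are conventional choices; pooling the two group variances is an assumption):
alpha = 0.05; psi = 0.20 # type-II error rate, so power = 0.8
n  = sum(sumcalc$nobs)
p  = sumcalc$nobs[2] / n # share treated
s2 = weighted.mean(sumcalc$sigma_2, sumcalc$nobs) # pooled variance (assumption)
M  = qt(1 - alpha/2, df = n - 2) + qt(1 - psi, df = n - 2)
(mde = M * sqrt(s2 / (n * p * (1 - p))))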
General case with unequal variances:
$$ \pi = \Phi\left( -1.96 - \frac{\delta}{\sqrt{\frac{\sigma_1^2}{pN} + \frac{\sigma_0^2}{(1-p)N}}} \right) + \left( 1 - \Phi\left( 1.96 - \frac{\delta}{\sqrt{\frac{\sigma_1^2}{pN} + \frac{\sigma_0^2}{(1-p)N}}} \right) \right) $$
olken = na.omit(olken) # drop incomplete rows first so the group shares sum to one
N = nrow(olken)
olken %>% group_by(treat_invite) %>% summarise(
n = n(), sigma_2 = var(pct_missing),
sigma_2_n = sigma_2 / n,
share = n/N
) -> sumstats
# store sigmas and p from original data
sigma_treat = sumstats$sigma_2[2]
sigma_control = sumstats$sigma_2[1] # variance, not variance/n: powerfunc divides by N itself
share_treat = sumstats$share[2]
powerfunc <- function(delta, N,
sigma_t=sigma_treat, sigma_c=sigma_control, p = share_treat){
denom = sqrt(sigma_t/(p * N) + sigma_c/((1-p) * N))
pnorm(-1.96 - delta/denom) + (1 - pnorm(1.96 - delta/denom))
}
mdes = seq(0, 0.3, 0.0005)
pow_250 = powerfunc(mdes, 250)
pow_500 = powerfunc(mdes, 500)
df = data.frame(mdes, pow_250, pow_500)
p = ggplot(df, aes(x = mdes)) +
geom_line(aes(y=pow_250, col = '250')) +
geom_line(aes(y=pow_500, col = '500')) +
labs(
title='Power ~ Minimum Detectable Size',
x = bquote(delta),
y = 'Power',
colour = 'N'
)
p
(mde_250 = unique(abs(df$mdes[which(between(df$pow_250, 0.795, 0.805))])))
(mde_500 = unique(abs(df$mdes[which(between(df$pow_500, 0.795, 0.805))])))
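Instead of reading the MDE off a grid, one can solve power($\delta$) = 0.8 directly with a root-finder; a sketch using uniroot (the bracketing interval is chosen by eye for these data):
mde_solve = function(N, target = 0.8)
  uniroot(function(d) powerfunc(d, N) - target, interval = c(0, 1))$root
c(mde_250 = mde_solve(250), mde_500 = mde_solve(500))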
# fisher exact
"%ni%" <- Negate("%in%") # function to negate selection
Short ref: Imbens (2001)
A test statistic $T$ is a known function $T(\textbf{W}, \textbf{Y}^{obs}, \textbf{X})$ of assignments, observed outcomes, and pretreatment variables.
Comparing the value of the statistic for the realised assignment, say $T^{obs}$, with its distribution over the assignment mechanism leads to an assessment of the probability of observing a value of $T$ as extreme as, or more extreme than, what was observed. This allows calculation of the p-value for the maintained hypothesis of a specific null set of unit treatment values.
Complete randomisation of $2N$ units with $N$ treated. ${2N\choose N}$ assignment vectors. $P$ value can be as small as $1 / {2N \choose N}$.
$\Omega$ is the full set of randomisation realisations, and $\omega$ is an element in the set, with associated probability $1/{2N \choose N}$
'Sharp' null: $\tau_i = 0$, i.e. $Y_{1i} = Y_{0i} \; \forall i$
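For small experiments the randomisation distribution can be enumerated exactly rather than simulated; a toy sketch (hypothetical outcomes, 10 units with 5 treated, so ${10 \choose 5} = 252$ assignments):
y_toy = c(3, 1, 4, 1, 5, 9, 2, 6, 5, 3) # hypothetical outcomes
t_obs = mean(y_toy[1:5]) - mean(y_toy[6:10]) # realised assignment: first 5 treated
assignments = combn(10, 5) # every possible treated set
tau_all = apply(assignments, 2, function(tr) mean(y_toy[tr]) - mean(y_toy[-tr]))
mean(tau_all >= t_obs) # one-sided exact p-value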
Numerical approximation (when ${2N \choose N}$ is too large to enumerate):
numerical_fisher <- function(nrep, df=olken){
N = nrow(df)
ntreat = df %>% filter(treat_invite == 1) %>% nrow()
registerDoParallel(detectCores()) # parallelise across cores
tau_stack = foreach(i = 1:nrep, .combine = c) %dopar% {
treated = base::sample(1:N, ntreat, replace = FALSE) # draw a random treated set
tau_i = (mean(df[treated, ]$pct_missing) -
mean(df[(1:N) %ni% treated, ]$pct_missing))
tau_i
}
return(tau_stack)
}
taus = numerical_fisher(nrep=1e5)
tau_stack = data.frame(id = 1:(length(taus)), tau_i = taus)
tau_stack %>% head()
# calculate SATE
olken %>% select(pct_missing, treat_invite) %>%
group_by(treat_invite) %>% summarise_all(mean, na.rm=T) ->
groupmeans
diffmeans = as.numeric(groupmeans[2,2] - groupmeans[1,2])
p = ggplot(tau_stack, aes(x=tau_i)) +
geom_histogram(aes(y = after_stat(density)), colour="black", fill="white", bins = 50) +
geom_vline(xintercept = diffmeans, colour = 'red') +
labs(
title = 'Treatment effects under Fisher\'s exact Randomisation',
x = bquote('Simulated ' ~ tau[omega])
)
print(p)
# one-sided p-value: share of simulated taus at least as large as the observed SATE
tau_stack %>% filter(tau_i >= diffmeans) %>% nrow() -> numer
(pval = numer/nrow(tau_stack))
# the other tail
tau_stack %>% filter(tau_i <= diffmeans) %>% nrow() -> nr
nr/nrow(tau_stack)
1 - pval
Monte Carlo standard error of the simulated p-value (since this is no longer exact):
sqrt((pval * (1-pval)) / 1e5)
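which gives an approximate 95% Monte Carlo confidence interval for the simulated p-value:
# normal-approximation interval around the simulated p-value
pval + c(-1.96, 1.96) * sqrt(pval * (1 - pval) / 1e5)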