options(digits=2)
# R Imports Boilerplate
rm(list=ls())
library(LalRUtils)
invisible(load_or_install(c('tidyverse','data.table', 'stargazer','lfe', 'estimatr',
'magrittr','Hmisc', 'rio', 'texreg', 'knitr', 'foreach', 'doParallel')))
theme_set(theme_bw())
exp_type = 'text'
root = '/home/alal/Dropbox/0_GradSchool/0_classes/450b/applied-econometrics-notes/'
data = paste0(root, 'data/')
$Y_i$ is the observed outcome and $D_i$ is the treatment indicator.
$$ Y_i = \begin{cases} Y_{1i} & \text{if } D_i = 1 \\ Y_{0i} & \text{if } D_i = 0 \end{cases} $$
We observe
$$ Y_i = D_i \cdot Y_{1i} + (1 - D_i) \cdot Y_{0i} $$
In other words, $$ Y_i = Y_{0i} + (Y_{1i} - Y_{0i}) D_i $$
Fundamental Problem of Causal Inference: we never see both potential outcomes for any given unit.
Randomisation solves the selection problem by ensuring $$(Y_{1i} , Y_{0i}) \coprod D_i$$
Selection bias: $Cov(D_i, \eta_i) \; \neq 0$
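A quick simulation (purely hypothetical data-generating process, for illustration) makes both points concrete: when units select into treatment based on $Y_{0i}$, the naive difference in means is biased for the treatment effect; under random assignment it is not.
set.seed(42) # hypothetical DGP, for illustration only
n_sim = 1e4
tau   = 2                      # true constant treatment effect
y0    = rnorm(n_sim)           # potential outcome under control
y1    = y0 + tau               # potential outcome under treatment
d_sel  = as.numeric(y0 + rnorm(n_sim) > 0) # selection: high-y0 units opt in
d_rand = rbinom(n_sim, 1, 0.5)             # random assignment
obs = function(d) y0 + (y1 - y0) * d       # the switching equation above
mean(obs(d_sel)[d_sel == 1]) - mean(obs(d_sel)[d_sel == 0])     # biased upward
mean(obs(d_rand)[d_rand == 1]) - mean(obs(d_rand)[d_rand == 0]) # approx. tau
coef(lm(obs(d_rand) ~ d_rand))['d_rand'] # regression recovers the same difference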
Estimating equation:
Treatment group: $y_i = x^T \beta + \epsilon_i$; expected value of the treatment group: $E(y|T) = x^T\beta + E(\epsilon_i|T)$
Control group: $y_i = x^C \beta + \epsilon_i$; expected value of the control group: $E(y|C) = x^C\beta + E(\epsilon_i|C)$
Because of randomisation, $E(\epsilon_i|T) = E(\epsilon_i|C)$, which implies $$ E(y|T) - E(y|C) = (x^T - x^C)\beta $$
Causal parameter of interest:
$$ \beta = \frac{E(y|T)-E(y|C)}{x^T-x^C} $$
For most experiments with an A/B testing structure, $x^T = 1$ and $x^C = 0$, so the treatment effect collapses to a simple difference in means, $\beta = E(y|T)-E(y|C)$.
Olken (2007)
olken <- import(paste0(data, 'olken_data.csv'))
olken %>% dplyr::select(pct_missing, treat_invite) %>%
group_by(treat_invite) %>% summarise_all(mean, na.rm=T) ->
groupmeans
groupmeans
(diffmeans = groupmeans[2,2] - groupmeans[1,2])
The Neyman standard error of the difference in means is
$$ \widehat{SE} = \sqrt{ \frac{\widehat{\operatorname{Var}}[Y_{1i}]}{N_1} + \frac{\widehat{\operatorname{Var}}[Y_{0i}]}{N_0} } $$
where
$$ \widehat{\operatorname{Var}}[Y_{1i}] \equiv \frac{1}{N_1 - 1} \sum_{i | D_i = 1}^{N} \left( Y_{1i} - \frac{\sum_{i | D_i = 1}^{N} Y_{1i}}{N_1} \right)^2 = \widehat{\sigma}^2_{Y | D_i = 1} $$
$$ \widehat{\operatorname{Var}}[Y_{0i}] \equiv \frac{1}{N_0 - 1} \sum_{i | D_i = 0}^{N} \left( Y_{0i} - \frac{\sum_{i | D_i = 0}^{N} Y_{0i}}{N_0} \right)^2 = \widehat{\sigma}^2_{Y | D_i = 0} $$
olken %>% dplyr::select(pct_missing, treat_invite) %>%
na.omit() %>%
group_by(treat_invite) %>% summarise(
mu = mean(pct_missing),
sigma_2 = var(pct_missing),
nobs = n()
) -> sumcalc
sumcalc
se_hat = sqrt(
sumcalc$sigma_2[2] / sumcalc$nobs[2] +
sumcalc$sigma_2[1] / sumcalc$nobs[1])
print(c(diffmeans, se_hat))
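estimatr (already loaded above) ships a difference_in_means() function that implements this Neyman estimator directly; as a cross-check on the manual calculation:
# Neyman difference-in-means with the unequal-variance SE, via estimatr
dim_est = difference_in_means(pct_missing ~ treat_invite, data = olken)
dim_est # point estimate and SE should match the manual numbers above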
tt = t.test(pct_missing ~ treat_invite, data = olken, var.equal = FALSE)
tto <- broom::tidy(tt)
kable(tto %>% dplyr::select('estimate1', 'estimate2', 'statistic', 'p.value', 'conf.low', 'conf.high'), type = exp_type)
This is identical to HC2 standard errors.
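A quick check of that claim: lm_robust defaults to HC2, and the Welch standard error from the t-test above should match it (the stderr component of an htest object requires R >= 3.6).
m_hc2 = lm_robust(pct_missing ~ treat_invite, data = olken, se_type = 'HC2')
c(welch = tt$stderr, hc2 = m_hc2$std.error[['treat_invite']])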
outcome = 'pct_missing'
treatment = 'treat_invite'
covars = setdiff(colnames(olken), c(outcome, treatment))
fmla_base = formula_stitcher('pct_missing', 'treat_invite')
# without controls
m2_norob = lm(fmla_base, data = olken) # classical (homoskedastic) SEs
m2 = lm_robust(fmla_base, data = olken) # HC2
print(screenreg(list(m2_norob, m2), include.ci = F, caption="Regression Estimates of SATE"))
# base with controls
fmla_full = formula_stitcher(outcome, c(treatment, covars))
m3 = lm_robust(fmla_full, data = olken)
# base with all pairwise interactions between controls
RHS = paste0('treat_invite + ', '(', paste(covars,collapse='+'), ')^', 2)
lfml = as.formula(paste0(outcome, '~', RHS))
m4 = lm_robust(lfml, data = olken %>% na.omit())
# base with demeaned controls and interactions with treatment
# demean covariates
olken %>% na.omit() %>%
mutate_at(covars, ~ c(scale(., scale = FALSE))) -> olken2
RHS2 = paste0('treat_invite * ' , '(', paste(covars,collapse='+'), ')')
lfml2 = as.formula(paste0(outcome, '~', RHS2))
m5 = lm_robust(lfml2, data = olken2)
print(screenreg(list(m3, m4, m5), include.ci = F, caption="Regression Estimates of SATE"))
Freedman (2008) critique
Regression adjustment of the form $$ Y_i = \alpha + \tau_{reg} D_i + \beta_1 X_i + \epsilon_i $$ is biased in small samples and can hurt asymptotic precision when treatment effects are heterogeneous.
Lin (2013) Fixes
$$ Y_i = \alpha + \tau_{\text{interact}} D_i + \beta_1 \cdot (X_i - \overline{X}) + \beta_2 \cdot D_i \cdot (X_i - \overline{X}) + \epsilon_i $$
This has the same small-sample bias, but it cannot hurt asymptotic precision even if the model is incorrect, and it will likely increase precision if the covariates are predictive of the outcomes.
For $J$ blocks,
Point estimate $$ \hat { \tau } _ { B } = \sum _ { j = 1 } ^ { J } \frac { N _ { j } } { N } \hat { \tau } _ { j } $$
Variance $$ \operatorname { Var } \left( \hat { \tau } _ { B } \right) = \sum _ { j = 1 } ^ { J } \left( \frac { N _ { j } } { N } \right) ^ { 2 } \operatorname { Var } \left( \hat { \tau } _ { j } \right) $$
Regression Formulation
$$ y_i = \tau D_i + \sum_{j=2}^{J} \beta_j \cdot B_{ij} + \epsilon_i $$
where $R^2_j$ is the fit from regressing $D$ on all $B_j$ dummies.
Since $R^2_j \approx 0$ (because of randomisation),
$$ V[\hat{\tau}_{BR}] < V[\hat{\tau}_{CR}] \iff \frac{SSR_{\widehat{\varepsilon}^*}}{n-k-1} < \frac{SSR_{\widehat{\varepsilon}}}{n-2} $$
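A sketch on simulated data (block structure, sizes, and effect all hypothetical) showing that the weighted block estimator $\hat{\tau}_B$ and the block-dummy regression give the same point estimate when the treated share is equal across blocks:
set.seed(1) # hypothetical blocked experiment
J = 4; Nj = 50
blk = rep(1:J, each = Nj)
D   = unlist(lapply(1:J, function(j) sample(rep(0:1, Nj/2)))) # complete randomisation within block
y   = 1 + 0.5 * blk + 2 * D + rnorm(J * Nj)
bdat = data.frame(y, D, blk)
# weighted block estimator: sum_j (N_j / N) * tau_hat_j
tau_j = sapply(1:J, function(j)
  with(subset(bdat, blk == j), mean(y[D == 1]) - mean(y[D == 0])))
sum((Nj / (J * Nj)) * tau_j)
# identical point estimate from the regression with block dummies
coef(lm(y ~ D + factor(blk), data = bdat))['D']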
$\delta = \mu_1 - \mu_0$ (effect size)
For common variance $\sigma$,
$$ \pi = \Pr(|t| > 1.96) = \Phi\left( -1.96 - \frac{\delta \sqrt{N}}{2\sigma} \right) + \left( 1 - \Phi\left( 1.96 - \frac{\delta \sqrt{N}}{2\sigma} \right) \right) $$
This yields the minimum detectable effect:
$$ \operatorname{MDE}(\delta) = M_{n-2} \sqrt{ \frac{\sigma^2}{N p (1-p)} } $$
where $M_{n-2} = t_{1-\alpha/2} + t_{1-\psi}$, with $\psi$ the type-II error rate (so power $= 1 - \psi$)
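A sketch of the closed-form MDE on the Olken data, reusing sumcalc from above ($\alpha = 0.05$ and power $0.8$ are conventional choices; pooling the two group variances is an assumption):
alpha = 0.05; psi = 0.20 # type-II error rate, so power = 0.8
n  = sum(sumcalc$nobs)
p  = sumcalc$nobs[2] / n # share treated
s2 = weighted.mean(sumcalc$sigma_2, sumcalc$nobs) # pooled variance (assumption)
M  = qt(1 - alpha/2, df = n - 2) + qt(1 - psi, df = n - 2)
(mde = M * sqrt(s2 / (n * p * (1 - p))))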
General case with unequal variances:
$$ \pi = \Phi\left( -1.96 - \frac{\delta}{\sqrt{\frac{\sigma_1^2}{pN} + \frac{\sigma_0^2}{(1-p)N}}} \right) + \left( 1 - \Phi\left( 1.96 - \frac{\delta}{\sqrt{\frac{\sigma_1^2}{pN} + \frac{\sigma_0^2}{(1-p)N}}} \right) \right) $$
olken = na.omit(olken) # drop incomplete rows first so the group shares sum to one
N = nrow(olken)
olken %>% group_by(treat_invite) %>% summarise(
n = n(), sigma_2 = var(pct_missing),
sigma_2_n = sigma_2 / n,
share = n/N
) -> sumstats
# store sigmas and p from original data
sigma_treat = sumstats$sigma_2[2]
sigma_control = sumstats$sigma_2[1] # variance, not variance/n: powerfunc divides by N itself
share_treat = sumstats$share[2]
powerfunc <- function(delta, N,
sigma_t=sigma_treat, sigma_c=sigma_control, p = share_treat){
denom = sqrt(sigma_t/(p * N) + sigma_c/((1-p) * N))
pnorm(-1.96 - delta/denom) + (1 - pnorm(1.96 - delta/denom))
}
mdes = seq(0, 0.3, 0.0005)
pow_250 = powerfunc(mdes, 250)
pow_500 = powerfunc(mdes, 500)
df = data.frame(mdes, pow_250, pow_500)
p = ggplot(df, aes(x = mdes)) +
geom_line(aes(y=pow_250, col = '250')) +
geom_line(aes(y=pow_500, col = '500')) +
labs(
title='Power ~ Minimum Detectable Size',
x = bquote(delta),
y = 'Power',
colour = 'N'
)
p
(mde_250 = unique(abs(df$mdes[which(between(df$pow_250, 0.795, 0.805))])))
(mde_500 = unique(abs(df$mdes[which(between(df$pow_500, 0.795, 0.805))])))
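Instead of reading the MDE off a grid, one can solve power($\delta$) = 0.8 directly with a root-finder; a sketch using uniroot (the bracketing interval is chosen by eye for these data):
mde_solve = function(N, target = 0.8)
  uniroot(function(d) powerfunc(d, N) - target, interval = c(0, 1))$root
c(mde_250 = mde_solve(250), mde_500 = mde_solve(500))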
# fisher exact
"%ni%" <- Negate("%in%") # function to negate selection
Short ref: Imbens (2001)
A test statistic $T$ is a known function $T(\textbf{W}, \textbf{Y}^{obs}, \textbf{X})$ of assignments, observed outcomes, and pretreatment variables.
Comparing the value of the statistic for the realised assignment, say $T^{obs}$, with its distribution over the assignment mechanism leads to an assessment of the probability of observing a value of $T$ as extreme as, or more extreme than, what was observed. This allows calculation of the p-value for the maintained hypothesis of a specific null set of unit treatment values.
Complete randomisation of $2N$ units with $N$ treated. ${2N\choose N}$ assignment vectors. $P$ value can be as small as $1 / {2N \choose N}$.
$\Omega$ is the full set of randomisation realisations, and $\omega$ is an element in the set, with associated probability $1/{2N \choose N}$
'Sharp' null: $\tau_i = 0$, i.e. $Y_{1i} = Y_{0i} \; \forall i$
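For small experiments the randomisation distribution can be enumerated exactly rather than simulated; a toy sketch (hypothetical outcomes, 10 units with 5 treated, so ${10 \choose 5} = 252$ assignments):
y_toy = c(3, 1, 4, 1, 5, 9, 2, 6, 5, 3) # hypothetical outcomes
t_obs = mean(y_toy[1:5]) - mean(y_toy[6:10]) # realised assignment: first 5 treated
assignments = combn(10, 5) # every possible treated set
tau_all = apply(assignments, 2, function(tr) mean(y_toy[tr]) - mean(y_toy[-tr]))
mean(tau_all >= t_obs) # one-sided exact p-value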
Numerical approximation (when ${2N \choose N}$ is too large to enumerate):
numerical_fisher <- function(nrep, df=olken){
N = nrow(df)
ntreat = df %>% filter(treat_invite == 1) %>% nrow()
registerDoParallel(detectCores()) # parallelise across cores
tau_stack = foreach(i = 1:nrep, .combine = c) %dopar% {
treated = base::sample(1:N, ntreat, replace = FALSE) # draw a random treated set
tau_i = (mean(df[treated, ]$pct_missing) -
mean(df[(1:N) %ni% treated, ]$pct_missing))
tau_i
}
return(tau_stack)
}
taus = numerical_fisher(nrep=1e5)
tau_stack = data.frame(id = 1:(length(taus)), tau_i = taus)
tau_stack %>% head()
# calculate SATE
olken %>% select(pct_missing, treat_invite) %>%
group_by(treat_invite) %>% summarise_all(mean, na.rm=T) ->
groupmeans
diffmeans = as.numeric(groupmeans[2,2] - groupmeans[1,2])
p = ggplot(tau_stack, aes(x=tau_i)) +
geom_histogram(aes(y = after_stat(density)), colour="black", fill="white", bins = 50) +
geom_vline(xintercept = diffmeans, colour = 'red') +
labs(
title = 'Treatment effects under Fisher\'s exact Randomisation',
x = bquote('Simulated ' ~ tau[omega])
)
print(p)
# one-sided p-value: share of simulated taus at least as large as the observed SATE
tau_stack %>% filter(tau_i >= diffmeans) %>% nrow() -> numer
(pval = numer/nrow(tau_stack))
# the other tail
tau_stack %>% filter(tau_i <= diffmeans) %>% nrow() -> nr
nr/nrow(tau_stack)
1 - pval
Monte Carlo standard error of the simulated p-value (since this is no longer exact):
sqrt((pval * (1-pval)) / 1e5)
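which gives an approximate 95% Monte Carlo confidence interval for the simulated p-value:
# normal-approximation interval around the simulated p-value
pval + c(-1.96, 1.96) * sqrt(pval * (1 - pval) / 1e5)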