Causal Inference Methods Illustrated Using Repeated and Injudicious Use of the Lalonde Datasets

Author

Apoorva Lal

We observe \(n\) IID draws of \((Y_i, W_i, \mathbf{X}_i) \in \mathbb{R}\times \left\{ 0, 1 \right\} \times \mathbb{R}^d\). We partition the observations into treated \(\mathcal{T} := \left\{ i: W_i = 1 \right\}\) and control \(\mathcal{C} := \left\{ i: W_i = 0 \right\}\) groups, with \(n_t = \vert\mathcal{T}\vert\) and \(n_c = \vert\mathcal{C}\vert\) the numbers of treated and control units respectively. We define the fraction of treated units in the sample as \(\rho = n_t / n\).

Each unit has two potential outcomes, \(Y^1\) and \(Y^0\). Depending on the estimand, we invoke variations on the following assumptions:

  1. Consistency / SUTVA: \(Y_i = W_i Y^1_i + (1-W_i)Y^0_i\).
  2. Unconfoundedness
     1. \((Y^0, Y^1) \perp \!\!\! \perp W \vert \mathbf{X}\): required for ATE
     2. \(Y^0 \perp \!\!\! \perp W \vert \mathbf{X}\): required for ATT
  3. Overlap
     1. \(0 < \mathbf{Pr}\left(W=1 \vert \mathbf{X}\right) < 1\): required for ATE
     2. \(\mathbf{Pr}\left(W = 1 \vert \mathbf{X}\right) < 1\): required for ATT
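
For reference, the two estimands these assumptions license are the average effect over the whole population and the average effect among the treated:

\[
\tau_{\text{ATE}} := \mathbf{E}\left[Y^1 - Y^0\right], \qquad
\tau_{\text{ATT}} := \mathbf{E}\left[Y^1 - Y^0 \vert W = 1\right]
\]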

Experiments

Difference in Means

library(data.table) # for setDT() and the df[, ..cols] selection syntax below
library(LalRUtils)  # contains lalonde.exp and lalonde.psid
library(estimatr)
# %% experiments
data(lalonde.exp); setDT(lalonde.exp)
yn = 're78'; wn = 'treat';
xn = setdiff(colnames(lalonde.exp), c(yn, wn))

y = lalonde.exp[[yn]]
w = lalonde.exp[[wn]]
X = lalonde.exp[, ..xn] |> as.matrix()
n = nrow(lalonde.exp)

# %% linear regression - robust (HC2) SEs
m = lm_robust(y ~ w)
ϵ = y - m$fitted.values # residuals (not used below)
m$coefficients[2]
   w 
1794 
m$std.error[2]
  w 
671 
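
With a single binary regressor, lm_robust's default HC2 standard error reduces exactly to the Neyman variance estimator, so the same numbers can be checked by hand:

# difference in means and Neyman SE by hand
tauhat = mean(y[w == 1]) - mean(y[w == 0])
se = sqrt(var(y[w == 1]) / sum(w) + var(y[w == 0]) / sum(1 - w))
round(c(tauhat = tauhat, se = se))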
library(boot)
# %% bootstrap SE
# lm() on a bare data.frame regresses its first column on the rest
bs = \(data, i) lm(data[i, ])$coefficients
data.frame(y, w) |> boot(statistic = bs, R = 500)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = data.frame(y, w), statistic = bs, R = 500)


Bootstrap Statistics :
    original  bias    std. error
t1*     4555 -10.266       345.5
t2*     1794  -8.486       655.2
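
Resampling rows freely lets the number of treated units vary across replicates; boot's strata argument holds the arm sizes fixed:

# stratified bootstrap: resample within treatment arms
data.frame(y, w) |> boot(statistic = bs, R = 500, strata = w)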

Covariate Adjustment

naive regression

library(glue)
lm_robust(
    as.formula(glue("{yn} ~ {wn} + {glue_collapse(xn, '+')}")),
  lalonde.exp) |> tidy() |> dplyr::filter(term == "treat")
   term estimate std.error statistic p.value conf.low conf.high  df outcome
1 treat     1671     682.3     2.449 0.01474    329.6      3012 433    re78

Lin (2013) regression

lm_lin(as.formula(glue("{yn} ~ {wn}")),
  as.formula(glue("~{glue_collapse(xn, '+')}")),
  lalonde.exp) |> tidy() |> dplyr::filter(term == "treat")
   term estimate std.error statistic p.value conf.low conf.high  df outcome
1 treat     1583     678.1     2.335 0.01999    250.7      2916 423    re78

aipw

library(grf)
causal_forest(X, y, w, tune.parameters = "all")  |>
  average_treatment_effect()
estimate  std.err 
  1624.1    664.3 
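
This is, roughly, what average_treatment_effect() assembles under the hood from the forest's out-of-bag nuisance fits. A minimal sketch (refitting the forest without tuning):

cf = causal_forest(X, y, w)
ehat = cf$W.hat; tauhat = predict(cf)$predictions # OOB propensity and CATE fits
mu0 = cf$Y.hat - ehat * tauhat       # E[Y | X, W = 0]
mu1 = cf$Y.hat + (1 - ehat) * tauhat # E[Y | X, W = 1]
# doubly robust (AIPW) score, averaged
psi = mu1 - mu0 + w * (y - mu1) / ehat - (1 - w) * (y - mu0) / (1 - ehat)
c(estimate = mean(psi), std.err = sd(psi) / sqrt(length(psi)))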

Selection-on-Observables

data(lalonde.psid); setDT(lalonde.psid)
y = lalonde.psid[[yn]]; w = lalonde.psid[[wn]];
X = lalonde.psid[, ..xn] |> as.matrix()

outcome modelling

naive regression

lm_robust(
    as.formula(glue("{yn} ~ {wn} + {glue_collapse(xn, '+')}")),
  lalonde.psid) |> tidy() |> dplyr::filter(term == "treat")
   term estimate std.error statistic p.value conf.low conf.high   df outcome
1 treat    4.159     856.7  0.004855  0.9961    -1676      1684 2663    re78

Lin (2013) regression

reg_adjust = function(yn, wn, xs, df,
    estimand = c("ATT", "ATE")) {
  estimand = match.arg(estimand)
  # for ATE, sweep out overall covariate means; for ATT, sweep out treated means
  # (ifelse() is vectorized and mangles data.frames, so use if/else)
  dd = if (estimand == "ATE") df[, ..xs] else df[get(wn) == 1, ..xs]
  # demean all covariates at the chosen means
  Xbar = apply(dd, 2, mean); Xdemeaned = sweep(df[, ..xs], 2, Xbar)
  colnames(Xdemeaned) = paste0(colnames(Xdemeaned), "_dem")
  # concat columns
  regdf = cbind(y = df[[yn]], w = df[[wn]], df[, ..xs], Xdemeaned)
  # interacted regression
  f = as.formula(paste0(
    "y ~ w +",
    paste(xs, collapse = "+"), "+", # controls effects
    "w", ":(", paste(colnames(Xdemeaned), collapse = "+"), ")" # treat effects
  ))
  m = fixest::feols(f, data = regdf, vcov = 'hc1')
  m
}
reg_adjust(yn = yn, wn = wn, xs = xn, df = lalonde.psid, estimand = "ATE")$coeftable[2, ]
   Estimate  Std. Error     t value    Pr(>|t|) 
-8746.28284  3720.56480    -2.35079     0.01881 

wrong sign.

reg_adjust(yn = yn, wn = wn, xs = xn, df = lalonde.psid, estimand = "ATT")$coeftable[2, ]
  Estimate Std. Error    t value   Pr(>|t|) 
  687.8221   897.2045     0.7666     0.4434 

better.

Weighting and Balancing
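
Weighting estimators exploit the inverse-propensity identities implied by unconfoundedness and overlap,

\[
\mathbf{E}\left[\frac{W Y}{e(\mathbf{X})}\right] = \mathbf{E}\left[Y^1\right], \qquad
\mathbf{E}\left[\frac{(1-W) Y}{1 - e(\mathbf{X})}\right] = \mathbf{E}\left[Y^0\right],
\]

where \(e(\mathbf{X}) := \mathbf{Pr}\left(W = 1 \vert \mathbf{X}\right)\) is the propensity score, estimated below by logistic regression.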

ehat = glm(w ~ X, family = binomial)$fitted # logistic propensity score

# ATE
ipwATE = \(y, w, ehat){
  mean(
    (y * w)/ehat -
    (y * (1-w))/(1-ehat)
  )
}

ipwATE(y, w, ehat)
[1] -10454

awful.
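
Part of the problem is extreme weights when \(\hat{e}(\mathbf{X})\) is close to 0 or 1. A self-normalized (Hájek) version is usually more stable, though it cannot rescue a failure of overlap; a sketch:

# Hajek / self-normalized IPW for the ATE
hajekATE = \(y, w, ehat) {
  weighted.mean(y, w / ehat) - weighted.mean(y, (1 - w) / (1 - ehat))
}
hajekATE(y, w, ehat)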

# ATT
ipwATT = \(y, w, ehat){
  mean(
    (y * w)/mean(w) - # treated obs - no reweighting
    (y * (1-w) * ehat)/ ( (1-ehat) * mean(w) )
  )
}

ipwATT(y, w, ehat)
[1] 1103

better.

We will stick to the ATT from here on; ATE estimation outside experiments is a pipe dream.
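
The binding constraint is overlap: the PSID controls look nothing like the NSW treated, so the estimated propensity scores for the two arms barely overlap. A quick diagnostic:

# propensity score distribution by treatment arm
tapply(ehat, w, summary)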

Matching: 1:1

library(Matching)
Match(y, w, X) |> summary()

Estimate...  2037.2 
AI SE......  1731.1 
T-stat.....  1.1768 
p.val......  0.23928 

Original number of observations..............  2675 
Original number of treated obs...............  185 
Matched number of observations...............  185 
Matched number of observations  (unweighted).  218 

Matching: M:1

Match(y, w, X, M = 5) |> summary()

Estimate...  1976.3 
AI SE......  1444.9 
T-stat.....  1.3679 
p.val......  0.17136 

Original number of observations..............  2675 
Original number of treated obs...............  185 
Matched number of observations...............  185 
Matched number of observations  (unweighted).  938 

Bias-corrected Matching: Linear outcome model
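
With a single match per treated unit, the Abadie-Imbens correction adjusts each matched difference by the outcome model's prediction gap between unit \(i\) and its match \(j(i)\):

\[
\hat{\tau}_{\text{ATT}} = \frac{1}{n_t} \sum_{i \in \mathcal{T}} \left[ \left( Y_i - Y_{j(i)} \right) - \left( \hat{\mu}_0(\mathbf{X}_i) - \hat{\mu}_0(\mathbf{X}_{j(i)}) \right) \right],
\]

where \(\hat{\mu}_0\) is here a linear regression fit on the controls.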

Match(y, w, X, BiasAdjust = TRUE) |> summary()

Estimate...  2374.7 
AI SE......  1714.1 
T-stat.....  1.3854 
p.val......  0.16594 

Original number of observations..............  2675 
Original number of treated obs...............  185 
Matched number of observations...............  185 
Matched number of observations  (unweighted).  218 

Matching: Pscore Matching

Match(y, w, X = ehat, BiasAdjust = TRUE) |> summary()

Estimate...  762.06 
AI SE......  1756.3 
T-stat.....  0.43391 
p.val......  0.66435 

Original number of observations..............  2675 
Original number of treated obs...............  185 
Matched number of observations...............  185 
Matched number of observations  (unweighted).  2027 

Covariate Balancing Propensity Score (CBPS)

library(CBPS)
# CBPS
cbps = CBPS(w ~ X)
[1] "Finding ATT with T=1 as the treatment.  Set ATT=2 to find ATT with T=0 as the treatment"
lm_robust(y ~ w, weights = cbps$weights)
            Estimate Std. Error t value      Pr(>|t|) CI Lower CI Upper   DF
(Intercept)     3911      684.7   5.712 0.00000001236   2568.8     5254 2673
w               2438      896.3   2.720 0.00657755914    680.1     4195 2673
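
A quick check that the fitted weights actually balance covariate means across arms (plain weighted means, nothing CBPS-specific):

# weighted covariate means by arm should now line up
apply(X, 2, \(x) c(
  treated = weighted.mean(x[w == 1], cbps$weights[w == 1]),
  control = weighted.mean(x[w == 0], cbps$weights[w == 0])))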

Entropy Balancing

library(ebal)
wts = ebalance(w, X)$w

# ebalance returns weights for the control units only; treated units keep a
# uniform weight. NB: this relies on treated units being stacked first in lalonde.psid
lm_robust(y ~ w,
  weights = c(
        rep(1/length(w), sum(w)), # treated: uniform
        wts)                      # controls: entropy-balancing weights
  )
            Estimate Std. Error t value      Pr(>|t|) CI Lower CI Upper   DF
(Intercept)     3924      682.7   5.748 0.00000001003   2585.8     5263 2673
w               2425      894.8   2.710 0.00677680811    670.1     4179 2673

Hybrid: AIPW
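
Both estimators in this subsection target the doubly robust ATT moment, which combines the outcome model \(\hat{\mu}_0\) and the propensity score \(\hat{e}\),

\[
\hat{\tau}_{\text{ATT}} = \frac{1}{n_t} \sum_{i=1}^n \left[ W_i - (1 - W_i) \frac{\hat{e}(\mathbf{X}_i)}{1 - \hat{e}(\mathbf{X}_i)} \right] \left( Y_i - \hat{\mu}_0(\mathbf{X}_i) \right),
\]

and is consistent if either nuisance function is estimated consistently.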

library(npcausal)
r = att(y, w, X)
r$res
      parameter  est     se  ci.ll ci.ul  pval
1      E(Y|A=1) 6349 1158.1 4079.3  8619 0.000
2   E{Y(0)|A=1} 4773  348.1 4090.5  5455 0.000
3 E{Y-Y(0)|A=1} 1576  647.2  307.9  2845 0.015
mod = causal_forest(X, y, w, tune.parameters = "all")
average_treatment_effect(mod, target.sample = 'treated')
estimate  std.err 
    1490      784 

DoubleML

library(DoubleML); library(mlr3); library(mlr3learners)
# DML
lalondeDML = DoubleMLData$new(lalonde.psid,
                  y_col = "re78",
                  d_cols = "treat",
                  x_cols = setdiff(colnames(lalonde.psid), c('treat', 're78'))
      )
# %%
learner_g = lrn("regr.ranger", num.trees = 500, min.node.size = 2, max.depth = 5)
learner_m = lrn("classif.ranger", num.trees = 500, min.node.size = 2, max.depth = 5)

dml_att = DoubleMLIRM$new(lalondeDML,
                        ml_m = learner_m,
                        ml_g = learner_g,
                        score = "ATTE", # target the ATT
                        n_folds = 2,
                        n_rep = 1)
dml_att$fit()
INFO  [15:37:58.114] [mlr3] Applying learner 'classif.ranger' on task 'nuis_m' (iter 1/2)
INFO  [15:37:58.250] [mlr3] Applying learner 'classif.ranger' on task 'nuis_m' (iter 2/2)
INFO  [15:37:58.494] [mlr3] Applying learner 'regr.ranger' on task 'nuis_g0' (iter 1/2)
INFO  [15:37:58.606] [mlr3] Applying learner 'regr.ranger' on task 'nuis_g0' (iter 2/2)
dml_att$summary()
Estimates and significance testing of the effect of target variables
      Estimate. Std. Error t value Pr(>|t|)  
treat      1918        834     2.3    0.021 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
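
To demystify the machinery, here is a minimal two-fold cross-fitting sketch of the same ATT moment using ranger directly. The fold scheme, seed, and hyperparameters are arbitrary, there is no trimming of extreme scores, and the plug-in SE ignores the estimation error in \(\mathbf{Pr}\left(W = 1\right)\):

library(ranger)
set.seed(42)
n = length(y)
folds = sample(rep(1:2, length.out = n))
mu0 = ps = numeric(n)
for (k in 1:2) {
  tr = folds != k; te = folds == k
  # outcome model mu0: fit on control units in the training fold
  m0 = ranger(y = y[tr & w == 0], x = X[tr & w == 0, ])
  mu0[te] = predict(m0, data = X[te, ])$predictions
  # propensity model: probability forest on the training fold
  pm = ranger(y = factor(w[tr]), x = X[tr, ], probability = TRUE)
  ps[te] = predict(pm, data = X[te, ])$predictions[, "1"]
}
# doubly robust ATT score, averaged over the sample
psi = (w - (1 - w) * ps / (1 - ps)) * (y - mu0) / mean(w)
c(estimate = mean(psi), std.err = sd(psi) / sqrt(n))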