```python
import numpy as np
import pandas as pd
from econml.dr import LinearDRLearner
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
import empirical_calibration as ec
import scipy.special  # import the submodule explicitly for scipy.special.expit
from typing import List
```
Jeff Wooldridge recently posted about different estimators for treatment effects under unconfoundedness, and in particular emphasized a somewhat overlooked method, Regression Adjustment with inverse propensity weights (IPWRA), which can be viewed as a (simple) instantiation of the Targeted Maximum Likelihood Estimation (TMLE) approach.
Exposition from Imbens (2004)
Here, we implement and benchmark some of these estimators and study their behaviour under different forms of mis-specification, concluding that IPWRA generally works pretty well. In terms of bias and variance it is effectively quite similar to entropy balancing weights, which are also doubly robust in that they implicitly fit a linear outcome model and a logistic propensity score.
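As a sketch of what the code below implements (my notation, not Wooldridge's exact formulation): fit a logistic propensity score and form the ATE weights

$$
\hat{e}(x) = \Lambda(x'\hat{\gamma}), \qquad \hat{\lambda}_i = \frac{W_i}{\hat{e}(X_i)} + \frac{1-W_i}{1-\hat{e}(X_i)},
$$

with $\Lambda(\cdot)$ the logistic CDF, then run a $\hat{\lambda}$-weighted least squares regression

$$
(\hat{\alpha}, \hat{\tau}, \hat{\beta}, \hat{\delta}) = \arg\min \sum_{i} \hat{\lambda}_i \left( Y_i - \alpha - \tau W_i - \tilde{X}_i'\beta - W_i \tilde{X}_i'\delta \right)^2 ,
$$

where $\tilde{X}$ is the centered covariate matrix, and report $\hat{\tau}$, the coefficient on $W$. Either a correct linear outcome model or a correct logistic propensity score suffices for consistency, which is the double-robustness property.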
utils
DGP to generate a dataframe from user-supplied propensity score, outcome, and heterogeneous treatment effect functions.
```python
# %% DGP
def binary_dgp(n, k, pscore_fn, tau_fn, outcome_fn, k_cat=1):
    # correlated numeric covariates
    Sigma = np.random.uniform(-1, 1, (k, k))
    Sigma = Sigma @ Sigma.T
    Xnum = np.random.multivariate_normal(np.zeros(k), Sigma, n)
    # generate categorical variables
    Xcat = np.random.binomial(1, 0.5, (n, k_cat))
    X = np.c_[Xnum, Xcat]
    W = np.random.binomial(1, pscore_fn(X), n).astype(int)
    Y = outcome_fn(X, W, tau_fn)
    df = pd.DataFrame(
        np.c_[Y, W, X],
        columns=["Y", "W"] + [f"X{i}" for i in range(k + k_cat)],  # k + k_cat covariates
    )
    return df
```
```python
def sim_summariser(dat, truth=1):
    # bias, variance, and RMSE of each estimator column against the true effect
    return {
        "bias": np.mean(dat, axis=0) - truth,
        "var": np.var(dat, axis=0),
        "rmse": np.sqrt(np.mean((dat - truth) ** 2, axis=0)),
    }
```
Regression Adjustment + IPW
Omnibus function to compute Regression Adjustment (RA), IPW, and IPWRA. This is now available in aipyw.
```python
from sklearn.preprocessing import StandardScaler


def ipwra(
    y: np.ndarray,
    w: np.ndarray,
    X: np.ndarray,
    estimand: str = "ATE",
    typ: str = "both",
):
    # standardize X
    sca = StandardScaler()
    Xtilde = sca.fit_transform(X)
    # weight specification
    if typ in ["both", "ipw"]:
        # make inverse regularisation strength big for (effectively) no regularisation
        lr = LogisticRegression(C=1e12)
        lr.fit(X, w)
        p = lr.predict_proba(X)  # returns (1 - e(x), e(x))
        if estimand == "ATE":
            wt = w / p[:, 1] + (1 - w) / p[:, 0]
        elif estimand == "ATT":
            rho = np.mean(w)
            wt = p[:, 1] / rho + (1 - w) / p[:, 0]
    # covariate matrix specification
    if typ in ["both", "ra"]:
        # design matrix has [1, W, X, W * X] where X is centered
        XX = np.c_[np.ones(len(w)), w, Xtilde, w[:, np.newaxis] * Xtilde]
    else:
        XX = np.c_[np.ones(len(w)), w]  # pure IPW
    if typ == "ra":
        wt = None  # unweighted regression adjustment
    return LinearRegression().fit(XX, y, sample_weight=wt).coef_[1]  # w coef
```
edit: an earlier version manually applied `np.sqrt` to the weights, which `LinearRegression` does internally. This was incorrect and has been fixed.
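As a quick sanity check (mine, not part of the original post), sklearn's `sample_weight` solves the weighted least-squares problem, which is equivalent to scaling rows by the square root of the weights:

```python
# check that LinearRegression with sample_weight matches OLS on sqrt(wt)-scaled data
rng = np.random.default_rng(0)
Xd = np.c_[np.ones(100), rng.normal(size=(100, 2))]
yd = Xd @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=100)
wt = rng.uniform(0.5, 2.0, 100)
b1 = LinearRegression(fit_intercept=False).fit(Xd, yd, sample_weight=wt).coef_
b2 = np.linalg.lstsq(Xd * np.sqrt(wt)[:, None], yd * np.sqrt(wt), rcond=None)[0]
assert np.allclose(b1, b2)  # identical WLS solutions
```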
User-facing wrapper that accepts a dataframe and computes bootstrap confidence intervals by default.
```python
def bootstrap_ipwra(
    df: pd.DataFrame,
    outcome: str = "Y",
    treatment: str = "W",
    covariates: List = [],
    typ: str = "both",
    alpha: float = 0.05,
    n_iterations: int = 100,
    n_jobs: int = -1,
):
    if len(covariates) == 0:  # use all remaining columns as covariates
        covariates = df.columns.difference([outcome, treatment]).tolist()
    point_estimate = ipwra(
        df[outcome].values, df[treatment].values, df[covariates].values, typ=typ
    )
    if n_iterations == 0:
        return {"est": point_estimate.round(4)}

    def onerep():
        bsamp = df.sample(n=len(df), replace=True)
        return ipwra(
            bsamp[outcome].values, bsamp[treatment].values, bsamp[covariates].values, typ=typ
        )

    # parallel bootstrap
    replicates = Parallel(n_jobs=n_jobs)(delayed(onerep)() for _ in range(n_iterations))
    replicates = np.array(replicates)
    lb, ub = np.percentile(replicates, [alpha / 2 * 100, (1 - alpha / 2) * 100])
    return {
        "est": point_estimate.round(4),
        f"{(1 - alpha) * 100}% CI (Bootstrap)": np.r_[lb, ub].round(4),
    }
```
Example of use:
```python
def outcome_fn(x, w, taufn):
    base_term = x[:, 0] + 2 * x[:, 1] + x[:, 3]
    return taufn(x) * w + base_term + np.random.normal(0, 1, len(w))


def pscore_fn(x):
    lin = x[:, 0] - x[:, 1] - x[:, 2] + x[:, 3] + np.random.normal(0, 1, x.shape[0])
    return scipy.special.expit(lin)


df = binary_dgp(
    n=10_000, k=4, tau_fn=lambda x: 1, pscore_fn=pscore_fn, outcome_fn=outcome_fn
)
df.head()
```
|   | Y | W | X0 | X1 | X2 | X3 | X4 |
|---|---|---|---|---|---|---|---|
| 0 | 0.853669 | 1.0 | 0.540357 | -0.246628 | -1.208470 | -2.053303 | 1.0 |
| 1 | -2.828752 | 0.0 | 0.055509 | -0.042991 | -0.779749 | -1.748115 | 1.0 |
| 2 | -2.055299 | 1.0 | 0.288326 | -1.199592 | -0.606765 | -1.226635 | 1.0 |
| 3 | 3.797470 | 0.0 | -0.708268 | 1.232548 | 1.141870 | 1.584794 | 0.0 |
| 4 | 1.116264 | 1.0 | 0.317727 | -0.945817 | -0.163137 | 1.769615 | 1.0 |
```python
# default args conform to simulated data; else supply variable names
bootstrap_ipwra(df)
```
```
{'est': 0.9953, '95.0% CI (Bootstrap)': array([0.9357, 1.0547])}
```
Simulations
Our simulation machine accepts DGP parameters and returns a vector of estimates. The RA, IPW, and IPWRA estimators use linear outcome and logistic propensity specifications, so they are correctly specified only when the DGP is linear/logistic; any non-linearity makes them misspecified. The final AIPW estimator uses gradient-boosted regression and classification for its nuisance functions, so it should (at slow rates) learn arbitrary functions and is never misspecified.
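For reference, AIPW-type estimators (including `LinearDRLearner`, up to its final linear stage on the doubly robust pseudo-outcome) build on the standard augmented score

$$
\hat{\tau}_{\text{AIPW}} = \frac{1}{n}\sum_{i=1}^{n}\left[\hat{\mu}_1(X_i) - \hat{\mu}_0(X_i) + \frac{W_i\,(Y_i - \hat{\mu}_1(X_i))}{\hat{e}(X_i)} - \frac{(1-W_i)\,(Y_i - \hat{\mu}_0(X_i))}{1-\hat{e}(X_i)}\right],
$$

which is consistent if either the outcome models $\hat{\mu}_w$ or the propensity model $\hat{e}$ is consistent; here both nuisances are boosted ensembles.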
```python
def onesim(**kwargs):
    df = binary_dgp(**kwargs)
    outcome_column, treatment_column = "Y", "W"
    feature_columns = df.columns.difference([outcome_column, treatment_column]).tolist()
    # naive difference in means
    naive = df.groupby("W")["Y"].mean().diff()[1]  # treat - ctrl
    # ra, ipw, and ipwra
    ra, ipw, ipwra = (
        bootstrap_ipwra(df, n_iterations=0, typ="ra"),
        bootstrap_ipwra(df, n_iterations=0, typ="ipw"),
        bootstrap_ipwra(df, n_iterations=0, typ="both"),
    )
    # aipw
    m = LinearDRLearner(
        model_regression=LGBMRegressor(n_estimators=50, verbose=-1),
        model_propensity=LGBMClassifier(n_estimators=50, verbose=-1),
    )
    m.fit(
        Y=df["Y"],
        T=df["W"],
        W=df[feature_columns],
    )
    dml_est = m.ate_inference().mean_point
    # balancing weights: reweight controls to match treated covariate means
    qweights, _ = ec.maybe_exact_calibrate(
        covariates=df.query("W==0")[feature_columns].values,
        target_covariates=df.query("W==1")[feature_columns].values,
        objective=ec.Objective.QUADRATIC,
    )
    eweights, _ = ec.maybe_exact_calibrate(
        covariates=df.query("W==0")[feature_columns].values,
        target_covariates=df.query("W==1")[feature_columns].values,
        objective=ec.Objective.ENTROPY,
    )
    qb_est = df.query("W==1")["Y"].mean() - np.average(
        df.query("W==0")["Y"], weights=qweights
    )
    eb_est = df.query("W==1")["Y"].mean() - np.average(
        df.query("W==0")["Y"], weights=eweights
    )
    return np.r_[naive, ra["est"], ipw["est"], ipwra["est"], dml_est, qb_est, eb_est]
```
```python
tau_fn = lambda x: 1
n, k, nsim = 10_000, 3, 100
estimators = ["naive", "ra", "ipw", "ipwra", "aipw", "qb", "eb"]
```
both correctly specified
Linear outcome and logistic pscore.
```python
def outcome_fn(x, w, taufn):
    base_term = x[:, 0] + 2 * x[:, 1] + x[:, 3]
    return taufn(x) * w + base_term + np.random.normal(0, 1, len(w))


def pscore_fn(x):
    lin = x[:, 0] - x[:, 1] - x[:, 2] + x[:, 3] + np.random.normal(0, 1, x.shape[0])
    return scipy.special.expit(lin)


onesim(n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn)
```
```
array([-0.37865908,  1.0071    ,  0.9765    ,  0.9991    ,  1.02255154,
        0.9882471 ,  0.98856997])
```
```python
linear_sims = np.array(
    Parallel(n_jobs=-1)(
        delayed(onesim)(
            n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn
        )
        for _ in range(nsim)
    )
)
linear_summary = sim_summariser(linear_sims)
pd.DataFrame.from_dict(linear_summary).set_axis(estimators)
```
|  | bias | var | rmse |
|---|---|---|---|
| naive | -0.333937 | 1.189322 | 1.140542 |
| ra | -0.000373 | 0.000628 | 0.025067 |
| ipw | -0.006068 | 0.005668 | 0.075530 |
| ipwra | -0.003223 | 0.000829 | 0.028967 |
| aipw | 0.011722 | 0.001381 | 0.038965 |
| qb | -0.003911 | 0.000931 | 0.030758 |
| eb | -0.005386 | 0.001297 | 0.036418 |
Good weights don't add much variance to IPWRA (over RA). Plain IPW is quite noisy even though the propensity model is correctly specified.
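One way to see why plain IPW is noisy even under correct specification is to inspect the fitted pscores on a single draw: values near 0 or 1 produce very large inverse weights that inflate the variance. A quick diagnostic (mine, not in the original simulations; `df_diag`, `feats`, and `phat` are illustrative names):

```python
# inspect the tails of the fitted pscores on one simulated dataset
df_diag = binary_dgp(n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn)
feats = df_diag.columns.difference(["Y", "W"]).tolist()
phat = (
    LogisticRegression(C=1e12)
    .fit(df_diag[feats], df_diag["W"])
    .predict_proba(df_diag[feats])[:, 1]
)
print(np.quantile(phat, [0.01, 0.5, 0.99]))  # tail pscores imply extreme IPW weights
```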
correct outcome, incorrect propensity
Introduce non-linearity into the pscore.
```python
def outcome_fn(x, w, taufn):
    base_term = x[:, 0] + 2 * x[:, 1] + x[:, 3]
    return taufn(x) * w + base_term + np.random.normal(0, 1, len(w))


def pscore_fn(x):
    lin = (
        -x[:, 0]
        - x[:, 1]
        - x[:, 2]
        + x[:, 3]
        + np.exp(x[:, 3])
        + x[:, 2] ** 2
        + 0.1 * x[:, 1] ** 3
        + x[:, 0] * x[:, 1]
        + np.random.normal(0, 1, x.shape[0])
    )
    return scipy.special.expit(lin)
```
```python
nlinear_sims1 = np.array(
    Parallel(n_jobs=12)(
        delayed(onesim)(
            n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn
        )
        for _ in range(nsim)
    )
)
nlinear_summary1 = sim_summariser(nlinear_sims1)
pd.DataFrame.from_dict(nlinear_summary1).set_axis(estimators)
```
|  | bias | var | rmse |
|---|---|---|---|
| naive | -0.391805 | 0.367960 | 0.722130 |
| ra | 0.004664 | 0.001873 | 0.043533 |
| ipw | -0.375230 | 0.146017 | 0.535551 |
| ipwra | 0.000944 | 0.002958 | 0.054398 |
| aipw | 0.059299 | 0.035076 | 0.196450 |
| qb | 0.002870 | 0.003020 | 0.055026 |
| eb | -0.001717 | 0.004221 | 0.064991 |
A bad pscore does not hurt IPWRA in terms of bias, but it increases variance a bit.
incorrect outcome, correct propensity
Introduce non-linearity into the outcome model.
```python
def outcome_fn(x, w, taufn):
    base_term = (
        x[:, 0]
        + 2 * x[:, 1]
        + x[:, 3]
        + np.exp(x[:, 0])
        + 2 * (x[:, 1] > 0.5)
        + x[:, 1] ** 2
        + np.sqrt(np.abs(x[:, 2]))
        + x[:, 0] * x[:, 2]
    )
    return taufn(x) * w + base_term + np.random.normal(0, 1, len(w))


def pscore_fn(x):
    lin = (
        -x[:, 0]
        - x[:, 1]
        - x[:, 2]
        + x[:, 3]
        + np.random.normal(0, 1, x.shape[0])
    )
    return scipy.special.expit(lin)
```
```python
nlinear_sims2 = np.array(
    Parallel(n_jobs=12)(
        delayed(onesim)(
            n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn
        )
        for _ in range(nsim)
    )
)
nlinear_summary2 = sim_summariser(nlinear_sims2)
pd.DataFrame.from_dict(nlinear_summary2).set_axis(estimators)
```
|  | bias | var | rmse |
|---|---|---|---|
| naive | -2.913566 | 4.837368 | 3.650512 |
| ra | 0.328193 | 0.246748 | 0.595365 |
| ipw | -0.065889 | 0.107900 | 0.335024 |
| ipwra | 0.050050 | 0.036513 | 0.197530 |
| aipw | -0.033077 | 0.073476 | 0.273075 |
| qb | 0.494620 | 0.102425 | 0.589129 |
| eb | 0.092074 | 0.030125 | 0.196477 |
RA has some bias. IPWRA has lower bias, which suggests that the weights help a bit.
both incorrect
Both the outcome and the propensity score are non-linear.
```python
def outcome_fn(x, w, taufn):
    base_term = (
        x[:, 0]
        + 2 * x[:, 1]
        + x[:, 3]
        + np.exp(x[:, 0])
        + 2 * (x[:, 1] > 0.5)
        + x[:, 1] ** 2
        + np.sqrt(np.abs(x[:, 2]))
        + x[:, 0] * x[:, 2]
    )
    return taufn(x) * w + base_term + np.random.normal(0, 1, len(w))


def pscore_fn(x):
    lin = (
        x[:, 0]
        - x[:, 1]
        - x[:, 2]
        + x[:, 3]
        + np.exp(x[:, 3])
        + x[:, 2] ** 2
        + 0.1 * x[:, 1] ** 3
        + x[:, 0] * x[:, 1]
        + np.random.normal(0, 1, x.shape[0])
    )
    return scipy.special.expit(lin)
```
```python
nlinear_sims3 = np.array(
    Parallel(n_jobs=12)(
        delayed(onesim)(
            n=n, k=k, pscore_fn=pscore_fn, tau_fn=tau_fn, outcome_fn=outcome_fn
        )
        for _ in range(nsim)
    )
)
nlinear_summary3 = sim_summariser(nlinear_sims3)
pd.DataFrame.from_dict(nlinear_summary3).set_axis(estimators)
```
|  | bias | var | rmse |
|---|---|---|---|
| naive | 0.769820 | 2.343603 | 1.713542 |
| ra | 0.058784 | 3.701395 | 1.924799 |
| ipw | -4.453326 | 128.446402 | 12.176967 |
| ipwra | 0.441008 | 7.651208 | 2.801017 |
| aipw | 0.059865 | 0.040356 | 0.209618 |
| qb | -0.392092 | 5.627199 | 2.404357 |
| eb | -1.021150 | 10.837197 | 3.446729 |
All bets are off; only AIPW with boosted nuisance models works well.