\(\DeclareMathOperator*{\argmin}{argmin}\) \(\newcommand{\norm}[1]{\left\Vert{#1} \right\Vert}\) \(\newcommand{\var}{\mathrm{Var}}\) \(\newcommand{\tra}{^{\top}}\) \(\newcommand{\sumin}{\sum_{i=1}^n}\) \(\newcommand{\sumiN}{\sum_{i=1}^n}\) \(\newcommand{\epsi}{\varepsilon}\) \(\newcommand{\phii}{\varphi}\) \(\newcommand\Bigpar[1]{\left( #1 \right )}\) \(\newcommand\Bigbr[1]{\left[ #1 \right ]}\) \(\newcommand\Bigcr[1]{\left\{ #1 \right \}}\) \(\newcommand\SetB[1]{\left\{ #1 \right\}}\) \(\newcommand\Sett[1]{\mathcal{#1}}\) \(\newcommand{\Data}{\mathcal{D}}\) \(\newcommand{\Ubr}[2]{\underbrace{#1}_{\text{#2}}}\) \(\newcommand{\Obr}[2]{ \overbrace{#1}^{\text{#2}}}\) \(\newcommand{\sumiN}{\sum_{i=1}^N}\) \(\newcommand{\dydx}[2]{\frac{\partial #1}{\partial #2}}\) \(\newcommand\Indic[1]{\mathbb{1}_{#1}}\) \(\newcommand{\Realm}[1]{\mathbb{R}^{#1}}\) \(\newcommand{\Exp}[1]{\mathbb{E}\left[#1\right]}\) \(\newcommand{\Expt}[2]{\mathbb{E}_{#1}\left[#2\right]}\) \(\newcommand{\Var}[1]{\mathbb{V}\left[#1\right]}\) \(\newcommand{\Covar}[1]{\text{Cov}\left[#1\right]}\) \(\newcommand{\Prob}[1]{\mathbf{Pr}\left(#1\right)}\) \(\newcommand{\Supp}[1]{\text{Supp}\left[#1\right]}\) \(\newcommand{\doyx}{\Prob{Y \, |\,\mathsf{do} (X = x)}}\) \(\newcommand{\doo}[1]{\Prob{Y |\,\mathsf{do} (#1) }}\) \(\newcommand{\R}{\mathbb{R}}\) \(\newcommand{\Z}{\mathbb{Z}}\) \(\newcommand{\wh}[1]{\widehat{#1}} % Wide hat\) \(\newcommand{\wt}[1]{\widetilde{#1}} % Wide tilde\) \(\newcommand{\wb}[1]{\overline{#1}} % Wide bar\) \(\newcommand\Ol[1]{\overline{#1}}\) \(\newcommand\Ul[1]{\underline{#1}}\) \(\newcommand\Str[1]{#1^{*}}\) \(\newcommand{\F}{\mathsf{F}}\) \(\newcommand{\ff}{\mathsf{f}}\) \(\newcommand{\Cdf}[1]{\mathbb{F}\left(#1\right)}\) \(\newcommand{\Cdff}[2]{\mathbb{F}_{#1}\left(#2\right)}\) \(\newcommand{\Pdf}[1]{\mathsf{f}\left(#1\right)}\) \(\newcommand{\Pdff}[2]{\mathsf{f}_{#1}\left(#2\right)}\) \(\newcommand{\dd}{\mathsf{d}}\) \(\newcommand\Normal[1]{\mathcal{N} \left( #1 \right )}\) 
\(\newcommand\Unif[1]{\mathsf{U} \left[ #1 \right ]}\) \(\newcommand\Bern[1]{\mathsf{Bernoulli} \left( #1 \right )}\) \(\newcommand\Binom[1]{\mathsf{Bin} \left( #1 \right )}\) \(\newcommand\Pois[1]{\mathsf{Poi} \left( #1 \right )}\) \(\newcommand\BetaD[1]{\mathsf{Beta} \left( #1 \right )}\) \(\newcommand\Diri[1]{\mathsf{Dir} \left( #1 \right )}\) \(\newcommand\Gdist[1]{\mathsf{Gamma} \left( #1 \right )}\) \(\def\mbf#1{\mathbf{#1}}\) \(\def\mrm#1{\mathrm{#1}}\) \(\def\mbi#1{\boldsymbol{#1}}\) \(\def\ve#1{\mbi{#1}} % Vector notation\) \(\def\vee#1{\mathbf{#1}} % Vector notation\) \(\newcommand{\Mat}[1]{\mathbf{#1}}\) \(\newcommand{\eucN}[1]{\norm{#1}}\) \(\newcommand{\lzero}[1]{\norm{#1}_0}\) \(\newcommand{\lone}[1]{\norm{#1}_1}\) \(\newcommand{\ltwo}[1]{\norm{#1}_2}\) \(\newcommand{\pnorm}[1]{\norm{#1}_p}\)
Data prep - Abadie et al (2015) German Reunification example
# %% data prep
# Load the Abadie et al (2015) panel: country-year data with GDP, in object `x`
load('germany.RData')
x$country = factor(x$country)
pan = data.table(x)
# treatment indicator: West Germany in post-reunification years (1990 onwards)
pan[, treat := ifelse(country == "West Germany" & year >= 1990, 1, 0)]
treat_name = 'West Germany'
# nunique: distinct-count helper (cf. data.table::uniqueN) — TODO confirm source
T0 = pan[country == "West Germany" & treat != 1, nunique(year)] # pre-treatment periods
T1 = pan[country == "West Germany" & treat == 1, nunique(year)] # number of post-treatment periods
# reshape to wide: one row per year, one column per country
wide = pan[, .(country, year, gdp)] |> dcast(year ~ country, value.var = 'gdp')
setcolorder(wide, c('year', treat_name)) # year first, treated unit second
y_treat_pre = wide[1:T0, 2] |> as.matrix()       # T0 x 1 treated outcomes
y_ctrl_pre  = wide[1:T0, -(1:2)] |> as.matrix()  # T0 x n_0 donor outcomes
A1 (no intercept): \(\mu = 0\). Stronger than parallel trends in DiD.
A2 (adding up): \(\sumin \omega_i = 1\). Common to DiD, SC.
A3 (non-negativity): \(\omega_i \geq 0 \; \forall \; i\). Ensures uniqueness via 'coarse' regularisation + precision control. Negative weights may improve out-of-sample prediction.
A4 (constant weights): \(\omega_i = \Ol{\omega} \; \forall \; i\)
Assumes A(1-3) - implemented straightforwardly in the program below.
#' Solve for synthetic control weights in CVXR
#'
#' Minimises the squared pre-treatment fit subject to the classic
#' Abadie-type constraints: non-negative weights that sum to one, no intercept.
#' @param y_t t_0 X 1 matrix of pre-treatment outcomes for treatment unit
#' @param y_c t_0 X n_0 matrix of pre-treatment outcomes for donor units
#' @return n_0-vector of donor weights
#' @import CVXR
#' @export
sc_solve = function(y_t, y_c){
  ω = Variable(ncol(y_c))
  objective = Minimize(sum_squares(y_t - y_c %*% ω))
  constraints = list( # no intercept
    ω >= 0,      # positive weights
    sum(ω) == 1  # weights sum to 1
  )
  problem = Problem(objective, constraints)
  result = solve(problem, solver = 'MOSEK')
  ω_hat = result$getValue(ω)
  return(ω_hat)
}
# %%
ω_sc = sc_solve(y_treat_pre, y_ctrl_pre)
wt_table = data.frame(donor = colnames(y_ctrl_pre), wt = ω_sc)
# %% compute and plot
wide2 = copy(wide)
# impute Y(0) post for treated unit using weights
wide2$y0_sc = as.matrix(wide2[, -(1:2)]) %*% ω_sc
wide2$treat_effect = wide2[[treat_name]] - wide2$y0_sc
# %%
# observed (black) vs synthetic-control counterfactual (blue) series
sc_fit = ggplot(wide2, aes(year)) +
  geom_line(aes(y = `West Germany`), size = 2) +
  geom_line(aes(y = y0_sc), size = 2, alpha = 0.7, color = 'cornflowerblue') +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1990, linetype = "dashed", color = 'red') +
  labs(title = "Outcome SC Series", y = "")
# %%
# treatment-effect series: observed minus imputed Y(0)
sc_est = ggplot(wide2, aes(year, treat_effect)) + geom_line() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1990, linetype = "dashed", color = 'red') +
  ylim(c(-9000, 6000)) +
  labs(title = "SC Estimates", y = "")
(sc_fit | sc_est)
Relaxes all 4 assumptions. Use elastic-net to pick out small and sparse weights.
Desiderata — Balance: the difference between pre-treatment outcomes for the treated unit and the chosen linear combination of pre-treatment outcomes for the controls should be small
\(\ltwo{\Mat{Y}_{\text{t, pre}} - \mu - \omega \tra \Mat{Y}_{\text{c, pre}} }^2 = (\Mat{Y}_{\text{t, pre}} - \mu - \omega \tra \Mat{Y}_{\text{c, pre}}) \tra (\Mat{Y}_{\text{t, pre}} - \mu - \omega \tra \Mat{Y}_{\text{c, pre}})\)
\[\begin{align*} (\wh{\mu}^{en} (\lambda, \alpha) , \wh{\omega}^{en}(\lambda, \alpha) ) & = \argmin_{\mu, \omega} \; Q(\mu, \omega | \Mat{Y}_{\text{t, pre}}, \Mat{Y}_{\text{c, pre}}; \lambda, \alpha) \\ \text{where }\; \; Q(\mu, \omega | \Mat{Y}_{\text{t, pre}}, \Mat{Y}_{\text{c, pre}}; \lambda, \alpha) & = \ltwo{ \Mat{Y}_{\text{t, pre}} - \mu - \omega \tra \Mat{Y}_{\text{c, pre}} }^2 \\ + & \; \; \lambda \Bigpar{\frac{1-\alpha}{2} \ltwo{\omega}^2 + \alpha \lone{\omega} } \end{align*}\]
#' Solve for Imbens-Doudchenko elastic net synthetic control weights in CVXR
#'
#' Unconstrained fit with intercept; L1/L2 regularisation replaces the
#' adding-up and non-negativity constraints of classic SC.
#' @param y_t t_0 X 1 matrix of pre-treatment outcomes for treatment unit
#' @param y_c t_0 X n_0 matrix of pre-treatment outcomes for donor units
#' @param lambdas vector of penalty values
#' @param alpha scalar mixing value between L1 and L2 regularisation
#' @param t number of lambdas to try (when range not manually specified)
#' @return (n_0 + 1) X |lambdas| matrix: first row is the intercept μ,
#'   remaining rows are donor weights ω, one column per λ
#' @import CVXR
#' @export
en_sc_solve = function(y_t, y_c, lambdas = NULL, alpha = 0.5, t = 10){
  # sequence of lambdas
  if (is.null(lambdas)) lambdas = 10^seq(-2, log10(max(y_t)), length.out = t)
  # penalty term: λ ( α ||ω||_1 + (1-α)/2 ||ω||_2^2 )
  elastic_penalty = function(ω, λ = 0, α = 0) {
    lasso = cvxr_norm(ω, 1) * α
    # elastic-net ridge term is the *squared* L2 norm (sum_squares);
    # cvxr_norm(ω, 2) would be the unsquared norm and mis-scale the penalty
    ridge = sum_squares(ω) * (1 - α) / 2
    λ * (lasso + ridge)
  }
  # targets : intercept and weights
  μ = Variable(1); ω = Variable(ncol(y_c))
  en_solve = function(l) {
    obj = (sum_squares(y_t - μ - y_c %*% ω) / (2 * nrow(y_c)) +
             elastic_penalty(ω, l, alpha))
    # unconstrained problem, just apply L1 and L2 regularisation to weights
    prob = Problem(Minimize(obj))
    sol = solve(prob, solver = 'MOSEK')
    c(sol$getValue(μ), sol$getValue(ω))
  }
  # solve for each value of lambda
  solution_mat = lapply(lambdas, en_solve)
  # convert to (n_0 + 1) × |lambdas| matrix
  solution_mat = do.call(cbind, solution_mat)
  return(solution_mat)
}
Since we don’t have the coarse-regularisation induced by A2 + A3 any more, we need regularisation.
\[\begin{align*} (\wh{\mu}^{en} (j; \lambda, \alpha) , \wh{\omega}^{en}(j; \lambda, \alpha) ) & = \argmin_{\mu, \omega} \; \sum_{t=1}^{T_0} \Bigpar{Y_{j,t} - \mu - \sum_{i \neq 0, j} \omega_i Y_{i,t} }^2 + \\ & \lambda \Bigpar{\frac{1-\alpha}{2} \ltwo{\omega}^2 + \alpha \lone{\omega} } \end{align*}\]
pick the value of the tuning parameters \((\alpha_{opt}^{en}, \lambda_{opt}^{en})\) that minimises
\[ CV^{en}(\alpha, \lambda) = \frac{1}{N} \sum_{j=1}^N \Bigpar{ Y_{j, T} - \overbrace{ \Bigpar{ \wh{\mu}^{en}(j; \alpha, \lambda) + \sum_{i \neq 0, j} \wh{\omega}_i^{en} (j; \alpha, \lambda) \cdot Y_{i, T} } }^{\wh{Y}_{j,T}(0)} }^2 \]
Currently sets \(\alpha=0.5\) instead of tuning it. Can be relaxed readily, but doesn’t make much difference in this example.
# %% pseudo-treatment prediction to pick λ
y_ctrl = wide[, -(1:2)] |> as.matrix() # all control units in matrix
# log-spaced grid of candidate penalties
lambdas = 10^seq(-1, log10(max(y_ctrl)), length.out = 20)
# %%
# Cross-validate λ by treating donor j as a pseudo-treated unit.
# Relies on globals T0 / T1 (pre/post period counts) from the data-prep chunk.
pick_lambda = function(j, Y, lambdas){
  # pre period data
  ypre = head(Y, T0)
  y_j = ypre[, j]; y_nj = ypre[, -j]
  # fit μ, ω for pseudo-treatment unit j on the remaining donors
  ω_tilde = en_sc_solve(y_j, y_nj, lambdas)
  # compute prediction error on post-period
  ypost = tail(Y, T1)
  # mean *squared* error — without squaring, positive and negative residuals
  # cancel and the criterion no longer measures fit
  mse_j = function(w) mean((ypost[, j] - cbind(1, ypost[, -j]) %*% w)^2)
  # each column in ω_tilde is a set of weights and intercepts for a given λ, so
  # summarise across columns
  mses = apply(ω_tilde, 2, mse_j)
  # lambda that minimises error (first in case of ties)
  lambdas[which.min(mses)]
}
# %% # for small number of donors, can compute for all ; in other cases, pick randomly?
system.time(
  lam_choices <- mclapply(seq_len(ncol(y_ctrl)),
    pick_lambda, y_ctrl, 10^seq(-1, log10(max(y_ctrl)), length.out = 20),
    mc.cores = 6
  )
)
# tabulate best lambda and pick the mode
lam_choices |> as.numeric() |> table() |> sort() |> tail(1) |> names() |>
  as.numeric() |> round(2)
# %%
Use the above \(\lambda\) to fit Elastic-net synthetic control
tic()
# fit with the cross-validated λ picked above
en_ω = en_sc_solve(y_treat_pre, y_ctrl_pre, 2517.62)
toc()
## 0.489 sec elapsed
# %% compute and plot
wide3 = copy(wide)
# impute Y(0) post for treated unit using weights (first element of en_ω is the intercept)
wide3$y0_hat_en = as.matrix(cbind(1, wide3[, -(1:2)])) %*% en_ω
wide3$treat_effect = wide3[[treat_name]] - wide3$y0_hat_en
# %%
# observed (black) vs elastic-net counterfactual (blue) series
ensc_fit = ggplot(wide3, aes(year)) +
  geom_line(aes(y = `West Germany`), size = 2) +
  geom_line(aes(y = y0_hat_en), size = 2, alpha = 0.7, color = 'cornflowerblue') +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1990, linetype = "dashed", color = 'red') +
  labs(title = "Outcome ENSC Series", y = "")
# %%
ensc_est = ggplot(wide3, aes(year, treat_effect)) + geom_line() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1990, linetype = "dashed", color = 'red') +
  ylim(c(-9000, 6000)) +
  labs(title = "ENSC Estimates", y = "")
Results look very similar.
| sc_est) / (ensc_fit | ensc_est) (sc_fit
Elastic net weights are more dispersed and aren’t constrained to be positive. This appears not to matter in this example, but may help in others.
# side-by-side weight comparison: classic SC has no intercept, hence the 0
wt_table2 = data.frame(
  donor = c('intercept', colnames(y_ctrl_pre)),
  wt_sc = c(0, round(ω_sc, 2)),
  wt_en = round(en_ω, 2))
wt_table2 |> kable()
donor | wt_sc | wt_en |
---|---|---|
intercept | 0.00 | 296.98 |
Australia | 0.00 | 0.00 |
Austria | 0.32 | 0.27 |
Belgium | 0.00 | 0.03 |
Denmark | 0.00 | 0.00 |
France | 0.04 | 0.05 |
Greece | 0.10 | 0.13 |
Italy | 0.06 | 0.28 |
Japan | 0.00 | 0.00 |
Netherlands | 0.00 | 0.01 |
New Zealand | 0.00 | -0.04 |
Norway | 0.03 | 0.09 |
Portugal | 0.00 | 0.00 |
Spain | 0.00 | -0.06 |
Switzerland | 0.11 | 0.01 |
UK | 0.00 | 0.00 |
USA | 0.34 | 0.25 |