library(LalRUtils)
libreq(tidyverse, data.table, foreach, magrittr, tictoc, readxl, doParallel,
patchwork)
## wants loaded
## [1,] "tidyverse" TRUE
## [2,] "data.table" TRUE
## [3,] "foreach" TRUE
## [4,] "magrittr" TRUE
## [5,] "tictoc" TRUE
## [6,] "readxl" TRUE
## [7,] "doParallel" TRUE
## [8,] "patchwork" TRUE
theme_set(lal_plot_theme())
set.seed(42)
if (!file.exists("millions.XLS")){
download.file("http://www.columbia.edu/~xs23/data/millions.XLS", "millions.XLS")
}
vcodes = read_excel("millions.XLS", sheet = "Variable code") %>% setDT
colnames(vcodes) = c('rawname', 'name')
growth_dat = read_excel("millions.XLS", sheet = "BARROSHO") %>% setDT
growth_dat[, gamma := as.numeric(gamma)]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
perm = c("X1", "X2", "X3")
others = setdiff(paste0("X", 4:62), "X35") # because 35 is missing for some reason
Then, for each variable \(z\), compute
\[ \hat{\beta}_z = \sum_{j = 1}^M \omega_{zj} \beta_{zj} \]
where the weights are
\[ \omega_{wj} = \frac{L_{zj}}{\sum_{i=1}^m L_{zi}} \]
the variance is computed analogously.
salaimartin = function(v){
all_others = others[others != v] # all other vars
rhses = t(combn(all_others, 3)) # combinations of all other vars
formulae = apply(rhses, 1, function(x) paste(c(perm, v , x), collapse = " + "))
# parallelise
registerDoParallel(4)
res = foreach(d=1:length(formulae), .combine='rbind') %dopar% {
f = formulae[[d]]
m = lm(as.formula(paste0('gamma ~', f)), data = growth_dat)
β_j = coef(m)[v] ; σ_j = vcov(m)[v,v]
c(β_j, σ_j, logLik(m))
}
stopImplicitCluster()
res = data.frame(res)
β = res[, 1]; σ = res[, 2]; L = res[, 3]
sum_L = sum(L); ω = L / sum(L)
β_j_bar = weighted.mean(β, ω)
σ_j_bar = weighted.mean(σ, ω)
CDF = ecdf(β)
cdf0 = max(CDF(0), 1 - CDF(0))
list(c(β_j_bar, σ_j_bar, cdf0), res)
}
# %% run everything
tic()
results = map(others, salaimartin)
toc()
saveRDS(results, 'XSI_regs.rds')
results = readRDS('XSI_regs.rds')
vcodes2 = vcodes[rawname %in% others]
distplotter = function(k){
out = results[[k]]
sumstats = out[[1]]
sims = out[[2]]
colnames(sims) = c("beta", "SE", "LL")
p = ggplot(sims, aes(x = beta)) + geom_density() +
geom_vline(xintercept = sumstats[1], linetype = 'dotted', colour = 'red') +
geom_vline(xintercept = 0, colour = 'cornflowerblue') +
labs(title = vcodes2[k, 2]$name, x = '', y = '', subtitle = glue::glue("{round(sumstats[1], 2)}"))
return(p)
}
map(1:length(results), distplotter) %>%
wrap_plots(nrow = 12) +
plot_annotation("Two Million Regressions!")