Regressions

Published:

regressions
In [3]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from linearmodels.iv import IV2SLS

%matplotlib inline 
/home/alal/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

Manual OLS calculation compared with the standard statsmodels routine

Simulate data

In [4]:
# Simulate a quadratic design matrix: columns [x, x^2] on a grid over [0, 10].
nsample = 100
# Use nsample here instead of repeating the magic number 100, so changing the
# sample size in one place propagates to the whole simulation.
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))
In [5]:
# True coefficients for the DGP: [intercept, linear, quadratic].
beta = np.array([1, 0.1, 10])

# Seed the RNG so the notebook is reproducible under Restart & Run All.
# (The printed estimates below were generated without a seed and will change
# on the first re-run after this fix.)
np.random.seed(42)
e = np.random.normal(size=nsample)

# Prepend the intercept column, then build y = X @ beta + noise.
X = sm.add_constant(X)
y = np.dot(X, beta) + e
In [6]:
beta
Out[6]:
array([ 1. ,  0.1, 10. ])

$$ \hat{\beta} = (X'X)^{-1}X'Y $$

In [7]:
# Closed-form OLS: beta_hat = (X'X)^{-1} X'y.
# Solve the normal equations directly instead of forming an explicit inverse —
# np.linalg.solve is cheaper and numerically more stable than inv() followed
# by a matrix product.
beta0 = np.linalg.solve(X.T @ X, X.T @ y)
beta0.T
Out[7]:
array([ 1.0154608 ,  0.08317293, 10.00008995])
In [8]:
# Same regression via statsmodels, to confirm it matches the manual estimate.
ols_model = sm.OLS(y, X)
beta1 = ols_model.fit()
beta1.params
Out[8]:
array([ 1.0154608 ,  0.08317293, 10.00008995])
In [9]:
# Fitted values from the statsmodels fit.
# NOTE(review): yhat is never used later in the notebook — candidate for removal.
yhat = beta1.predict()
print(beta1.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.215e+06
Date:                Sun, 11 Mar 2018   Prob (F-statistic):          2.85e-240
Time:                        18:30:48   Log-Likelihood:                -144.16
No. Observations:                 100   AIC:                             294.3
Df Residuals:                      97   BIC:                             302.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0155      0.305      3.324      0.001       0.409       1.622
x1             0.0832      0.141      0.589      0.557      -0.197       0.363
x2            10.0001      0.014    732.046      0.000       9.973      10.027
==============================================================================
Omnibus:                        2.388   Durbin-Watson:                   1.821
Prob(Omnibus):                  0.303   Jarque-Bera (JB):                2.385
Skew:                           0.331   Prob(JB):                        0.304
Kurtosis:                       2.632   Cond. No.                         144.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

IV

In [10]:
# Load the Mroz (1987) married-women labor-supply dataset shipped with
# linearmodels, keep complete cases, and add an explicit constant column.
# NOTE(review): imports belong in the top import cell, not mid-notebook.
from statsmodels.api import add_constant
from linearmodels.datasets import mroz
print(mroz.DESCR)
data = mroz.load()
data = data.dropna()  # keeps only women with observed wages (428 rows per summaries below)
data = add_constant(data, has_constant='add')
# Log hourly wage.
# NOTE(review): the dataset already ships an `lwage` column (see DESCR above);
# this duplicates it under a new name.
data['lnwage']=np.log(data.wage)
T.A. Mroz (1987), "The Sensitivity of an Empirical Model of Married Women's
Hours of Work to Economic and Statistical Assumptions," Econometrica 55,
765-799.

nlf        1 if in labor force, 1975
hours      hours worked, 1975
kidslt6    # kids < 6 years
kidsge6    # kids 6-18
age        woman's age in yrs
educ       years of schooling
wage       estimated wage from earns., hours
repwage    reported wage at interview in 1976
hushrs     hours worked by husband, 1975
husage     husband's age
huseduc    husband's years of schooling
huswage    husband's hourly wage, 1975
faminc     family income, 1975
mtr        fed. marginal tax rate facing woman
motheduc   mother's years of schooling
fatheduc   father's years of schooling
unem       unem. rate in county of resid.
city       =1 if live in SMSA
exper      actual labor mkt exper
nwifeinc   (faminc - wage*hours)/1000
lwage      log(wage)
expersq    exper^2

In [11]:
# Manual OLS of log wage on education, via the normal equations.
# .as_matrix() was deprecated and removed in pandas (>= 1.0) — use .to_numpy().
y = np.log(data.wage).to_numpy()
X = data[['const', 'educ']].to_numpy()

# NOTE(review): this clobbers the simulation-section globals y, X, and beta.
# Solve the normal equations instead of forming an explicit inverse.
beta = np.linalg.solve(X.T @ X, X.T @ y)
beta
Out[11]:
array([-0.18519681,  0.10864865])
In [12]:
data.head()
Out[12]:
constinlfhourskidslt6kidsge6ageeducwagerepwagehushrs...mtrmotheducfatheducunemcityexpernwifeinclwageexpersqlnwage
01.0116101032123.35402.652708...0.72151275.001410.9100601.2101541961.210154
11.0116560230121.38892.652310...0.66157711.01519.4999800.328512250.328512
21.0119801335124.54554.043072...0.69151275.001512.0399101.5141382251.514138
31.014560334121.09653.251920...0.7815775.0066.7999960.092123360.092123
41.0115681231144.59183.602000...0.621512149.51720.1000601.524272491.524272

5 rows × 24 columns

In [13]:
# OLS benchmark via the formula interface: log wage on education.
# (Biased if educ is endogenous — motivates the IV estimates below.)
wage_formula = 'lnwage ~ educ'
res_ols = sm.formula.ols(wage_formula, data=data).fit()
print(res_ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 lnwage   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     56.93
Date:                Sun, 11 Mar 2018   Prob (F-statistic):           2.76e-13
Time:                        18:30:49   Log-Likelihood:                -441.26
No. Observations:                 428   AIC:                             886.5
Df Residuals:                     426   BIC:                             894.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1852      0.185     -1.000      0.318      -0.549       0.179
educ           0.1086      0.014      7.545      0.000       0.080       0.137
==============================================================================
Omnibus:                       91.833   Durbin-Watson:                   1.985
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              303.790
Skew:                          -0.956   Prob(JB):                     1.08e-66
Kurtosis:                       6.658   Cond. No.                         72.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [14]:
# First stage: regress the endogenous regressor (educ) on the instrument
# (father's education) plus a constant.
first_stage = sm.OLS(data.educ,
                     data[['fatheduc', 'const']]).fit()
print(first_stage.summary())
# Look the coefficient up by name: positional indexing on a pandas Series
# (params[0]) is deprecated and silently breaks if column order changes.
b_fs = first_stage.params['fatheduc']

# Fitted values x-hat, used as the regressor in the manual second stage.
data['educ_hat'] = first_stage.predict()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   educ   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     88.84
Date:                Sun, 11 Mar 2018   Prob (F-statistic):           2.76e-19
Time:                        18:30:49   Log-Likelihood:                -920.02
No. Observations:                 428   AIC:                             1844.
Df Residuals:                     426   BIC:                             1852.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.2694      0.029      9.426      0.000       0.213       0.326
const         10.2371      0.276     37.099      0.000       9.695      10.779
==============================================================================
Omnibus:                        9.919   Durbin-Watson:                   1.919
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               16.651
Skew:                          -0.093   Prob(JB):                     0.000242
Kurtosis:                       3.948   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [15]:
# Reduced form: regress the outcome (log wage) directly on the instrument.
reduced_form = sm.OLS(np.log(data.wage),
                      data[['fatheduc', 'const']]).fit()
# Name-based lookup instead of deprecated positional Series indexing.
b_rf = reduced_form.params['fatheduc']
print(reduced_form.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Sun, 11 Mar 2018   Prob (F-statistic):              0.109
Time:                        18:30:49   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.0159      0.010      1.608      0.109      -0.004       0.035
const          1.0469      0.096     10.939      0.000       0.859       1.235
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [16]:
# Manual second stage: regress log wage on the first-stage fitted values.
# NOTE(review): the point estimate matches 2SLS, but these standard errors do
# not account for first-stage estimation — compare 0.037 here vs 0.0351 in the
# IV2SLS output below. Use a proper IV routine for inference.
second_stage = sm.OLS(np.log(data.wage),
                      data[['educ_hat', 'const']]).fit()
# Name-based lookup instead of deprecated positional Series indexing.
b_ss = second_stage.params['educ_hat']
print(second_stage.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Sun, 11 Mar 2018   Prob (F-statistic):              0.109
Time:                        18:30:50   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
educ_hat       0.0592      0.037      1.608      0.109      -0.013       0.131
const          0.4411      0.467      0.944      0.346      -0.477       1.359
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         171.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

$$ \beta_{iv} = \frac{cov(y_i,z_i)}{cov(x_i,z_i)}= \frac{\beta_{rf}}{\beta_{fs}} = \frac{cov(y_i,\hat{x_i})}{var(\hat{x_i})}$$

Calculate $\beta_{iv}$ in all three ways: proper 2SLS, the ratio of the reduced form to the first stage, and the manual second-stage regression.

In [17]:
# Proper 2SLS via linearmodels: lnwage on educ, instrumented by fatheduc.
# cov_type='unadjusted' requests homoskedastic SEs, comparable to plain OLS.
iv = IV2SLS.from_formula('lnwage ~ 1 + [educ ~ fatheduc]',
                    data=data).fit(cov_type='unadjusted')
print(iv.summary)
                          IV-2SLS Estimation Summary                          
==============================================================================
Dep. Variable:                 lnwage   R-squared:                      0.0934
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0913
No. Observations:                 428   F-statistic:                    2.8487
Date:                Sun, Mar 11 2018   P-value (F-stat)                0.0914
Time:                        18:30:51   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.4411     0.4451     0.9911     0.3216     -0.4312      1.3134
educ           0.0592     0.0351     1.6878     0.0914     -0.0095      0.1279
==============================================================================

Endogenous: educ
Instruments: fatheduc
Unadjusted Covariance (Homoskedastic)
Debiased: False
In [18]:
iv.params[1]
Out[18]:
0.05917348053415239
In [19]:
# Indirect least squares: beta_iv = reduced form / first stage.
b_iv = b_rf/b_fs
b_iv
Out[19]:
0.05917348053415326
In [20]:
b_ss
Out[20]:
0.059173480534153174

Standard Errors

Verbatim fork of this repository, kept for reference. I will likely attempt to implement Conley (1999) standard errors in Python.

In [24]:
# NOTE(review): this rebinds `sm` from statsmodels.api to statsmodels.formula.api,
# silently shadowing the earlier import. The cells below depend on the formula
# API being named `sm` (sm.ols), so the alias is kept, but a distinct name
# (conventionally `smf`) would be clearer.
import statsmodels.formula.api as sm
import statsmodels.stats.sandwich_covariance as sw

# pandas is used below (pd.read_table) but was never imported in this notebook —
# without this line the next cell raises NameError on a fresh kernel.
import pandas as pd
In [25]:
from urllib.request import urlopen

# Petersen's test dataset for comparing standard-error estimators
# (firm/year panel with one regressor).
# sep=r'\s+' replaces delim_whitespace=True, which is deprecated in recent
# pandas; the two are equivalent for whitespace-delimited files.
with urlopen('http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt') as conn:
    df = pd.read_table(conn, names=['firmid', 'year', 'x', 'y'],
                       sep=r'\s+')
df.head()
Out[25]:
firmidyearxy
011-1.1139732.251535
112-0.0808541.242346
213-0.237607-1.426376
314-0.152486-1.109394
415-0.0014260.914686
In [26]:
# Baseline: pooled OLS with classical (nonrobust) standard errors.
pooled_model = sm.ols(formula='y ~ x', data=df)
ols = pooled_model.fit(use_t=True)
ols.summary()
Out[26]:
OLS Regression Results
Dep. Variable:y R-squared: 0.208
Model:OLS Adj. R-squared: 0.208
Method:Least Squares F-statistic: 1311.
Date:Sun, 11 Mar 2018 Prob (F-statistic):4.25e-255
Time:18:31:53 Log-Likelihood: -10573.
No. Observations: 5000 AIC:2.115e+04
Df Residuals: 4998 BIC:2.116e+04
Df Model: 1
Covariance Type:nonrobust
coefstd errzP>|z|[0.0250.975]
Intercept 0.0297 0.028 1.047 0.295 -0.026 0.085
x 1.0348 0.029 36.204 0.000 0.979 1.091
Omnibus: 4.912 Durbin-Watson: 1.096
Prob(Omnibus): 0.086 Jarque-Bera (JB): 4.862
Skew: 0.070 Prob(JB): 0.0880
Kurtosis: 3.063 Cond. No. 1.01

White standard errors (reg y x, robust)

In [27]:
# Heteroskedasticity-robust (White/HC1) standard errors.
robust_ols = sm.ols(formula='y ~ x', data=df).fit(
    cov_type='HC1', use_t=True)
robust_ols.summary()
Out[27]:
OLS Regression Results
Dep. Variable:y R-squared: 0.208
Model:OLS Adj. R-squared: 0.208
Method:Least Squares F-statistic: 1328.
Date:Sun, 11 Mar 2018 Prob (F-statistic):4.29e-258
Time:18:32:37 Log-Likelihood: -10573.
No. Observations: 5000 AIC:2.115e+04
Df Residuals: 4998 BIC:2.116e+04
Df Model: 1
Covariance Type:HC1
coefstd errtP>|t|[0.0250.975]
Intercept 0.0297 0.028 1.047 0.295 -0.026 0.085
x 1.0348 0.028 36.444 0.000 0.979 1.091
Omnibus: 4.912 Durbin-Watson: 1.096
Prob(Omnibus): 0.086 Jarque-Bera (JB): 4.862
Skew: 0.070 Prob(JB): 0.0880
Kurtosis: 3.063 Cond. No. 1.01

Clustered

In [28]:
# Standard errors clustered by firm.
firm_groups = {'groups': df['firmid']}
cluster_firm_ols = sm.ols(formula='y ~ x', data=df).fit(
    cov_type='cluster', cov_kwds=firm_groups, use_t=True)
cluster_firm_ols.summary()
Out[28]:
OLS Regression Results
Dep. Variable:y R-squared: 0.208
Model:OLS Adj. R-squared: 0.208
Method:Least Squares F-statistic: 418.3
Date:Sun, 11 Mar 2018 Prob (F-statistic):5.61e-68
Time:18:33:05 Log-Likelihood: -10573.
No. Observations: 5000 AIC:2.115e+04
Df Residuals: 4998 BIC:2.116e+04
Df Model: 1
Covariance Type:cluster
coefstd errtP>|t|[0.0250.975]
Intercept 0.0297 0.067 0.443 0.658 -0.102 0.161
x 1.0348 0.051 20.453 0.000 0.935 1.134
Omnibus: 4.912 Durbin-Watson: 1.096
Prob(Omnibus): 0.086 Jarque-Bera (JB): 4.862
Skew: 0.070 Prob(JB): 0.0880
Kurtosis: 3.063 Cond. No. 1.01
In [29]:
# Standard errors clustered by year.
year_groups = {'groups': df['year']}
cluster_year_ols = sm.ols(formula='y ~ x', data=df).fit(
    cov_type='cluster', cov_kwds=year_groups, use_t=True)
cluster_year_ols.summary()
Out[29]:
OLS Regression Results
Dep. Variable:y R-squared: 0.208
Model:OLS Adj. R-squared: 0.208
Method:Least Squares F-statistic: 960.6
Date:Sun, 11 Mar 2018 Prob (F-statistic):1.86e-10
Time:18:33:16 Log-Likelihood: -10573.
No. Observations: 5000 AIC:2.115e+04
Df Residuals: 4998 BIC:2.116e+04
Df Model: 1
Covariance Type:cluster
coefstd errtP>|t|[0.0250.975]
Intercept 0.0297 0.023 1.269 0.236 -0.023 0.083
x 1.0348 0.033 30.993 0.000 0.959 1.110
Omnibus: 4.912 Durbin-Watson: 1.096
Prob(Omnibus): 0.086 Jarque-Bera (JB): 4.862
Skew: 0.070 Prob(JB): 0.0880
Kurtosis: 3.063 Cond. No. 1.01
In [30]:
# Two-way clustering: pass both firm and year group labels as a 2-column array.
two_way_groups = {'groups': np.array(df[['firmid', 'year']])}
cluster_2ways_ols = sm.ols(formula='y ~ x', data=df).fit(
    cov_type='cluster', cov_kwds=two_way_groups, use_t=True)
cluster_2ways_ols.summary()
Out[30]:
OLS Regression Results
Dep. Variable:y R-squared: 0.208
Model:OLS Adj. R-squared: 0.208
Method:Least Squares F-statistic: 373.3
Date:Sun, 11 Mar 2018 Prob (F-statistic):1.23e-08
Time:18:33:37 Log-Likelihood: -10573.
No. Observations: 5000 AIC:2.115e+04
Df Residuals: 4998 BIC:2.116e+04
Df Model: 1
Covariance Type:cluster
coefstd errtP>|t|[0.0250.975]
Intercept 0.0297 0.065 0.456 0.659 -0.118 0.177
x 1.0348 0.054 19.322 0.000 0.914 1.156
Omnibus: 4.912 Durbin-Watson: 1.096
Prob(Omnibus): 0.086 Jarque-Bera (JB): 4.862
Skew: 0.070 Prob(JB): 0.0880
Kurtosis: 3.063 Cond. No. 1.01

Fixed effects (Python really needs an equivalent of Stata's `areg` or `reghdfe` to absorb fixed effects instead of expanding dummies)

In [31]:
# Year fixed effects via dummy expansion (C(year) in the patsy formula),
# with classical standard errors.
year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(
    use_t=True)
year_fe_ols.summary()
Out[31]:
OLS Regression Results
Dep. Variable:y R-squared: 0.209
Model:OLS Adj. R-squared: 0.207
Method:Least Squares F-statistic: 131.6
Date:Sun, 11 Mar 2018 Prob (F-statistic):7.13e-245
Time:18:34:35 Log-Likelihood: -10570.
No. Observations: 5000 AIC:2.116e+04
Df Residuals: 4989 BIC:2.123e+04
Df Model: 10
Covariance Type:nonrobust
coefstd errzP>|z|[0.0250.975]
Intercept 0.1411 0.090 1.573 0.116 -0.035 0.317
C(year)[T.2] -0.0119 0.127 -0.094 0.925 -0.261 0.237
C(year)[T.3] -0.1453 0.127 -1.145 0.252 -0.394 0.103
C(year)[T.4] -0.2038 0.127 -1.607 0.108 -0.452 0.045
C(year)[T.5] -0.0604 0.127 -0.476 0.634 -0.309 0.188
C(year)[T.6] -0.1312 0.127 -1.034 0.301 -0.380 0.117
C(year)[T.7] -0.1975 0.127 -1.557 0.120 -0.446 0.051
C(year)[T.8] -0.1555 0.127 -1.225 0.220 -0.404 0.093
C(year)[T.9] -0.1535 0.127 -1.210 0.226 -0.402 0.095
C(year)[T.10] -0.0556 0.127 -0.438 0.661 -0.304 0.193
x 1.0351 0.029 36.160 0.000 0.979 1.091
Omnibus: 4.804 Durbin-Watson: 1.096
Prob(Omnibus): 0.091 Jarque-Bera (JB): 4.752
Skew: 0.069 Prob(JB): 0.0929
Kurtosis: 3.061 Cond. No. 10.9
In [32]:
# Year fixed effects combined with firm-clustered standard errors.
firm_groups = {'groups': df['firmid']}
firm_cluster_year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(
    cov_type='cluster', cov_kwds=firm_groups, use_t=True)
firm_cluster_year_fe_ols.summary()
Out[32]:
OLS Regression Results
Dep. Variable:y R-squared: 0.209
Model:OLS Adj. R-squared: 0.207
Method:Least Squares F-statistic: 43.23
Date:Sun, 11 Mar 2018 Prob (F-statistic):1.93e-61
Time:18:34:46 Log-Likelihood: -10570.
No. Observations: 5000 AIC:2.116e+04
Df Residuals: 4989 BIC:2.123e+04
Df Model: 10
Covariance Type:cluster
coefstd errtP>|t|[0.0250.975]
Intercept 0.1411 0.089 1.587 0.113 -0.034 0.316
C(year)[T.2] -0.0119 0.086 -0.139 0.890 -0.181 0.157
C(year)[T.3] -0.1453 0.086 -1.690 0.092 -0.314 0.024
C(year)[T.4] -0.2038 0.089 -2.288 0.023 -0.379 -0.029
C(year)[T.5] -0.0604 0.087 -0.697 0.486 -0.231 0.110
C(year)[T.6] -0.1312 0.084 -1.562 0.119 -0.296 0.034
C(year)[T.7] -0.1975 0.087 -2.275 0.023 -0.368 -0.027
C(year)[T.8] -0.1555 0.094 -1.662 0.097 -0.339 0.028
C(year)[T.9] -0.1535 0.088 -1.752 0.080 -0.326 0.019
C(year)[T.10] -0.0556 0.088 -0.634 0.526 -0.228 0.117
x 1.0351 0.051 20.361 0.000 0.935 1.135
Omnibus: 4.804 Durbin-Watson: 1.096
Prob(Omnibus): 0.091 Jarque-Bera (JB): 4.752
Skew: 0.069 Prob(JB): 0.0929
Kurtosis: 3.061 Cond. No. 10.9