In [1]: import numpy as np
   ...: from scipy import stats
In [2]: import statsmodels.api as sm
In [3]: import matplotlib
In [4]: import matplotlib.pyplot as plt
In [5]: from statsmodels.sandbox.regression.predstd import wls_prediction_std
Fix a seed for these examples:
In [6]: np.random.seed(9876789)
In [7]: nsample = 50
In [8]: sig = 0.5
In [9]: x1 = np.linspace(0, 20, nsample)
In [10]: X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]
In [11]: beta = [0.5, 0.5, -0.02, 5.]
In [12]: y_true = np.dot(X, beta)
In [13]: y = y_true + sig * np.random.normal(size=nsample)
In [14]: plt.figure()
Out[14]: <matplotlib.figure.Figure at 0xf183112c>
In [15]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[15]:
[<matplotlib.lines.Line2D at 0xf178f36c>,
<matplotlib.lines.Line2D at 0xf178fa0c>]
In [16]: res = sm.OLS(y, X).fit()
In [17]: print(res.params)
[ 0.47040466 0.2931004 -0.01826292 5.24935422]
In [18]: print(res.bse)
[ 0.02853117 0.11215937 0.00250506 0.18499717]
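The reported standard errors are simply the square roots of the diagonal of the estimated coefficient covariance matrix; a quick sanity check, using only quantities already computed in this session:

    # should reproduce res.bse above
    print(np.sqrt(np.diag(res.cov_params())))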
In [19]: print(res.predict())
[ 4.79278116 5.17262168 5.52726298 5.84073136 6.10281792
6.31075592 6.46967527 6.59175976 6.69424528 6.796588
6.91726779 7.07075203 7.26511865 7.50072896 7.77016828
8.05946415 8.35038197 8.62342091 8.8610178 9.05043274
9.18584224 9.26929595 9.31037998 9.32464188 9.33103621
9.34881043 9.39434252 9.47845015 9.60461339 9.76840293
9.95820778 10.15714295 10.34582361 10.50554994 10.62137947
10.68458209 10.69407437 10.65659757 10.58561009 10.49907624
10.41651482 10.3557922 10.33018692 10.34620807 10.4025259
10.49019023 10.594101 10.69548911 10.77500019 10.81587443]
In [20]: prstd, iv_l, iv_u = wls_prediction_std(res)
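wls_prediction_std returns the standard error of a new observation at each row of the design matrix, together with the lower and upper limits of the corresponding 95% interval. A rough by-hand sketch of the same idea (for illustration only, not the exact sandbox implementation):

    from scipy import stats

    covb = res.cov_params()                       # covariance of the coefficient estimates
    # noise variance plus the variance of the fitted mean at each row of X
    pred_var = res.mse_resid + (X * np.dot(X, covb)).sum(axis=1)
    pred_std = np.sqrt(pred_var)
    tval = stats.t.isf(0.025, res.df_resid)       # two-sided 95% critical value
    lower = res.fittedvalues - tval * pred_std
    upper = res.fittedvalues + tval * pred_std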
In [21]: plt.plot(x1, res.fittedvalues, 'r--.')
Out[21]: [<matplotlib.lines.Line2D at 0xf18243ec>]
In [22]: plt.plot(x1, iv_u, 'r--')
Out[22]: [<matplotlib.lines.Line2D at 0xf1e5dbcc>]
In [23]: plt.plot(x1, iv_l, 'r--')
Out[23]: [<matplotlib.lines.Line2D at 0xf1e5dfac>]
In [24]: plt.title('blue: true, red: OLS')
Out[24]: <matplotlib.text.Text at 0xf17a032c>
In [25]: print(res.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Sat, 29 Nov 2014   Prob (F-statistic):           3.30e-25
Time:                        15:58:15   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================
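Most of the quantities shown in the summary are also available directly as attributes of the results object, which is more convenient for programmatic use, for example:

    print(res.rsquared, res.rsquared_adj)   # R-squared and adjusted R-squared
    print(res.fvalue, res.f_pvalue)         # F-statistic and its p-value
    print(res.aic, res.bic)                 # information criteria
    print(res.conf_int())                   # 95% confidence intervals for the coefficients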
In [26]: sig = 1.
Now suppose the observations come from 3 groups:
In [27]: xg = np.zeros(nsample, int)
In [28]: xg[20:40] = 1
In [29]: xg[40:] = 2
In [30]: print(xg)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 2 2 2 2 2 2 2 2 2 2]
In [31]: dummy = (xg[:,None] == np.unique(xg)).astype(float)
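The broadcasting comparison above builds one 0/1 indicator column per group level. An equivalent construction, shown here only as a sketch, indexes an identity matrix by the group labels:

    # row i picks out the identity column for group xg[i]
    dummy_alt = np.eye(len(np.unique(xg)))[xg]
    assert np.allclose(dummy_alt, dummy)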
Use group 0 as the benchmark; its dummy column is dropped:
In [32]: X = np.c_[x1, dummy[:,1:], np.ones(nsample)]
In [33]: beta = [1., 3, -3, 10]
In [34]: y_true = np.dot(X, beta)
In [35]: y = y_true + sig * np.random.normal(size=nsample)
In [36]: res2 = sm.OLS(y, X).fit()
In [37]: print(res2.params)
[ 0.98475721 3.44866047 -2.68110961 9.77981018]
In [38]: print(res2.bse)
[ 0.06801982 0.64590456 1.05239527 0.35213863]
In [40]: prstd, iv_l, iv_u = wls_prediction_std(res2)
In [43]: plt.figure()
Out[43]: <matplotlib.figure.Figure at 0xf180b02c>
In [44]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[44]:
[<matplotlib.lines.Line2D at 0xf1840bac>,
<matplotlib.lines.Line2D at 0xf1840e6c>]
In [45]: plt.plot(x1, res2.fittedvalues, 'r--.')
Out[45]: [<matplotlib.lines.Line2D at 0xf1ba570c>]
In [46]: plt.plot(x1, iv_u, 'r--')
Out[46]: [<matplotlib.lines.Line2D at 0xf1e61c0c>]
In [47]: plt.plot(x1, iv_l, 'r--')
Out[47]: [<matplotlib.lines.Line2D at 0xf184530c>]
In [48]: plt.title('blue: true, red: OLS')
Out[48]: <matplotlib.text.Text at 0xf18618cc>
In [50]: R = [[0, 1, 0, 0],
   ....:      [0, 0, 1, 0]]
   ....:
F test of the joint hypothesis R * beta = 0, i.e. that the coefficients on both dummy variables are zero:
In [51]: print(res2.f_test(R))
<F test: F=array([[ 124.30756422]]), p=[[ 0.]], df_denom=46, df_num=2>
The null hypothesis of an identical constant across the 3 groups is strongly rejected.
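The F statistic can be reproduced by hand from the coefficient estimates and their covariance: with q restrictions, F = (R b)' [R cov(b) R']^{-1} (R b) / q. A minimal sketch of that check, using res2 and R from above:

    # by-hand F statistic for the restriction R beta = 0; should match the output above
    Rm = np.asarray(R, dtype=float)
    rb = np.dot(Rm, res2.params)
    inv_mid = np.linalg.inv(np.dot(np.dot(Rm, res2.cov_params()), Rm.T))
    print(np.dot(rb, np.dot(inv_mid, rb)) / Rm.shape[0])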
In [52]: help(res2.f_test)
Help on method f_test in module statsmodels.base.model:

f_test(self, r_matrix, q_matrix=None, cov_p=None, scale=1.0, invcov=None) method of statsmodels.regression.linear_model.OLSResults instance
    Compute an Fcontrast/F-test for a contrast matrix.

    Here, matrix `r_matrix` is assumed to be non-singular. More precisely,

        r_matrix (pX pX.T) r_matrix.T

    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.

    Parameters
    ----------
    r_matrix : array-like
        q x p array where q is the number of restrictions to test and
        p is the number of regressors in the full model fit.
        If q is 1 then f_test(r_matrix).fvalue is equivalent to
        the square of t_test(r_matrix).t
    q_matrix : array-like
        q x 1 array, that represents the sum of each linear restriction.
        Default is all zeros for each restriction.
    scale : float, optional
        Default is 1.0 for no scaling.
    invcov : array-like, optional
        A q x q matrix to specify an inverse covariance
        matrix based on a restrictions matrix.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[:-1,:]

    This tests that each coefficient is jointly statistically
    significantly different from zero.

    >>> print results.f_test(A)
    <F contrast: F=330.28533923463488, p=4.98403052872e-10,
     df_denom=9, df_num=6>

    Compare this to

    >>> results.F
    330.2853392346658
    >>> results.F_p
    4.98403096572e-10

    >>> B = np.array(([0,1,-1,0,0,0,0],[0,0,0,0,1,-1,0]))

    This tests that the coefficient on the 2nd and 3rd regressors are
    equal and jointly that the coefficient on the 5th and 6th regressors
    are equal.

    >>> print results.f_test(B)
    <F contrast: F=9.740461873303655, p=0.00560528853174, df_denom=9,
     df_num=2>

    See also
    --------
    statsmodels.contrasts
    statsmodels.model.t_test
t test of the null hypothesis that the effects of the 2nd and 3rd group are equal, i.e. that their difference is zero:
In [53]: R = [0, 1, -1, 0]
In [54]: print(res2.t_test(R))
<T test: effect=array([ 6.12977008]), sd=array([[ 0.58029388]]), t=array([[ 10.5632168]]), p=array([[ 0.]]), df_denom=46>
The estimated difference between the two group effects is about 6.13 with t = 10.56, so the null hypothesis of equal effects is strongly rejected (two-sided p is essentially zero).
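The t statistic itself can also be checked by hand: the estimated contrast is r'b with standard error sqrt(r' cov(b) r). A minimal sketch, assuming res2 and R = [0, 1, -1, 0] from above:

    # by-hand t statistic for the single restriction r' beta = 0
    r = np.asarray(R, dtype=float)
    effect = np.dot(r, res2.params)
    se = np.sqrt(np.dot(r, np.dot(res2.cov_params(), r)))
    print(effect / se)                        # should match the t value above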
OLS with small group effects
In [55]: beta = [1., 0.3, -0.0, 10]
In [56]: y_true = np.dot(X, beta)
In [57]: y = y_true + sig * np.random.normal(size=nsample)
In [58]: res3 = sm.OLS(y, X).fit()
In [59]: print(res3.f_test(R))
<F test: F=array([[ 0.4929838]]), p=[[ 0.48613705]], df_denom=46, df_num=1>
With only small group effects, the same restriction is no longer rejected (F = 0.49, p = 0.49).