Linear Model in Python


import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
# Load the homework dataset from Excel and give the columns short,
# readable names (order must match the spreadsheet's column order),
# then display the table.
column_names = ['GPA', 'SAT_Math', 'SAT_Verbal', 'HS_Math', 'HS_English']
df = pd.read_excel('HW5.xls')
df.columns = column_names
df
GPA SAT_Math SAT_Verbal HS_Math HS_English
0 1.97 321 247 2.30 2.63
1 2.74 718 436 3.80 3.57
2 2.19 358 578 2.98 2.57
3 2.60 403 447 3.58 2.21
4 2.98 640 563 3.38 3.48
5 1.65 237 342 1.48 2.14
6 1.89 270 472 1.67 2.64
7 2.38 418 356 3.73 2.52
8 2.66 443 327 3.09 3.20
9 1.96 359 385 1.54 3.46
10 3.14 669 664 3.21 3.37
11 1.96 409 518 2.77 2.60
12 2.20 582 364 1.47 2.90
13 3.90 750 632 3.14 3.49
14 2.02 451 435 1.54 3.20
15 3.61 645 704 3.50 3.74
16 3.07 791 341 3.20 2.93
17 2.63 521 483 3.59 3.32
18 3.11 594 665 3.42 2.70
19 3.20 653 606 3.69 3.52
# Visualize pairwise relationships: scatterplots with fitted regression
# lines off the diagonal, univariate distributions on the diagonal.
sns.set(style="ticks", color_codes=True)
pair_grid = sns.pairplot(df, kind="reg")
plt.show()

# Compute and display the pairwise Pearson correlation matrix of the
# variables.  (The original cell referenced the undefined name
# `Correlation_matrix`, which raises NameError; the surrounding analysis
# makes clear the intent was to inspect correlations before fitting.)
Correlation_matrix = df.corr()
Correlation_matrix

# Fit an ordinary-least-squares model with GPA as the response and the
# two SAT scores plus the two high-school grades as predictors, then
# print the full regression summary.
formula = 'GPA ~ SAT_Math + SAT_Verbal + HS_Math + HS_English'
results = smf.ols(formula, data=df).fit()
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    GPA   R-squared:                       0.853
Model:                            OLS   Adj. R-squared:                  0.814
Method:                 Least Squares   F-statistic:                     21.72
Date:                Wed, 02 Aug 2017   Prob (F-statistic):           4.25e-06
Time:                        09:11:33   Log-Likelihood:                0.79507
No. Observations:                  20   AIC:                             8.410
Df Residuals:                      15   BIC:                             13.39
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1615      0.438      0.369      0.717      -0.771       1.094
SAT_Math       0.0020      0.001      3.439      0.004       0.001       0.003
SAT_Verbal     0.0013      0.001      2.270      0.038    7.67e-05       0.002
HS_Math        0.1894      0.092      2.062      0.057      -0.006       0.385
HS_English     0.0876      0.176      0.496      0.627      -0.289       0.464
==============================================================================
Omnibus:                        0.673   Durbin-Watson:                   2.284
Prob(Omnibus):                  0.714   Jarque-Bera (JB):                0.120
Skew:                           0.185   Prob(JB):                        0.942
Kurtosis:                       3.086   Cond. No.                     5.56e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.56e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Extract the per-term t-statistics, drop the intercept (not of interest
# for ranking predictors), and display the remaining t-values.
t = results.tvalues.drop(labels=['Intercept'])
t
SAT_Math      3.439467
SAT_Verbal    2.270444
HS_Math       2.062090
HS_English    0.496122
dtype: float64
# Show the predictors that are statistically significant by the t > 2
# rule of thumb (all t-values in this fit are positive, so the one-sided
# comparison suffices).
significant = t.loc[t.gt(2)]
print(significant)
SAT_Math      3.439467
SAT_Verbal    2.270444
HS_Math       2.062090
dtype: float64
# Report the strongest and weakest predictors by t-value.  print() with
# comma-separated arguments inserts a space between each, reproducing the
# original message spacing exactly.
strongest = t.idxmax()
print("The most significant variable is", strongest, ", and its t-value is", t[strongest])

weakest = t.idxmin()
print("The most insignificant variable is", weakest, ", and its t-value is", t[weakest])
The most significant variable is SAT_Math , and its t-value is 3.43946691293
The most insignificant variable is HS_English , and its t-value is 0.496122306692
# Build the out-of-sample applicant table used for prediction: one row
# per applicant, columns matching the model's predictor names.
applicant_scores = {
    'Jacob':  [680, 550, 3.8, 3.8],
    'Kelly':  [650, 500, 3.0, 4],
    'Stuart': [450, 600, 2.5, 4.5],
    'Jeremy': [503, 520, 3.25, 3.6],
    'Linda':  [750, 600, 3.95, 4],
}
predict_df = pd.DataFrame.from_dict(
    applicant_scores,
    orient='index',
    columns=['SAT_Math', 'SAT_Verbal', 'HS_Math', 'HS_English'])
predict_df
SAT_Math SAT_Verbal HS_Math HS_English
Jacob 680 550 3.80 3.8
Kelly 650 500 3.00 4.0
Stuart 450 600 2.50 4.5
Jeremy 503 520 3.25 3.6
Linda 750 600 3.95 4.0
# Refit after dropping the insignificant HS_English term; '- 1' in the
# formula also removes the intercept, which was indistinguishable from
# zero in the first model (p = 0.717).  Print the new summary.
improved_formula = 'GPA ~ SAT_Math + SAT_Verbal + HS_Math - 1'
lm = smf.ols(improved_formula, data=df).fit()
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    GPA   R-squared:                       0.991
Model:                            OLS   Adj. R-squared:                  0.990
Method:                 Least Squares   F-statistic:                     656.4
Date:                Wed, 02 Aug 2017   Prob (F-statistic):           9.11e-18
Time:                        09:11:49   Log-Likelihood:               -0.35977
No. Observations:                  20   AIC:                             6.720
Df Residuals:                      17   BIC:                             9.707
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
SAT_Math       0.0023      0.000      5.034      0.000       0.001       0.003
SAT_Verbal     0.0016      0.000      3.483      0.003       0.001       0.003
HS_Math        0.2168      0.085      2.565      0.020       0.038       0.395
==============================================================================
Omnibus:                        0.055   Durbin-Watson:                   1.899
Prob(Omnibus):                  0.973   Jarque-Bera (JB):                0.281
Skew:                           0.003   Prob(JB):                        0.869
Kurtosis:                       2.419   Cond. No.                     1.02e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Score the applicants with the improved model and attach the predicted
# GPA as a new column, aligned on the applicant index; display the table.
gpa_forecast = lm.predict(predict_df)
predict_df['Predicted_GPA'] = pd.Series(gpa_forecast, index=predict_df.index)
predict_df
SAT_Math SAT_Verbal HS_Math HS_English Predicted_GPA
Jacob 680 550 3.80 3.8 3.284888
Kelly 650 500 3.00 4.0 2.960764
Stuart 450 600 2.50 4.5 2.557058
Jeremy 503 520 3.25 3.6 2.710301
Linda 750 600 3.95 4.0 3.559927
# Derive an Admission flag: admit anyone whose predicted GPA clears the
# 3.0 bar, then display the final table.
predict_df['Admission'] = np.where(predict_df['Predicted_GPA'].gt(3), 'Yes', 'No')
predict_df
SAT_Math SAT_Verbal HS_Math HS_English Predicted_GPA Admission
Jacob 680 550 3.80 3.8 3.284888 Yes
Kelly 650 500 3.00 4.0 2.960764 No
Stuart 450 600 2.50 4.5 2.557058 No
Jeremy 503 520 3.25 3.6 2.710301 No
Linda 750 600 3.95 4.0 3.559927 Yes