Data file: Dummies.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
## beautify the plot made with matplotlib
import seaborn as sns
# Switch on seaborn's styling for the matplotlib figures drawn further down.
sns.set()

# Read the SAT / GPA / Attendance sample from the local practice folder.
csv_path = 'C:\\Users\\Python_practice\\1.03. Dummies.csv'
raw_data = pd.read_csv(csv_path)

# Bare expression: in a notebook this renders the frame as the cell output.
raw_data
# Attendance is "Yes" when the student attended more than 75% of the lessons
| | SAT | GPA | Attendance |
---|---|---|---|
0 | 1714 | 2.40 | No |
1 | 1664 | 2.52 | No |
2 | 1760 | 2.54 | No |
3 | 1685 | 2.74 | No |
4 | 1693 | 2.83 | No |
… | … | … | … |
79 | 1936 | 3.71 | Yes |
80 | 1810 | 3.71 | Yes |
81 | 1987 | 3.73 | No |
82 | 1962 | 3.76 | Yes |
83 | 2050 | 3.81 | Yes |
84 rows × 3 columns
# Work on a copy so the Yes/No recoding below never modifies raw_data.
data = raw_data.copy()

# Recode the Attendance labels as a 0/1 dummy variable (Yes -> 1, No -> 0).
attendance_codes = {'Yes': 1, 'No': 0}
data['Attendance'] = data['Attendance'].map(attendance_codes)

# Summary statistics of the recoded frame (rendered as the cell output).
data.describe()
| | SAT | GPA | Attendance |
---|---|---|---|
count | 84.000000 | 84.000000 | 84.000000 |
mean | 1845.273810 | 3.330238 | 0.464286 |
std | 104.530661 | 0.271617 | 0.501718 |
min | 1634.000000 | 2.400000 | 0.000000 |
25% | 1772.000000 | 3.190000 | 0.000000 |
50% | 1846.000000 | 3.380000 | 0.000000 |
75% | 1934.000000 | 3.502500 | 1.000000 |
max | 2050.000000 | 3.810000 | 1.000000 |
# Response variable is GPA; the predictors are the SAT score and the 0/1
# Attendance dummy.
y = data['GPA']
x1 = data[['SAT', 'Attendance']]

# Fit ordinary least squares (OLS) with statsmodels.api. add_constant prepends
# the intercept column. Because the predictors are handed over as a plain
# NumPy array, the summary labels them x1 (SAT) and x2 (Attendance).
x = sm.add_constant(x1.to_numpy())
model = sm.OLS(endog=y, exog=x)
result = model.fit()

# Bare expression: in a notebook this renders the regression summary.
result.summary()
Dep. Variable: | GPA | R-squared: | 0.565 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.555 |
Method: | Least Squares | F-statistic: | 52.70 |
Date: | Fri, 24 Jan 2020 | Prob (F-statistic): | 2.19e-15 |
Time: | 15:48:14 | Log-Likelihood: | 25.798 |
No. Observations: | 84 | AIC: | -45.60 |
Df Residuals: | 81 | BIC: | -38.30 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 0.6439 | 0.358 | 1.797 | 0.076 | -0.069 | 1.357 |
x1 | 0.0014 | 0.000 | 7.141 | 0.000 | 0.001 | 0.002 |
x2 | 0.2226 | 0.041 | 5.451 | 0.000 | 0.141 | 0.304 |
Omnibus: | 19.560 | Durbin-Watson: | 1.009 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 27.189 |
Skew: | -1.028 | Prob(JB): | 1.25e-06 |
Kurtosis: | 4.881 | Cond. No. | 3.35e+04 |
## according to the calculation above, we know y=0.6439+0.0014*SAT+0.2226*Dummy(Attendance)
##Dummy=0, yhat_no = 0.6439 + 0.0014*SAT
##Dummy=1, yhat_yes = 0.8665 + 0.0014*SAT
## Use matplotlib.pyplot to draw the regression lines over the data.
# Points are coloured by the Attendance dummy. With the reversed RdYlGn map,
# 0 (No) is drawn green and 1 (Yes) red, matching the two line colours below.
plt.scatter(data['SAT'], y, c=data['Attendance'], cmap='RdYlGn_r')

# Take the coefficients from the fitted model rather than the 4-decimal values
# printed in the summary: the SAT slope is multiplied by scores near 2000, so
# rounding it visibly shifts the lines. add_constant put the intercept first,
# followed by the predictors in x1's column order (SAT, Attendance).
intercept, sat_slope, attendance_shift = np.asarray(result.params)
yhat_no = intercept + sat_slope * data['SAT']                      # Dummy = 0
yhat_yes = intercept + attendance_shift + sat_slope * data['SAT']  # Dummy = 1

# SAT-only regression line (no Attendance dummy variable). These coefficients
# are not produced anywhere in this file -- presumably they come from a
# separate single-variable fit; verify before reusing them.
yhat = 0.0017 * data['SAT'] + 0.275

plt.plot(data['SAT'], yhat_no, lw=2, c='#006837', label='Attendance = No')    # green line
plt.plot(data['SAT'], yhat_yes, lw=2, c='#a50026', label='Attendance = Yes')  # red line
plt.plot(data['SAT'], yhat, lw=2, c='orange', label='regression line')        # orange line

plt.xlabel('SAT', fontsize=20)
plt.ylabel('GPA', fontsize=20)
# Without this call the label= arguments above are never displayed.
plt.legend()
plt.show()