In [54]:
import pandas as pd
# Read the red-wine quality table from the CSV file in the working directory.
redwine_df = pd.read_csv(filepath_or_buffer='winequality-red2.csv')

# Print the row count, then show the first five rows as the cell's output.
print(redwine_df.shape[0])
redwine_df.head(5)
1599
Out[54]:
fixed_acidity volatile_acidity citric_acid residual_sugar chlorides free_sulfur_dioxide total_sulfur_dioxide density ph sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [35]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default styling to the figures drawn from here on.
sns.set()

# One scatter panel per feature, each plotted against `quality`.
# fit_reg=False disables regplot's fitted line so only the raw points show.
# The panels come from a loop rather than repeated calls, so adding or
# swapping a feature is a one-word change; ending the cell on a statement
# also means no Axes repr is echoed below the figure.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip((ax1, ax2, ax3),
                       ("fixed_acidity", "volatile_acidity", "citric_acid")):
    sns.regplot(data=redwine_df, x=feature, y="quality", fit_reg=False, ax=ax)
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1aef4660748>
In [38]:
# One scatter panel per feature, each plotted against `quality`.
# fit_reg=False disables regplot's fitted line so only the raw points show.
# The panels come from a loop rather than repeated calls; ending the cell on
# a statement also means no Axes repr is echoed below the figure.
fig, (ax4, ax5, ax6) = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip((ax4, ax5, ax6),
                       ("residual_sugar", "chlorides", "free_sulfur_dioxide")):
    sns.regplot(data=redwine_df, x=feature, y="quality", fit_reg=False, ax=ax)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1aef529fc88>
In [39]:
# One scatter panel per feature, each plotted against `quality`.
# fit_reg=False disables regplot's fitted line so only the raw points show.
# The panels come from a loop rather than repeated calls; ending the cell on
# a statement also means no Axes repr is echoed below the figure.
fig, (ax7, ax8, ax9) = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip((ax7, ax8, ax9),
                       ("total_sulfur_dioxide", "density", "ph")):
    sns.regplot(data=redwine_df, x=feature, y="quality", fit_reg=False, ax=ax)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1aef41a69b0>
In [41]:
# One scatter panel per feature, each plotted against `quality`.
# fit_reg=False disables regplot's fitted line so only the raw points show.
# The panels come from a loop rather than repeated calls; ending the cell on
# a statement also means no Axes repr is echoed below the figure.
fig, (ax10, ax11) = plt.subplots(1, 2, figsize=(10, 5))
for ax, feature in zip((ax10, ax11), ("sulphates", "alcohol")):
    sns.regplot(data=redwine_df, x=feature, y="quality", fit_reg=False, ax=ax)
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x1aef45864e0>
In [51]:
# Remove outlier observations from the data.
# Each feature has a strict upper bound; a row is kept only when every
# listed feature is below its bound.
OUTLIER_UPPER_BOUNDS = {
    'volatile_acidity': 1.2,
    'chlorides': 0.5,
    'total_sulfur_dioxide': 200,
    'ph': 3.8,
    'sulphates': 1.5,
    'alcohol': 14,
    'fixed_acidity': 15,
    'citric_acid': 0.9,
    'residual_sugar': 9,
    'free_sulfur_dioxide': 55,
    'density': 1.002,
}

upper_bounds = pd.Series(OUTLIER_UPPER_BOUNDS)
# Compare each bounded column with its own limit (aligned on column label),
# then require all comparisons in a row to hold.
below_all_bounds = (
    redwine_df[upper_bounds.index]
    .lt(upper_bounds, axis='columns')
    .all(axis=1)
)
redwine_df2 = redwine_df[below_all_bounds]

# Number of rows that survive the filter.
len(redwine_df2)
Out[51]:
1543
In [55]:
# Regression analysis
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Predictors: every column except the target `quality`.
# `columns=` is used instead of the positional axis argument
# (`drop('quality', 1)`), which newer pandas releases no longer accept.
X = redwine_df2.drop(columns='quality')
# sm.OLS does not add an intercept by itself, so prepend a constant column.
X = sm.add_constant(X)
# Response variable.
Y = redwine_df2['quality']

# Fit ordinary least squares of quality on all predictors and print the
# full text summary (coefficients, standard errors, fit diagnostics).
model = sm.OLS(Y, X)
result = model.fit()
print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.376
Method:                 Least Squares   F-statistic:                     85.42
Date:                Tue, 16 Jul 2019   Prob (F-statistic):          1.91e-150
Time:                        12:53:01   Log-Likelihood:                -1477.9
No. Observations:                1543   AIC:                             2980.
Df Residuals:                    1531   BIC:                             3044.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   16.8216     22.349      0.753      0.452     -27.016      60.659
fixed_acidity            0.0162      0.027      0.595      0.552      -0.037       0.069
volatile_acidity        -1.0631      0.125     -8.486      0.000      -1.309      -0.817
citric_acid             -0.2830      0.150     -1.888      0.059      -0.577       0.011
residual_sugar           0.0061      0.020      0.300      0.765      -0.034       0.046
chlorides               -1.7823      0.450     -3.959      0.000      -2.665      -0.899
free_sulfur_dioxide      0.0037      0.002      1.614      0.107      -0.001       0.008
total_sulfur_dioxide    -0.0032      0.001     -4.110      0.000      -0.005      -0.002
density                -12.5037     22.818     -0.548      0.584     -57.262      32.255
ph                      -0.5555      0.201     -2.770      0.006      -0.949      -0.162
sulphates                1.1789      0.128      9.202      0.000       0.928       1.430
alcohol                  0.2955      0.028     10.618      0.000       0.241       0.350
==============================================================================
Omnibus:                       22.132   Durbin-Watson:                   1.740
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               33.368
Skew:                          -0.131   Prob(JB):                     5.68e-08
Kurtosis:                       3.671   Cond. No.                     1.16e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.16e+05. This might indicate that there are
strong multicollinearity or other numerical problems.