import pandas as pd
# Load the red-wine quality data set from the CSV file in the working directory.
redwine_df = pd.read_csv('winequality-red2.csv')
# Report the number of rows that were read.
print(redwine_df.shape[0])
# Preview the first rows (shown as the cell output when run in a notebook).
redwine_df.head()

1599

import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to the plots created below.
sns.set()

# Scatter quality against fixed acidity, volatile acidity and citric acid,
# one panel per feature. fit_reg=False draws the points only, without a
# fitted regression line.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
sns.regplot(x="fixed_acidity", y="quality", data=redwine_df, ax=ax1, fit_reg=False)
sns.regplot(x="volatile_acidity", y="quality", data=redwine_df, ax=ax2, fit_reg=False)
sns.regplot(x="citric_acid", y="quality", data=redwine_df, ax=ax3, fit_reg=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1aef4660748>

# Scatter quality against residual sugar, chlorides and free sulfur dioxide,
# one panel per feature. fit_reg=False draws the points only, without a
# fitted regression line.
fig, (ax4, ax5, ax6) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
sns.regplot(x="residual_sugar", y="quality", data=redwine_df, ax=ax4, fit_reg=False)
sns.regplot(x="chlorides", y="quality", data=redwine_df, ax=ax5, fit_reg=False)
sns.regplot(x="free_sulfur_dioxide", y="quality", data=redwine_df, ax=ax6, fit_reg=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1aef529fc88>

# Scatter quality against total sulfur dioxide, density and pH,
# one panel per feature. fit_reg=False draws the points only, without a
# fitted regression line.
fig, (ax7, ax8, ax9) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
sns.regplot(x="total_sulfur_dioxide", y="quality", data=redwine_df, ax=ax7, fit_reg=False)
sns.regplot(x="density", y="quality", data=redwine_df, ax=ax8, fit_reg=False)
sns.regplot(x="ph", y="quality", data=redwine_df, ax=ax9, fit_reg=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1aef41a69b0>

# Scatter quality against sulphates and alcohol, one panel per feature.
# fit_reg=False draws the points only, without a fitted regression line.
fig, (ax10, ax11) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
sns.regplot(x="sulphates", y="quality", data=redwine_df, ax=ax10, fit_reg=False)
sns.regplot(x="alcohol", y="quality", data=redwine_df, ax=ax11, fit_reg=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1aef45864e0>

# Remove outlier observations from the data: keep only the rows whose value
# is strictly below the upper bound for every feature listed here.
upper_bounds = pd.Series({
    'volatile_acidity': 1.2,
    'chlorides': 0.5,
    'total_sulfur_dioxide': 200,
    'ph': 3.8,
    'sulphates': 1.5,
    'alcohol': 14,
    'fixed_acidity': 15,
    'citric_acid': 0.9,
    'residual_sugar': 9,
    'free_sulfur_dioxide': 55,
    'density': 1.002,
})
# Compare each column with its own bound, then require all comparisons to hold.
within_bounds = (
    redwine_df[upper_bounds.index]
    .lt(upper_bounds, axis='columns')
    .all(axis=1)
)
redwine_df2 = redwine_df[within_bounds]
# Number of observations that remain after filtering.
len(redwine_df2)

1543

# Regression analysis: ordinary least squares of quality on all other columns.
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Use every column except the target as a predictor. The keyword form
# `columns=` is required: passing the axis positionally, as in
# drop('quality', 1), was deprecated in pandas 1.3 and raises TypeError
# from pandas 2.0 onward.
X = redwine_df2.drop(columns='quality')
# sm.OLS does not include an intercept by default, so add a constant column.
X = sm.add_constant(X)
Y = redwine_df2['quality']
model = sm.OLS(Y, X)
result = model.fit()
# Print the coefficient table and fit diagnostics.
print(result.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.376
Method:                 Least Squares   F-statistic:                     85.42
Date:                Tue, 16 Jul 2019   Prob (F-statistic):          1.91e-150
Time:                        12:53:01   Log-Likelihood:                -1477.9
No. Observations:                1543   AIC:                             2980.
Df Residuals:                    1531   BIC:                             3044.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   16.8216     22.349      0.753      0.452     -27.016      60.659
fixed_acidity            0.0162      0.027      0.595      0.552      -0.037       0.069
volatile_acidity        -1.0631      0.125     -8.486      0.000      -1.309      -0.817
citric_acid             -0.2830      0.150     -1.888      0.059      -0.577       0.011
residual_sugar           0.0061      0.020      0.300      0.765      -0.034       0.046
chlorides               -1.7823      0.450     -3.959      0.000      -2.665      -0.899
free_sulfur_dioxide      0.0037      0.002      1.614      0.107      -0.001       0.008
total_sulfur_dioxide    -0.0032      0.001     -4.110      0.000      -0.005      -0.002
density                -12.5037     22.818     -0.548      0.584     -57.262      32.255
ph                      -0.5555      0.201     -2.770      0.006      -0.949      -0.162
sulphates                1.1789      0.128      9.202      0.000       0.928       1.430
alcohol                  0.2955      0.028     10.618      0.000       0.241       0.350
==============================================================================
Omnibus:                       22.132   Durbin-Watson:                   1.740
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               33.368
Skew:                          -0.131   Prob(JB):                     5.68e-08
Kurtosis:                       3.671   Cond. No.                     1.16e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.16e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

	fixed_acidity	volatile_acidity	citric_acid	residual_sugar	chlorides	free_sulfur_dioxide	total_sulfur_dioxide	density	ph	sulphates	alcohol	quality
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.9968	3.20	0.68	9.8	5
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.9970	3.26	0.65	9.8	5
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.9980	3.16	0.58	9.8	6
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5