import pandas as pd
# Load heart2.csv from the current working directory into a DataFrame.
heart_df = pd.read_csv(filepath_or_buffer="heart2.csv")
# Preview the first five rows (the default for head()).
heart_df.head(n=5)
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | heart_disease | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 70 | 1 | 4 | 130 | 322 | 0 | 2 | 109 | 0 | 2.4 | 2 | 3 | 3 | 0 |
1 | 67 | 0 | 3 | 115 | 564 | 0 | 2 | 160 | 0 | 1.6 | 2 | 0 | 7 | 1 |
2 | 57 | 1 | 2 | 124 | 261 | 0 | 0 | 141 | 0 | 0.3 | 1 | 0 | 7 | 1 |
3 | 64 | 1 | 4 | 128 | 263 | 0 | 0 | 105 | 1 | 0.2 | 2 | 1 | 7 | 1 |
4 | 74 | 0 | 2 | 120 | 269 | 0 | 2 | 121 | 1 | 0.2 | 1 | 1 | 3 | 0 |
# Columns kept for the correlation matrix and pair plot further down.
selected_columns = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
heart_df2 = heart_df[selected_columns]
# Preview the first five rows of the subset.
heart_df2.head(n=5)
age | trestbps | chol | thalach | oldpeak | |
---|---|---|---|---|---|
0 | 70 | 130 | 322 | 109 | 2.4 |
1 | 67 | 115 | 564 | 160 | 1.6 |
2 | 57 | 124 | 261 | 141 | 0.3 |
3 | 64 | 128 | 263 | 105 | 0.2 |
4 | 74 | 120 | 269 | 121 | 0.2 |
# Correlation coefficients
# Pairwise correlation matrix of the selected columns; 'pearson' is the
# pandas default, spelled out here for clarity.
df_corr = heart_df2.corr(method='pearson')
# Last expression of the cell, so the notebook displays the matrix.
df_corr
age | trestbps | chol | thalach | oldpeak | |
---|---|---|---|---|---|
age | 1.000000 | 0.273053 | 0.220056 | -0.402215 | 0.194234 |
trestbps | 0.273053 | 1.000000 | 0.173019 | -0.039136 | 0.222800 |
chol | 0.220056 | 0.173019 | 1.000000 | -0.018739 | 0.027709 |
thalach | -0.402215 | -0.039136 | -0.018739 | 1.000000 | -0.349045 |
oldpeak | 0.194234 | 0.222800 | 0.027709 | -0.349045 | 1.000000 |
# Scatter-plot matrix
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic (notebook-only syntax): render matplotlib figures inline
# in the cell output.
%matplotlib inline
# Apply seaborn's default styling before drawing.
sns.set()
# Plot the pairwise relationships between all columns of heart_df2.
sns.pairplot(heart_df2)
<seaborn.axisgrid.PairGrid at 0x1bb0202bc88>
# Regression analysis
# Convert the categorical variables to dummy (indicator) variables.
# drop_first=True omits the first level of each variable so the dummies are
# not perfectly collinear with the intercept used in the regression.
# dtype=float keeps the indicators as 0.0/1.0: pandas >= 2.0 defaults to bool
# dummies, which turn the mixed int/float/bool design matrix into an object
# array that statsmodels' OLS refuses to fit.
heart_df3 = pd.get_dummies(
    heart_df,
    columns=['cp', 'restecg', 'exang', 'slope', 'ca', 'thal'],
    drop_first=True,
    dtype=float,
)
# Preview the encoded frame.
heart_df3.head()
age | sex | trestbps | chol | fbs | thalach | oldpeak | heart_disease | cp_2 | cp_3 | ... | restecg_1 | restecg_2 | exang_1 | slope_2 | slope_3 | ca_1 | ca_2 | ca_3 | thal_6 | thal_7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 70 | 1 | 130 | 322 | 0 | 109 | 2.4 | 0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 67 | 0 | 115 | 564 | 0 | 160 | 1.6 | 1 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 57 | 1 | 124 | 261 | 0 | 141 | 0.3 | 1 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 64 | 1 | 128 | 263 | 0 | 105 | 0.2 | 1 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 74 | 0 | 120 | 269 | 0 | 121 | 0.2 | 0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 21 columns
# Regression model
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Explanatory variables: every column except the response (oldpeak) and the
# two thal dummies.  `axis` is passed by keyword because the positional form
# drop([...], 1) was deprecated in pandas 1.3 and raises TypeError from 2.0.
# NOTE(review): thal_6 / thal_7 are excluded without a stated reason, and
# heart_disease is kept as a regressor -- confirm both are intentional.
X = heart_df3.drop(['oldpeak', 'thal_6', 'thal_7'], axis=1)
# sm.OLS does not add an intercept by itself, so append the constant column.
X = sm.add_constant(X)
# Response variable.
Y = heart_df3['oldpeak']
# Fit ordinary least squares and print the coefficient table and diagnostics.
model = sm.OLS(Y, X)
result = model.fit()
print(result.summary())
OLS Regression Results ============================================================================== Dep. Variable: oldpeak R-squared: 0.479 Model: OLS Adj. R-squared: 0.441 Method: Least Squares F-statistic: 12.81 Date: Wed, 03 Jul 2019 Prob (F-statistic): 1.83e-26 Time: 22:50:17 Log-Likelihood: -331.26 No. Observations: 270 AIC: 700.5 Df Residuals: 251 BIC: 768.9 Df Model: 18 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- const 0.5145 0.817 0.630 0.529 -1.094 2.123 age -0.0022 0.007 -0.315 0.753 -0.016 0.012 sex 0.0707 0.130 0.545 0.586 -0.185 0.326 trestbps 0.0067 0.003 2.050 0.041 0.000 0.013 chol 8.803e-05 0.001 0.080 0.936 -0.002 0.002 fbs -0.3483 0.155 -2.253 0.025 -0.653 -0.044 thalach -0.0045 0.003 -1.495 0.136 -0.010 0.001 heart_disease 0.1908 0.131 1.461 0.145 -0.066 0.448 cp_2 -0.5630 0.243 -2.319 0.021 -1.041 -0.085 cp_3 -0.2468 0.223 -1.106 0.270 -0.686 0.193 cp_4 -0.2946 0.221 -1.333 0.184 -0.730 0.141 restecg_1 0.7466 0.636 1.173 0.242 -0.507 2.000 restecg_2 -0.0082 0.111 -0.074 0.941 -0.226 0.210 exang_1 0.1404 0.133 1.056 0.292 -0.121 0.402 slope_2 0.7615 0.127 6.015 0.000 0.512 1.011 slope_3 2.0832 0.231 9.014 0.000 1.628 2.538 ca_1 0.0422 0.144 0.293 0.770 -0.241 0.325 ca_2 0.5668 0.179 3.161 0.002 0.214 0.920 ca_3 0.5282 0.225 2.352 0.019 0.086 0.971 ============================================================================== Omnibus: 20.247 Durbin-Watson: 2.071 Prob(Omnibus): 0.000 Jarque-Bera (JB): 23.863 Skew: 0.605 Prob(JB): 6.58e-06 Kurtosis: 3.810 Cond. No. 5.23e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.23e+03. 
This might indicate that there are strong multicollinearity or other numerical problems.