import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix was removed in newer scikit-learn releases
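# Load the processed Cleveland heart disease data (UCI Machine Learning Repository).
# The file has no header row, so column names are assigned below.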
df = pd.read_csv('processed.cleveland.data', header=None)
df.head()
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
df.columns = ['age',
              'sex',
              'cp',
              'restbp',
              'chol',
              'fbs',
              'restecg',
              'thalach',
              'exang',
              'oldpeak',
              'slope',
              'ca',
              'thal',
              'hd']
df.head()
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
df.dtypes
age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
hd           int64
dtype: object
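# 'ca' and 'thal' were read in as object (string) columns even though their values
# look numeric; inspect their unique values to see why.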
df['ca'].unique()
df['thal'].unique()
array(['6.0', '3.0', '7.0', '?'], dtype=object)
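# The array above is for 'thal'; both columns use '?' as a placeholder for missing
# values, which is why pandas stored them as strings.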
len(df.loc[(df['ca'] == '?')
        |
        (df['thal'] == '?')])
6
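# Only 6 rows contain a '?', so look at them and then drop them.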
df.loc[(df['ca'] == '?')
        |
        (df['thal'] == '?')]
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
87 53.0 0.0 3.0 128.0 216.0 0.0 2.0 115.0 0.0 0.0 1.0 0.0 ? 0
166 52.0 1.0 3.0 138.0 223.0 0.0 0.0 169.0 0.0 0.0 1.0 ? 3.0 0
192 43.0 1.0 4.0 132.0 247.0 1.0 2.0 143.0 1.0 0.1 2.0 ? 7.0 1
266 52.0 1.0 4.0 128.0 204.0 1.0 0.0 156.0 1.0 1.0 2.0 0.0 ? 2
287 58.0 1.0 2.0 125.0 220.0 0.0 0.0 144.0 0.0 0.4 2.0 ? 7.0 0
302 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0
len(df)
303
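# Optional cross-check (not part of the original flow): count the '?' placeholders per
# column. Every column except 'ca' and 'thal' is numeric, so only those two can hold
# the string '?'.
df.isin(['?']).sum()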
df_no_missing = df.loc[(df['ca'] != '?')
                &
                (df['thal'] != '?')]
len(df_no_missing)
297
df_no_missing['ca'].unique()
array(['0.0', '3.0', '2.0', '1.0'], dtype=object)
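# Split the cleaned data into the feature matrix X and the target y ('hd').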
X = df_no_missing.drop('hd', axis=1).copy()
X.head()
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0
y = df_no_missing['hd'].copy()
y.head()
0    0
1    2
2    1
3    0
4    0
Name: hd, dtype: int64
X.dtypes
age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
dtype: object
X['cp'].unique()
array([1., 4., 3., 2.])
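# 'cp' (chest pain type) takes the values 1-4, so it is categorical rather than numeric.
# One-hot encode it with get_dummies; the same treatment is applied below to the other
# categorical columns ('restecg', 'slope', 'thal').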
pd.get_dummies(X, columns=['cp']).head()
age sex restbp chol fbs restecg thalach exang oldpeak slope ca thal cp_1.0 cp_2.0 cp_3.0 cp_4.0
0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 1 0 0 0
1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 0 0 0 1
2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0 0 1 0
4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0 1 0 0
X_encoded = pd.get_dummies(X, columns=['cp',
                                       'restecg',
                                       'slope',
                                       'thal'])
X_encoded.head()
age sex restbp chol fbs thalach exang oldpeak ca cp_1.0 ... cp_4.0 restecg_0.0 restecg_1.0 restecg_2.0 slope_1.0 slope_2.0 slope_3.0 thal_3.0 thal_6.0 thal_7.0
0 63.0 1.0 145.0 233.0 1.0 150.0 0.0 2.3 0.0 1 ... 0 0 0 1 0 0 1 0 1 0
1 67.0 1.0 160.0 286.0 0.0 108.0 1.0 1.5 3.0 0 ... 1 0 0 1 0 1 0 1 0 0
2 67.0 1.0 120.0 229.0 0.0 129.0 1.0 2.6 2.0 0 ... 1 0 0 1 0 1 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 187.0 0.0 3.5 0.0 0 ... 0 1 0 0 0 0 1 1 0 0
4 41.0 0.0 130.0 204.0 0.0 172.0 0.0 1.4 0.0 0 ... 0 0 0 1 1 0 0 1 0 0

5 rows × 22 columns
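# 'hd' is coded 0-4: 0 means no heart disease and values above 0 indicate heart disease.
# Collapse everything above 0 to 1 so this becomes a binary classification problem.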

y_not_zero_index = y > 0
y[y_not_zero_index] = 1
y.unique()
array([0, 1], dtype=int64)
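# Split the one-hot encoded data into training and testing sets, then fit an unpruned
# decision tree as a baseline.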
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
labels = ["Does not have HD", "Has HD"]

ConfusionMatrixDisplay.from_estimator(clf_dt, X_test, y_test, display_labels=labels)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2b37f2bc940>
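# Cost complexity pruning: extract the candidate alpha values for this tree and drop
# the last one, which corresponds to pruning the tree all the way down to the root.
# Then fit one tree per alpha and compare training and testing accuracy.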
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas = ccp_alphas[:-1]

clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs alphas for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()
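# A single train/test split can be misleading, so use 5-fold cross-validation to see
# how much the accuracy for one candidate alpha varies across folds.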
clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=0.0016)
scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
cv_df = pd.DataFrame(data={'tree': range(5), 'accuracy': scores})  # new name so the raw df is not overwritten

cv_df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x2b309b39ac8>
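# Run 5-fold cross-validation for every candidate alpha and record the mean and
# standard deviation of the accuracy.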
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])

alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x2b30a43eda0>
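# The cross-validation plot suggests the best alpha lies between 0.014 and 0.015;
# pull out the corresponding row and keep its alpha value.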
alpha_results[(alpha_results['alpha'] > 0.014)
                &
                (alpha_results['alpha'] < 0.015)]
alpha mean_accuracy std
21 0.014225 0.738788 0.098832
ideal_ccp_alpha = alpha_results[(alpha_results['alpha'] > 0.014)
                &
                (alpha_results['alpha'] < 0.015)]['alpha']
ideal_ccp_alpha
21    0.014225
Name: alpha, dtype: float64
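# ideal_ccp_alpha is a one-element Series, so extract the scalar before passing it
# to DecisionTreeClassifier.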
clf_dt_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=float(ideal_ccp_alpha.iloc[0]))
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned,
                                      X_test,
                                      y_test,
                                      display_labels=labels)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2b309df0710>
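# Draw the final pruned tree.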
plt.figure(figsize=(15, 7.5))
tree.plot_tree(clf_dt_pruned,
               filled=True,
               rounded=True,
               class_names=["No HD", "Yes HD"],
               feature_names=X_encoded.columns)
[Text(418.5, 305.775, 'ca <= 0.5\ngini = 0.498\nsamples = 222\nvalue = [118, 104]\nclass = No HD'),
 Text(209.25, 101.92500000000001, 'gini = 0.382\nsamples = 132\nvalue = [98, 34]\nclass = No HD'),
 Text(627.75, 101.92500000000001, 'gini = 0.346\nsamples = 90\nvalue = [20, 70]\nclass = Yes HD')]
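# Optional follow-up (not in the original notebook): summarize the pruned tree on the
# held-out test set with overall accuracy and the raw confusion-matrix counts
# (confusion_matrix is imported above but otherwise unused).
print("Test accuracy:", clf_dt_pruned.score(X_test, y_test))
print(confusion_matrix(y_test, clf_dt_pruned.predict(X_test)))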