import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# Load the data set into a DataFrame. header=None tells pandas not to treat
# the first row as column names, so the columns are labelled 0..N-1.
# NOTE(review): the path argument is an empty string, which cannot be opened;
# the real file name appears to have been lost — confirm and restore it.
df = pd.read_csv('', header=None)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
# Replace the default integer column labels with descriptive names.
# The list must contain exactly one name per column (14 in total), with the
# target 'hd' last.
df.columns = [
    'age',
    'sex',
    'cp',
    'restbp',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'hd',
]
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
hd           int64
dtype: object
array(['6.0', '3.0', '7.0', '?'], dtype=object)
# Missing values are encoded as the string '?' in the 'ca' and 'thal' columns.
# A row counts as incomplete when EITHER column holds '?', hence the
# element-wise OR (|) between the two boolean masks.
rows_with_missing = (df['ca'] == '?') | (df['thal'] == '?')

# How many rows are incomplete?
len(df.loc[rows_with_missing])

# Show the incomplete rows themselves.
df.loc[rows_with_missing]
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
87 53.0 0.0 3.0 128.0 216.0 0.0 2.0 115.0 0.0 0.0 1.0 0.0 ? 0
166 52.0 1.0 3.0 138.0 223.0 0.0 0.0 169.0 0.0 0.0 1.0 ? 3.0 0
192 43.0 1.0 4.0 132.0 247.0 1.0 2.0 143.0 1.0 0.1 2.0 ? 7.0 1
266 52.0 1.0 4.0 128.0 204.0 1.0 0.0 156.0 1.0 1.0 2.0 0.0 ? 2
287 58.0 1.0 2.0 125.0 220.0 0.0 0.0 144.0 0.0 0.4 2.0 ? 7.0 0
302 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0
# Keep only the complete rows: a row survives when NEITHER 'ca' NOR 'thal'
# holds the '?' placeholder, hence the element-wise AND (&).
df_no_missing = df.loc[(df['ca'] != '?')
                       &
                       (df['thal'] != '?')]
array(['0.0', '3.0', '2.0', '1.0'], dtype=object)
# Feature matrix: every column except the target 'hd'. The explicit copy
# keeps later edits to X from touching df_no_missing.
X = df_no_missing.drop(columns='hd').copy()
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0
# Target vector: the 'hd' column, copied so it can be modified independently
# of df_no_missing.
y = df_no_missing.loc[:, 'hd'].copy()
array([1., 4., 3., 2.])
# Preview one-hot encoding on a single column ('cp') and look at the first
# five rows; the result is not stored.
pd.get_dummies(data=X, columns=['cp']).head(5)
age sex restbp chol fbs restecg thalach exang oldpeak slope ca thal cp_1.0 cp_2.0 cp_3.0 cp_4.0
0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 1 0 0 0
1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 0 0 0 1
2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0 0 1 0
4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0 1 0 0
# One-hot encode the categorical features that take more than two values.
# Each listed column is replaced by one indicator column per category
# (e.g. cp -> cp_1.0 ... cp_4.0); all other columns pass through unchanged.
X_encoded = pd.get_dummies(X, columns=['cp',
                                       'restecg',
                                       'slope',
                                       'thal'])
age sex restbp chol fbs thalach exang oldpeak ca cp_1.0 ... cp_4.0 restecg_0.0 restecg_1.0 restecg_2.0 slope_1.0 slope_2.0 slope_3.0 thal_3.0 thal_6.0 thal_7.0
0 63.0 1.0 145.0 233.0 1.0 150.0 0.0 2.3 0.0 1 ... 0 0 0 1 0 0 1 0 1 0
1 67.0 1.0 160.0 286.0 0.0 108.0 1.0 1.5 3.0 0 ... 1 0 0 1 0 1 0 1 0 0
2 67.0 1.0 120.0 229.0 0.0 129.0 1.0 2.6 2.0 0 ... 1 0 0 1 0 1 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 187.0 0.0 3.5 0.0 0 ... 0 1 0 0 0 0 1 1 0 0
4 41.0 0.0 130.0 204.0 0.0 172.0 0.0 1.4 0.0 0 ... 0 0 0 1 1 0 0 1 0 0

5 rows × 22 columns

# Collapse the target to a binary label: every value greater than 0 becomes 1,
# while 0 is left as it is.
y_not_zero_index = y.gt(0)
y.loc[y_not_zero_index] = 1
array([0, 1], dtype=int64)
# Split the ONE-HOT ENCODED features (not the raw X) into training and testing
# sets; otherwise the encoding computed into X_encoded is never used and the
# categorical codes would be treated as ordered numbers by the tree.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

# Build a preliminary, unpruned decision tree and fit it to the training data.
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

# Evaluate the unpruned tree on the held-out data as a confusion matrix.
labels = ["Does not have HD", "Has HD"]

plot_confusion_matrix(clf_dt, X_test, y_test, display_labels=labels)
# Compute the effective alphas of the cost-complexity pruning path for the
# training data, together with the total leaf impurity at each alpha.
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Drop the last (largest) alpha. NOTE(review): per the scikit-learn pruning
# example this value prunes the tree down to a single node — confirm.
ccp_alphas = ccp_alphas[:-1]

# Fit one tree per candidate alpha and keep every fitted tree.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)

# Accuracy of each tree on the training and on the testing data, in the same
# order as ccp_alphas.
train_scores = [fitted.score(X_train, y_train) for fitted in clf_dts]
test_scores = [fitted.score(X_test, y_test) for fitted in clf_dts]

# Plot training and testing accuracy as a function of alpha.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alphas for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
# Without a legend the label= arguments above are never shown and the two
# curves cannot be told apart.
ax.legend()
# Run 5-fold cross validation on the training data for one fixed alpha to see
# how much the accuracy varies from fold to fold.
clf_dt = DecisionTreeClassifier(ccp_alpha=0.0016, random_state=42)
scores = cross_val_score(clf_dt, X_train, y_train, cv=5)

# One row per fold: the fold number and the accuracy obtained on it.
fold_accuracy = {'tree': range(5), 'accuracy': scores}
df = pd.DataFrame(data=fold_accuracy)

df.plot(x='tree', y='accuracy', linestyle='--', marker='o')
# For every candidate alpha, run 5-fold cross validation on the training data
# and record the alpha with the mean and standard deviation of the accuracy.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=0)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
    mean_accuracy, std_accuracy = scores.mean(), scores.std()
    alpha_loop_values.append([ccp_alpha, mean_accuracy, std_accuracy])

# Collect the per-alpha summaries into a table, one row per alpha.
result_columns = ['alpha', 'mean_accuracy', 'std']
alpha_results = pd.DataFrame(alpha_loop_values, columns=result_columns)

# Show the cross-validation results whose alpha lies strictly between 0.014
# and 0.015. Both bounds must hold at once, hence the element-wise AND (&).
alpha_results[(alpha_results['alpha'] > 0.014)
              &
              (alpha_results['alpha'] < 0.015)]
alpha mean_accuracy std
21 0.014225 0.738788 0.098832
# Select the 'alpha' value(s) lying strictly between 0.014 and 0.015. Both
# bounds must hold at once, hence the element-wise AND (&). The result is a
# pandas Series (indexed like alpha_results), not a plain float.
ideal_ccp_alpha = alpha_results[(alpha_results['alpha'] > 0.014)
                                &
                                (alpha_results['alpha'] < 0.015)]['alpha']
21    0.014225
Name: alpha, dtype: float64
# Build the final, pruned tree with the alpha selected into ideal_ccp_alpha.
# ideal_ccp_alpha is a pandas Series, so take its first entry as a plain
# float. (The loop variable ccp_alpha must NOT be used here: it merely holds
# the last alpha visited by the preceding loop.)
clf_dt_pruned = DecisionTreeClassifier(random_state=42,
                                       ccp_alpha=float(ideal_ccp_alpha.iloc[0]))
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# Draw the pruned tree, labelling the split features with the training
# column names and the classes with readable names.
plt.figure(figsize=(15, 7.5))
tree.plot_tree(clf_dt_pruned,
               class_names=["No HD", "Yes HD"],
               feature_names=list(X_train.columns))
[Text(418.5, 305.775, 'ca <= 0.5\ngini = 0.498\nsamples = 222\nvalue = [118, 104]\nclass = No HD'),
 Text(209.25, 101.92500000000001, 'gini = 0.382\nsamples = 132\nvalue = [98, 34]\nclass = No HD'),
 Text(627.75, 101.92500000000001, 'gini = 0.346\nsamples = 90\nvalue = [20, 70]\nclass = Yes HD')]