import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix was removed in newer scikit-learn releases
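# Load the processed Cleveland heart disease data (UCI Machine Learning Repository).
# The file has no header row, so column names are assigned below.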
df = pd.read_csv('processed.cleveland.data', header=None)
df.head()
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
df.columns = ['age',
              'sex',
              'cp',
              'restbp',
              'chol',
              'fbs',
              'restecg',
              'thalach',
              'exang',
              'oldpeak',
              'slope',
              'ca',
              'thal',
              'hd']
df.head()
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0
df.dtypes
age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
hd           int64
dtype: object
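# 'ca' and 'thal' were read in as object (string) columns even though their values
# look numeric; inspect their unique values to see why.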
df['ca'].unique()
df['thal'].unique()
array(['6.0', '3.0', '7.0', '?'], dtype=object)
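# The array above is for 'thal'; both columns use '?' as a placeholder for missing
# values, which is why pandas stored them as strings.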
len(df.loc[(df['ca'] == '?')
        |
        (df['thal'] == '?')])
6
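# Only 6 rows contain a '?', so look at them and then drop them.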
df.loc[(df['ca'] == '?')
        |
        (df['thal'] == '?')]
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd
87 53.0 0.0 3.0 128.0 216.0 0.0 2.0 115.0 0.0 0.0 1.0 0.0 ? 0
166 52.0 1.0 3.0 138.0 223.0 0.0 0.0 169.0 0.0 0.0 1.0 ? 3.0 0
192 43.0 1.0 4.0 132.0 247.0 1.0 2.0 143.0 1.0 0.1 2.0 ? 7.0 1
266 52.0 1.0 4.0 128.0 204.0 1.0 0.0 156.0 1.0 1.0 2.0 0.0 ? 2
287 58.0 1.0 2.0 125.0 220.0 0.0 0.0 144.0 0.0 0.4 2.0 ? 7.0 0
302 38.0 1.0 3.0 138.0 175.0 0.0 0.0 173.0 0.0 0.0 1.0 ? 3.0 0
len(df)
303
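# Optional cross-check (not part of the original flow): count the '?' placeholders per
# column. Every column except 'ca' and 'thal' is numeric, so only those two can hold
# the string '?'.
df.isin(['?']).sum()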
df_no_missing = df.loc[(df['ca'] != '?')
                &
                (df['thal'] != '?')]
len(df_no_missing)
297
df_no_missing['ca'].unique()
array(['0.0', '3.0', '2.0', '1.0'], dtype=object)
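# Split the cleaned data into the feature matrix X and the target y ('hd').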
X = df_no_missing.drop('hd', axis=1).copy()
X.head()
age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal
0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0
1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0
2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0
3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0
4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0
y = df_no_missing['hd'].copy()
y.head()
0    0
1    2
2    1
3    0
4    0
Name: hd, dtype: int64
X.dtypes
age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
dtype: object
X['cp'].unique()
array([1., 4., 3., 2.])
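# 'cp' (chest pain type) takes the values 1-4, so it is categorical rather than numeric.
# One-hot encode it with get_dummies; the same treatment is applied below to the other
# categorical columns ('restecg', 'slope', 'thal').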
pd.get_dummies(X, columns=['cp']).head()
age sex restbp chol fbs restecg thalach exang oldpeak slope ca thal cp_1.0 cp_2.0 cp_3.0 cp_4.0
0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 1 0 0 0
1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 0 0 0 1
2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0 0 1 0
4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0 1 0 0
X_encoded = pd.get_dummies(X, columns=['cp',
                                       'restecg',
                                       'slope',
                                       'thal'])
X_encoded.head()
age sex restbp chol fbs thalach exang oldpeak ca cp_1.0 ... cp_4.0 restecg_0.0 restecg_1.0 restecg_2.0 slope_1.0 slope_2.0 slope_3.0 thal_3.0 thal_6.0 thal_7.0
0 63.0 1.0 145.0 233.0 1.0 150.0 0.0 2.3 0.0 1 ... 0 0 0 1 0 0 1 0 1 0
1 67.0 1.0 160.0 286.0 0.0 108.0 1.0 1.5 3.0 0 ... 1 0 0 1 0 1 0 1 0 0
2 67.0 1.0 120.0 229.0 0.0 129.0 1.0 2.6 2.0 0 ... 1 0 0 1 0 1 0 0 0 1
3 37.0 1.0 130.0 250.0 0.0 187.0 0.0 3.5 0.0 0 ... 0 1 0 0 0 0 1 1 0 0
4 41.0 0.0 130.0 204.0 0.0 172.0 0.0 1.4 0.0 0 ... 0 0 0 1 1 0 0 1 0 0

5 rows × 22 columns
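# 'hd' is coded 0-4: 0 means no heart disease and values above 0 indicate heart disease.
# Collapse everything above 0 to 1 so this becomes a binary classification problem.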

y_not_zero_index = y > 0
y[y_not_zero_index] = 1
y.unique()
array([0, 1], dtype=int64)
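# Split the one-hot encoded data into training and testing sets, then fit an unpruned
# decision tree as a baseline.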
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
labels = ["Does not have HD", "Has HD"]

ConfusionMatrixDisplay.from_estimator(clf_dt, X_test, y_test, display_labels=labels)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2b37f2bc940>
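# Cost complexity pruning: extract the candidate alpha values for this tree and drop
# the last one, which corresponds to pruning the tree all the way down to the root.
# Then fit one tree per alpha and compare training and testing accuracy.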
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas = ccp_alphas[:-1]

clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs alphas for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()
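# A single train/test split can be misleading, so use 5-fold cross-validation to see
# how much the accuracy for one candidate alpha varies across folds.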
clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=0.0016)
scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
cv_df = pd.DataFrame(data={'tree': range(5), 'accuracy': scores})  # new name so the raw df is not overwritten

cv_df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x2b309b39ac8>
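# Run 5-fold cross-validation for every candidate alpha and record the mean and
# standard deviation of the accuracy.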
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])

alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x2b30a43eda0>
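# The cross-validation plot suggests the best alpha lies between 0.014 and 0.015;
# pull out the corresponding row and keep its alpha value.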
alpha_results[(alpha_results['alpha'] > 0.014)
                &
                (alpha_results['alpha'] < 0.015)]
alpha mean_accuracy std
21 0.014225 0.738788 0.098832
ideal_ccp_alpha = alpha_results[(alpha_results['alpha'] > 0.014)
                &
                (alpha_results['alpha'] < 0.015)]['alpha']
ideal_ccp_alpha
21    0.014225
Name: alpha, dtype: float64
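# ideal_ccp_alpha is a one-element Series, so extract the scalar before passing it
# to DecisionTreeClassifier.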
clf_dt_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=float(ideal_ccp_alpha.iloc[0]))
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned,
                                      X_test,
                                      y_test,
                                      display_labels=labels)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2b309df0710>
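# Draw the final pruned tree.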
plt.figure(figsize=(15, 7.5))
tree.plot_tree(clf_dt_pruned,
               filled=True,
               rounded=True,
               class_names=["No HD", "Yes HD"],
               feature_names=X_encoded.columns)
[Text(418.5, 305.775, 'ca <= 0.5\ngini = 0.498\nsamples = 222\nvalue = [118, 104]\nclass = No HD'),
 Text(209.25, 101.92500000000001, 'gini = 0.382\nsamples = 132\nvalue = [98, 34]\nclass = No HD'),
 Text(627.75, 101.92500000000001, 'gini = 0.346\nsamples = 90\nvalue = [20, 70]\nclass = Yes HD')]
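# Optional follow-up (not in the original notebook): summarize the pruned tree on the
# held-out test set with overall accuracy and the raw confusion-matrix counts
# (confusion_matrix is imported above but otherwise unused).
print("Test accuracy:", clf_dt_pruned.score(X_test, y_test))
print(confusion_matrix(y_test, clf_dt_pruned.predict(X_test)))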