import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix was removed in scikit-learn 1.2
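# Load the Cleveland heart disease data; the raw file has no header row, so column names are assigned below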
df = pd.read_csv('processed.cleveland.data', header=None)
df.head()
df.columns = ['age',      # age in years
              'sex',      # 1 = male, 0 = female
              'cp',       # chest pain type
              'restbp',   # resting blood pressure (mm Hg)
              'chol',     # serum cholesterol (mg/dl)
              'fbs',      # fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
              'restecg',  # resting electrocardiographic results
              'thalach',  # maximum heart rate achieved
              'exang',    # exercise-induced angina (1 = yes, 0 = no)
              'oldpeak',  # ST depression induced by exercise relative to rest
              'slope',    # slope of the peak exercise ST segment
              'ca',       # number of major vessels counted by fluoroscopy
              'thal',     # thallium heart scan result
              'hd']       # diagnosis of heart disease (the target)
df.head()
df.dtypes
df['ca'].unique()
df['thal'].unique()
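# 'ca' and 'thal' were read in as object (string) columns because the raw file marks missing values with '?';
# count and inspect the rows that contain '?'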
len(df.loc[(df['ca'] == '?')
           |
           (df['thal'] == '?')])
df.loc[(df['ca'] == '?')
       |
       (df['thal'] == '?')]
len(df)
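# Keep only the rows where both 'ca' and 'thal' have real values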
df_no_missing = df.loc[(df['ca'] != '?')
                       &
                       (df['thal'] != '?')]
len(df_no_missing)
df_no_missing['ca'].unique()
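# Split the columns into the feature matrix X and the target y (the 'hd' column)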
X = df_no_missing.drop('hd', axis=1).copy()
X.head()
y = df_no_missing['hd'].copy()
y.head()
X.dtypes
X['cp'].unique()
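# 'cp', 'restecg', 'slope' and 'thal' are categorical, so one-hot encode them with get_dummies();
# first a quick demonstration on 'cp' alone, then the full encoding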
pd.get_dummies(X, columns=['cp']).head()
X_encoded = pd.get_dummies(X, columns=['cp',
                                       'restecg',
                                       'slope',
                                       'thal'])
X_encoded.head()
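# 'hd' is coded 0 (no heart disease) through 4; collapse values 1-4 into a single "has heart disease" class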
y_not_zero_index = y > 0
y[y_not_zero_index] = 1
y.unique()
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)
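# Build a preliminary decision tree without any pruning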
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
labels = ["Does not have HD", "Has HD"]
ConfusionMatrixDisplay.from_estimator(clf_dt, X_test, y_test, display_labels=labels)
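# The full tree is likely to overfit the training data, so use cost complexity pruning:
# extract the candidate alpha values for this tree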
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas = ccp_alphas[:-1]  # drop the largest alpha, which would prune the tree down to a single root node
clf_dts = []  # one tree per candidate alpha
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
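# Score each candidate tree on the training and testing sets and plot accuracy against alpha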
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()
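# The ideal alpha depends on how the data happened to be split, so use 5-fold cross validation:
# first show, for a single candidate alpha, how much the accuracy varies from fold to fold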
clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=0.0016)
scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
df = pd.DataFrame(data={'tree': range(5), 'accuracy': scores})
df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
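# Now run 5-fold cross validation for every candidate alpha and record the mean and standard deviation of the accuracy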
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
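# Collect the results in a DataFrame and plot mean accuracy (with standard-deviation error bars) against alpha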
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
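# The best mean accuracy occurs for an alpha between 0.014 and 0.015; pull that value out of the table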
alpha_results[(alpha_results['alpha'] > 0.014)
              &
              (alpha_results['alpha'] < 0.015)]
ideal_ccp_alpha = alpha_results[(alpha_results['alpha'] > 0.014)
                                &
                                (alpha_results['alpha'] < 0.015)]['alpha']
ideal_ccp_alpha = float(ideal_ccp_alpha.iloc[0])  # convert the single-value Series to a plain float
ideal_ccp_alpha
# Build and train the final tree, pruned with the ideal alpha
clf_dt_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned,
                                      X_test,
                                      y_test,
                                      display_labels=labels)
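# Finally, draw the pruned decision tree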
plt.figure(figsize=(15, 7.5))
tree.plot_tree(clf_dt_pruned,
               filled=True,
               rounded=True,
               class_names=["No HD", "Yes HD"],
               feature_names=X_encoded.columns)