import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
Download the HCV dataset from the UCI Machine Learning Repository.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv')
df.head()
The laboratory measurements include bilirubin (BIL), choline esterase (CHE), γ-glutamyl-transferase (GGT), aspartate amino-transferase (AST), and alanine amino-transferase (ALT). The concentrations of tissue inhibitor of metalloproteinase 1 (TIMP1), N-terminal peptide of procollagen III (PIIINP), and hyaluronic acid (HA) were measured on the immunochemical analyzer ADVIA Centaur CP (Siemens).
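Several of the laboratory columns contain missing values that we will impute later, so it is worth checking them up front; a quick sanity-check cell:
# Count missing values per column; the imputation step below depends on these.
print(df.isna().sum())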
The target variable is Category. What are the possible values? I will drop the "0s=suspect Blood Donor" class.
df['Category'].value_counts()
df = df[df['Category'] != '0s=suspect Blood Donor']
Change the values in the Sex field to be 1 or 0 instead of "f" or "m".
df['Sex'] = np.where(df['Sex']=='f', 1, 0)
Plot the kernel density estimate of each feature for each of the classes.
X_cols = df.columns[2:]
fig, axs = plt.subplots(12, 1, figsize=(8,40))
for i, col in enumerate(X_cols):
    # Overlay the per-class KDE of this feature on one subplot.
    df.groupby('Category')[col].plot(kind='kde', ax=axs[i], title=col)
    axs[i].legend()
plt.show()
I will one-hot encode the target as a 4-dimensional vector. Note that pd.get_dummies sorts its columns alphabetically, so they follow the order Blood Donor, Hepatitis, Fibrosis, Cirrhosis.
y = pd.get_dummies(df['Category']).astype(int).values  # cast bool dummies to 0/1 ints
X = df[X_cols]
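Since the label_dict used later assumes exactly this alphabetical column order, a quick check that the one-hot columns line up as expected:
# The category strings sort as '0=Blood Donor' < '1=Hepatitis' < '2=Fibrosis' < '3=Cirrhosis',
# so column 0 is Blood Donor, column 1 Hepatitis, and so on.
print(pd.get_dummies(df['Category']).columns.tolist())
print(X.shape, y.shape)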
Split into training, validation, and test sets: 70% training, 10% validation, and 20% test. The first split holds out 20% for testing; the second takes 12.5% of the remaining 80% (i.e. 10% of the data) for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=100
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=100
)
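A quick check that the two chained splits produce the intended 70/10/20 proportions:
# Verify the split fractions.
n = len(X)
print(f'train: {len(X_train)/n:.2f}, val: {len(X_val)/n:.2f}, test: {len(X_test)/n:.2f}')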
Fill in missing feature values with the mean feature value of the training set
training_means = X_train.mean(axis=0).to_dict()
# Assign rather than fillna(inplace=True): the split frames can be copies/views,
# and in-place modification may trigger SettingWithCopyWarning.
X_train = X_train.fillna(training_means)
X_val = X_val.fillna(training_means)
X_test = X_test.fillna(training_means)
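After imputation no NaNs should remain in any of the three sets; a cheap assertion catches mistakes here early:
# All missing values should now be filled with training-set means.
for split in (X_train, X_val, X_test):
    assert not split.isna().any().any()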
Standardize the features using statistics computed on the training set, then apply the same transformation to the training, validation, and test sets.
# StandardScaler learns per-feature means and variances from the training set.
# (The Normalizer used originally would rescale each row to unit norm and learns
# nothing from the data, which is not what the step above intends.)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
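To confirm the scaler behaved as intended, the transformed training features should have means near 0 and standard deviations near 1 (the validation and test sets will deviate slightly, since they use the training statistics):
# transform() returns NumPy arrays, so use ndarray methods here.
print(X_train.mean(axis=0).round(2))
print(X_train.std(axis=0).round(2))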
Define, compile, and train a feed-forward neural network.
inputs = keras.Input(shape=(len(X_cols),), name='Inputs')
x = layers.Dense(10, activation='relu', name='Dense_1')(inputs)
x = layers.Dense(10, activation='relu', name='Dense_2')(x)
outputs = layers.Dense(4, activation='softmax', name='Outputs')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
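model.summary() is a convenient check that the architecture is what we intended: 12 inputs, two hidden layers of 10 units each, and a 4-way softmax output.
model.summary()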
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=256,
    validation_data=(X_val, y_val),
    verbose=0
)
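With only a few hundred training samples, 256 epochs can overfit. A minimal sketch of the same fit with early stopping on the validation loss (the patience value is an illustrative assumption, not a tuned choice):
# Stop when val_loss has not improved for 20 epochs and keep the best weights.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True
)
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=256,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
    verbose=0
)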
plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.legend()
plt.show()
Evaluate performance on test set
results = model.evaluate(X_test, y_test)
print(f'Test loss: {results[0]}. Test accuracy: {results[1]}.')
label_dict = {
    0: 'Blood Donor',
    1: 'Hepatitis',
    2: 'Fibrosis',
    3: 'Cirrhosis'
}
y_pred = np.argmax(model.predict(X_test), axis=1)
y_pred_labels = np.vectorize(label_dict.get)(y_pred)
y_test_labels = np.vectorize(label_dict.get)(np.argmax(y_test, axis=1))
print(classification_report(y_test_labels, y_pred_labels, labels=list(label_dict.values())))
Define and train a gradient-boosted decision tree (XGBoost).
gbt = xgb.XGBClassifier(
    objective='multi:softprob',  # softprob so predict_proba returns class probabilities
    n_estimators=100,
    eval_metric='mlogloss',  # multiclass log loss; recent xgboost takes this in the constructor
)
# xgboost expects a label-encoded target variable
y_train_le = np.argmax(y_train, axis=1)
y_val_le = np.argmax(y_val, axis=1)
y_test_le = np.argmax(y_test, axis=1)
gbt.fit(
    X_train, y_train_le,
    #eval_set=[(X_train, y_train_le), (X_val, y_val_le)],
    verbose=False
)
Evaluate performance on test set
print(f'Test accuracy: {gbt.score(X_test, y_test_le)}.')
Confusion matrix
y_pred = gbt.predict(X_test)
# Confusion matrix whose (i, j) entry counts the samples whose true label is
# the i-th class and whose predicted label is the j-th class.
confusion_matrix(np.argmax(y_test, axis=1), y_pred)
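The raw array is hard to read; sklearn's ConfusionMatrixDisplay can render the same matrix with class names (a presentation sketch, assuming sklearn >= 0.22 where ConfusionMatrixDisplay was added):
from sklearn.metrics import ConfusionMatrixDisplay
cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm,
                       display_labels=list(label_dict.values())).plot(xticks_rotation=45)
plt.show()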
Convert the predictions from label-encoded values back to text (reusing label_dict from above) and print the classification report. The XGBoost model performed better than the neural network.
y_pred_labels = np.vectorize(label_dict.get)(y_pred)
y_test_labels = np.vectorize(label_dict.get)(np.argmax(y_test, axis=1))
print(classification_report(y_test_labels, y_pred_labels, labels=list(label_dict.values())))
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    max_depth=5,
    random_state=100,
    n_jobs=-1
)
# Fit on the label-encoded target so predict() returns class indices directly.
# (Fitting on the one-hot matrix would train a multilabel model, whose per-output
# probabilities do not sum to 1 across the four classes.)
rf.fit(X_train, y_train_le)
y_pred_rf = rf.predict(X_test)
y_pred_rf_labels = np.vectorize(label_dict.get)(y_pred_rf)
print(classification_report(y_test_labels, y_pred_rf_labels, labels=list(label_dict.values())))
print(y_pred_rf.shape)
y_pred_rf_proba = rf.predict_proba(X_test)
print(y_pred_rf_proba.shape)
y_pred_rf_proba[:5, :]
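Tree ensembles also expose which features drive their predictions; a quick look at the random forest's impurity-based importances (a sketch using the standard feature_importances_ attribute):
# Impurity-based importances; biased toward high-cardinality features,
# but a useful first look at which lab values matter.
importances = pd.Series(rf.feature_importances_, index=X_cols).sort_values()
importances.plot(kind='barh', title='Random forest feature importances')
plt.show()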
Average the output probabilities of the three models to make the final predictions.
y_pred_gbt_proba = gbt.predict_proba(X_test)
y_pred_ffnn_proba = model.predict(X_test)
y_pred_rf_proba = rf.predict_proba(X_test)
y_pred_avg_proba = (y_pred_gbt_proba + y_pred_ffnn_proba + y_pred_rf_proba)/3
y_pred_avg = np.argmax(y_pred_avg_proba, axis=1)  # argmax over the *averaged* probabilities
y_pred_avg_labels = np.vectorize(label_dict.get)(y_pred_avg)
print(classification_report(y_test_labels, y_pred_avg_labels, labels=list(label_dict.values())))
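To see whether averaging actually helped, compare the test accuracy of each individual model against the ensemble (a small summary cell reusing the probabilities computed above):
from sklearn.metrics import accuracy_score
y_true = np.argmax(y_test, axis=1)
for name, proba in [('FFNN', y_pred_ffnn_proba), ('XGBoost', y_pred_gbt_proba),
                    ('Random forest', y_pred_rf_proba), ('Average', y_pred_avg_proba)]:
    print(f'{name}: {accuracy_score(y_true, np.argmax(proba, axis=1)):.3f}')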