In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

Download Dataset

Download HCV Dataset from UCI Machine Learning Repository

https://archive.ics.uci.edu/ml/datasets/HCV+data

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv')
In [3]:
df.head()
Out[3]:
Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL CHE CHOL CREA GGT PROT
0 1 0=Blood Donor 32 m 38.5 52.5 7.7 22.1 7.5 6.93 3.23 106.0 12.1 69.0
1 2 0=Blood Donor 32 m 38.5 70.3 18.0 24.7 3.9 11.17 4.80 74.0 15.6 76.5
2 3 0=Blood Donor 32 m 46.9 74.7 36.2 52.6 6.1 8.84 5.20 86.0 33.2 79.3
3 4 0=Blood Donor 32 m 43.2 52.0 30.6 22.6 18.9 7.33 4.74 80.0 33.8 75.7
4 5 0=Blood Donor 32 m 39.2 74.1 32.6 24.8 9.6 9.15 4.32 76.0 29.9 68.7

(BIL), choline esterase (CHE), γ-glutamyl-transferase (GGT), aspartate amino-transferase (AST), and alanine amino-transferase (ALT). The concentrations of tissue inhibitor of metalloproteinase 1 (TIMP1), N-terminal peptide of procollagen III (PIIINP), and hyaluronic acid (HA) were measured on the immunochemical analyzer ADVIA Centaur CP (Siemens).

Target variable is the Category. What are the possible values?

  • "0=Blood Donor"
  • "1=Hepatitis"
  • "2=Fibrosis"
  • "3=Cirrhosis"
  • "0s=suspect Blood Donor"

Data Preprocessing

I will not use the "0s=suspect Blood Donor" target.

In [4]:
df['Category'].value_counts()
Out[4]:
0=Blood Donor             533
3=Cirrhosis                30
1=Hepatitis                24
2=Fibrosis                 21
0s=suspect Blood Donor      7
Name: Category, dtype: int64
In [5]:
df = df[df['Category'] != '0s=suspect Blood Donor']

Change the values in the Sex field to be 1 or 0 instead of "f" or "m".

In [6]:
df['Sex'] = np.where(df['Sex']=='f', 1, 0)

Exploratory Data Analysis

Plot some of the feature distributions among the different classes

In [7]:
X_cols = df.columns[2:]
fig, axs = plt.subplots(12, 1, figsize=(8,40))
for i, col in enumerate(X_cols):
    df.groupby('Category')[col].plot(kind='kde', ax=axs[i], title=col)
plt.legend()
plt.show()

Train / Val / Test Split

I will make the target a 4-dimensional one-hot vector

In [8]:
y = pd.get_dummies(df['Category']).values
In [9]:
X = df[X_cols]

Split into training, validation, and test sets.

60% in training set, 10% in validation set, and 20% in test set.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=100
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=100
)

Fill in missing feature values with the mean feature value of the training set

In [11]:
training_means = X_train.mean(axis=0).to_dict()

X_train.fillna(training_means, inplace=True)
X_val.fillna(training_means, inplace=True)
X_test.fillna(training_means, inplace=True)
C:\Users\14072\anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)

Normalize the features based on the values in the training set. Apply transformation to training, validation, and test sets.

In [12]:
transformer = Normalizer().fit(X_train)
X_train = transformer.transform(X_train)
X_val = transformer.transform(X_val)
X_test = transformer.transform(X_test)

Neural Network

Define, compile, and train feed-forward neural network

In [13]:
inputs = keras.Input(shape=(len(X_cols),), name='Inputs')
x = layers.Dense(10, activation='relu', name='Dense_1')(inputs)
x = layers.Dense(10, activation='relu', name='Dense_2')(x)
outputs = layers.Dense(4, activation='softmax', name='Outputs')(x)
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=256,
    validation_data=(X_val, y_val),
    verbose=0
)

plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.legend()
plt.show()

Evaluate performance on test set

In [14]:
results = model.evaluate(X_test, y_test)
print(f'Test loss: {results[0]}. Test accuracy {results[1]}.')
1/4 [======>.......................] - ETA: 0s - loss: 0.2924 - accuracy: 0.9062WARNING:tensorflow:Callbacks method `on_test_batch_end` is slow compared to the batch time (batch time: 0.0000s vs `on_test_batch_end` time: 0.0010s). Check your callbacks.
4/4 [==============================] - 0s 1ms/step - loss: 0.2630 - accuracy: 0.9098
Test loss: 0.26302483677864075. Test accuracy 0.9098360538482666.
In [15]:
label_dict = {
    0: 'Blood Donor',
    1: 'Hepatitis',
    2: 'Fibrosis',
    3: 'Cirrhosis'
}

y_pred = np.argmax(model.predict(X_test), axis=1)
y_pred_labels = np.vectorize(label_dict.get)(y_pred)
y_test_labels = np.vectorize(label_dict.get)(np.argmax(y_test, axis=1))
print(classification_report(y_test_labels, y_pred_labels, list(label_dict.values())))
              precision    recall  f1-score   support

 Blood Donor       0.94      0.99      0.96       107
   Hepatitis       1.00      0.12      0.22         8
    Fibrosis       0.40      0.40      0.40         5
   Cirrhosis       0.67      1.00      0.80         2

    accuracy                           0.91       122
   macro avg       0.75      0.63      0.60       122
weighted avg       0.92      0.91      0.89       122

C:\Users\14072\anaconda3\lib\site-packages\sklearn\utils\validation.py:68: FutureWarning: Pass labels=['Blood Donor', 'Hepatitis', 'Fibrosis', 'Cirrhosis'] as keyword args. From version 0.25 passing these as positional arguments will result in an error
  warnings.warn("Pass {} as keyword args. From version 0.25 "

Gradient Boosted Decision Tree

Define and train a gradient boosted decision tree

In [16]:
gbt = xgb.XGBClassifier(
    objective='multi:softmax',
    n_estimators=100,
)
In [17]:
# xgboost expects a label encoded target variable
y_train_le = np.where(y_train==1)[1]
y_val_le = np.where(y_val==1)[1]
y_test_le = np.where(y_test==1)[1]

gbt.fit(
    X_train, y_train_le,
    #eval_set=[(X_train, y_train_le), (X_val, y_val_le)],
    eval_metric='logloss',
    verbose=False
)
Out[17]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Evaluate performance on test set

In [18]:
print(f'Test accuracy {gbt.score(X_test, y_test_le)}.')
Test accuracy 0.9426229508196722.

Confusion matrix

In [19]:
y_pred = gbt.predict(X_test)
# Confusion matrix whose i-th row and j-th column entry indicates the number of 
# samples with true label being i-th class and prediced label being j-th class.
confusion_matrix(np.argmax(y_test, axis=1), y_pred)
Out[19]:
array([[106,   0,   0,   1],
       [  1,   4,   3,   0],
       [  1,   0,   3,   1],
       [  0,   0,   0,   2]], dtype=int64)

Convert target from label-encoded back to text and print classification report. XGBoost model performed better.

In [20]:
label_dict = {
    0: 'Blood Donor',
    1: 'Hepatitis',
    2: 'Fibrosis',
    3: 'Cirrhosis'
}

y_pred_labels = np.vectorize(label_dict.get)(y_pred)
y_test_labels = np.vectorize(label_dict.get)(np.argmax(y_test, axis=1))
print(classification_report(y_test_labels, y_pred_labels, list(label_dict.values())))
              precision    recall  f1-score   support

 Blood Donor       0.98      0.99      0.99       107
   Hepatitis       1.00      0.50      0.67         8
    Fibrosis       0.50      0.60      0.55         5
   Cirrhosis       0.50      1.00      0.67         2

    accuracy                           0.94       122
   macro avg       0.75      0.77      0.72       122
weighted avg       0.96      0.94      0.94       122

Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    max_depth=5,
    random_state=100,
    n_jobs=-1
)


rf.fit(X_train, y_train)
Out[21]:
RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=10,
                       n_jobs=-1, random_state=100)
In [22]:
y_pred_rf = np.argmax(rf.predict(X_test), axis=1)
y_pred_rf_labels = np.vectorize(label_dict.get)(y_pred_rf)
print(classification_report(y_test_labels, y_pred_rf_labels, list(label_dict.values())))
              precision    recall  f1-score   support

 Blood Donor       0.92      1.00      0.96       107
   Hepatitis       1.00      0.25      0.40         8
    Fibrosis       1.00      0.20      0.33         5
   Cirrhosis       0.67      1.00      0.80         2

    accuracy                           0.92       122
   macro avg       0.90      0.61      0.62       122
weighted avg       0.93      0.92      0.89       122

C:\Users\14072\anaconda3\lib\site-packages\sklearn\utils\validation.py:68: FutureWarning: Pass labels=['Blood Donor', 'Hepatitis', 'Fibrosis', 'Cirrhosis'] as keyword args. From version 0.25 passing these as positional arguments will result in an error
  warnings.warn("Pass {} as keyword args. From version 0.25 "
In [23]:
print(y_pred_rf.shape)
y_pred_rf_proba = np.asarray(rf.predict_proba(X_test))[:, :, 1].T
print(y_pred_rf_proba.shape)
(122,)
(122, 4)
In [24]:
y_pred_rf_proba[:5, :]
Out[24]:
array([[0.54032738, 0.33392857, 0.05      , 0.07574405],
       [0.99061404, 0.0052193 , 0.00416667, 0.        ],
       [0.99795918, 0.00204082, 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        ],
       [0.99794521, 0.00205479, 0.        , 0.        ]])

Ensemble Model

Average the output probablilities of the three models to make final predictions.

In [25]:
y_pred_gbt_proba = gbt.predict_proba(X_test)
y_pred_ffnn_proba = model.predict(X_test)
y_pred_rf_proba = np.asarray(rf.predict_proba(X_test))[:, :, 1].T

y_pred_avg_proba = (y_pred_gbt_proba + y_pred_ffnn_proba + y_pred_rf_proba)/3
y_pred_avg = np.argmax(y_pred_gbt_proba, axis=1)

Evaluate Performance

In [26]:
y_pred_avg_labels = np.vectorize(label_dict.get)(y_pred_avg)
print(classification_report(y_test_labels, y_pred_avg_labels, list(label_dict.values())))
              precision    recall  f1-score   support

 Blood Donor       0.98      0.99      0.99       107
   Hepatitis       1.00      0.50      0.67         8
    Fibrosis       0.50      0.60      0.55         5
   Cirrhosis       0.50      1.00      0.67         2

    accuracy                           0.94       122
   macro avg       0.75      0.77      0.72       122
weighted avg       0.96      0.94      0.94       122

C:\Users\14072\anaconda3\lib\site-packages\sklearn\utils\validation.py:68: FutureWarning: Pass labels=['Blood Donor', 'Hepatitis', 'Fibrosis', 'Cirrhosis'] as keyword args. From version 0.25 passing these as positional arguments will result in an error
  warnings.warn("Pass {} as keyword args. From version 0.25 "