Load logistic regression, numpy, and cross validation train/test split functions.

In [122]:
from __future__ import division

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.preprocessing import scale

Return to the Wisconsin breast cancer data. Clean it up as we did before.

In [123]:
dimensions = ['mean', 'se', 'worst']
attributes = ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness',
              'concavity', 'concave_points', 'symmetry', 'fractal_dimension']

attribute_names = ['{}-{}'.format(x, y) for x in attributes for y in dimensions]

cell_data_filepath = 'https://s3-us-west-2.amazonaws.com/ga-dat-2015-suneel/datasets/breast-cancer.csv'
col_names = ['id', 'diagnosis'] + attribute_names
cell_df = pd.read_csv(cell_data_filepath, header=None, names=col_names)
cell_df.head()

Unnamed: 0,id,diagnosis,radius-mean,radius-se,radius-worst,texture-mean,texture-se,texture-worst,perimeter-mean,perimeter-se,...,concavity-worst,concave_points-mean,concave_points-se,concave_points-worst,symmetry-mean,symmetry-se,symmetry-worst,fractal_dimension-mean,fractal_dimension-se,fractal_dimension-worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [124]:
# pull features
features_df = cell_df[attribute_names]
# pull target and view split
target_df = cell_df['diagnosis']
target_df.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

Split into 66% training set and 33% testing set

In [125]:
#metrics_pct = np.array(bcw.metrics_pct.values)
#metrics_pct = metrics_pct[:, np.newaxis]
X_train, X_test, Y_train, Y_test = train_test_split(features_df, target_df, test_size=0.33, random_state=5)

Fit the logistic regression on the training data

In [126]:
logreg = LogisticRegression(random_state=5)
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

Look at the confusion matrix

In [127]:
from sklearn.metrics import confusion_matrix
# the input format is almost always (test, predict), but always check with documentation!
conmat = np.array(confusion_matrix(Y_test, Y_pred))

confusion = pd.DataFrame(conmat, index=['is_healthy', 'has_cancer'],
                         columns=['predicted_healthy', 'predicted_cancer'])

print(confusion)

            predicted_healthy  predicted_cancer
is_healthy                121                 1
has_cancer                  2                64


Calculate true positives, false positives, true negatives, and false negatives from the confusion matrix

In [128]:
TP = confusion.ix['has_cancer', 'predicted_cancer']
FP = confusion.ix['is_healthy', 'predicted_cancer']
TN = confusion.ix['is_healthy', 'predicted_healthy']
FN = confusion.ix['has_cancer', 'predicted_healthy']

print(zip(['True Positives','False Positives','True Negatives','False Negatives'],
          [TP, FP, TN, FN]))

[('True Positives', 64), ('False Positives', 1), ('True Negatives', 121), ('False Negatives', 2)]


Calculate the accuracy with the accuracy_score() function from sklearn

In [129]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(Y_test, Y_pred)
print(acc)

0.984042553191


Show that the accuracy is equivalent to: True Positives + True Negatives / Total

In [130]:
print((TP + TN) / float(len(Y_test)))

0.984042553191


Create the classification report with the classification_report() function

In [131]:
from sklearn.metrics import classification_report

cls_rep = classification_report(Y_test, Y_pred)
print(cls_rep)

             precision    recall  f1-score   support

          B       0.98      0.99      0.99       122
          M       0.98      0.97      0.98        66

avg / total       0.98      0.98      0.98       188



Show that the precision (for 1 vs 0) is equivalent to: True Positives / (True Positives + False Positives)

In [132]:
# 0 vs. 1
print(float(TN) / (TN + FN))

# 1 vs. 0
print(float(TP) / (TP + FP))

0.983739837398
0.984615384615


Show that the recall (for 1 vs 0) is equivalent to: True Positives / (True Positives + False Negatives)

In [133]:
# 0 vs. 1
print(float(TN) / (TN + FP))

# 1 vs. 0
print(float(TP) / (TP + FN))

0.991803278689
0.969696969697


Show that the F1-score is equivalent to: 2 * (Precision * Recall) / (Precision + Recall)

In [134]:
# 0 vs. 1
neg_precision = float(TN) / (TN + FN)
neg_recall = float(TN) / (TN + FP)
print(2. * (neg_precision * neg_recall) / (neg_precision + neg_recall))

# 1 vs. 0
pos_precision = float(TP) / (TP + FP)
pos_recall = float(TP) / (TP + FN)
print(2. * (pos_precision * pos_recall) / (pos_precision + pos_recall))

0.987755102041
0.977099236641
