At work recently, I've had the chance to work on and learn about some interesting machine learning problems (around customer retention), and in my spare time I've started doing additional problems on Kaggle to get exposure to different datasets and modeling techniques.
I figured that blogging about it would force me to keep my code sane and legible. So to kick things off, here's my solution to the Titanic competition. The goal is to predict whether each passenger survived based on attributes such as gender, age, passenger class, port of embarkation, and a few others.
My model is pretty simple, and I certainly could have done a lot more feature engineering, but even so it clocked in with an accuracy of 80.4% (top 15% at the time of this post). My decently commented code is below, and there's a repo on GitHub too if you're into that sort of thing.
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
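# display settings for pandas and seaborn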
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
sns.set_style("darkgrid")
sns.set_context(rc={"figure.figsize": (6, 3)})
# load in datasets
dfs = {}
for name in ['train', 'test']:
    df = pd.read_csv('../data/%s.csv' % name)
    # add a column denoting source (train/test)
    df['data'] = name
    # add df to dfs dict
    dfs[name] = df
# basic info about columns in each dataset
for name, df in dfs.iteritems():
    print 'df: %s\n' % name
    print 'shape: %d rows, %d cols\n' % df.shape
    print 'column info:'
    for col in df.columns:
        print '* %s: %d nulls, %d unique vals, most common: %s' % (
            col,
            df[col].isnull().sum(),
            df[col].nunique(),
            df[col].value_counts().head(2).to_dict()
        )
    print '\n------\n'
df: test
shape: 418 rows, 12 cols
column info:
* PassengerId: 0 nulls, 418 unique vals, most common: {1128: 1, 1023: 1}
* Pclass: 0 nulls, 3 unique vals, most common: {1: 107, 3: 218}
* Name: 0 nulls, 418 unique vals, most common: {'Rosenbaum, Miss. Edith Louise': 1, 'Beauchamp, Mr. Henry James': 1}
* Sex: 0 nulls, 2 unique vals, most common: {'male': 266, 'female': 152}
* Age: 86 nulls, 79 unique vals, most common: {24.0: 17, 21.0: 17}
* SibSp: 0 nulls, 7 unique vals, most common: {0: 283, 1: 110}
* Parch: 0 nulls, 8 unique vals, most common: {0: 324, 1: 52}
* Ticket: 0 nulls, 363 unique vals, most common: {'PC 17608': 5, '113503': 4}
* Fare: 1 nulls, 169 unique vals, most common: {7.75: 21, 26.0: 19}
* Cabin: 327 nulls, 76 unique vals, most common: {'B57 B59 B63 B66': 3, 'C101': 2}
* Embarked: 0 nulls, 3 unique vals, most common: {'S': 270, 'C': 102}
* data: 0 nulls, 1 unique vals, most common: {'test': 418}
------
df: train
shape: 891 rows, 13 cols
column info:
* PassengerId: 0 nulls, 891 unique vals, most common: {891: 1, 293: 1}
* Survived: 0 nulls, 2 unique vals, most common: {0: 549, 1: 342}
* Pclass: 0 nulls, 3 unique vals, most common: {1: 216, 3: 491}
* Name: 0 nulls, 891 unique vals, most common: {'Graham, Mr. George Edward': 1, 'Elias, Mr. Tannous': 1}
* Sex: 0 nulls, 2 unique vals, most common: {'male': 577, 'female': 314}
* Age: 177 nulls, 88 unique vals, most common: {24.0: 30, 22.0: 27}
* SibSp: 0 nulls, 7 unique vals, most common: {0: 608, 1: 209}
* Parch: 0 nulls, 7 unique vals, most common: {0: 678, 1: 118}
* Ticket: 0 nulls, 681 unique vals, most common: {'CA. 2343': 7, '347082': 7}
* Fare: 0 nulls, 248 unique vals, most common: {13.0: 42, 8.0500000000000007: 43}
* Cabin: 687 nulls, 147 unique vals, most common: {'G6': 4, 'C23 C25 C27': 4}
* Embarked: 2 nulls, 3 unique vals, most common: {'S': 644, 'C': 168}
* data: 0 nulls, 1 unique vals, most common: {'train': 891}
------
# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
# lowercase column names
df.columns = map(str.lower, df.columns)
# reorder columns
new_col_order = ['data', 'passengerid', 'survived', 'age',
                 'cabin', 'embarked', 'fare', 'name', 'parch',
                 'pclass', 'sex', 'sibsp', 'ticket']
df = df[new_col_order]
df.head()
|   | data  | passengerid | survived | age | cabin | embarked | fare    | name                                              | parch | pclass | sex    | sibsp | ticket           |
|---|-------|-------------|----------|-----|-------|----------|---------|---------------------------------------------------|-------|--------|--------|-------|------------------|
| 0 | train | 1           | 0        | 22  | NaN   | S        | 7.2500  | Braund, Mr. Owen Harris                           | 0     | 3      | male   | 1     | A/5 21171        |
| 1 | train | 2           | 1        | 38  | C85   | C        | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0     | 1      | female | 1     | PC 17599         |
| 2 | train | 3           | 1        | 26  | NaN   | S        | 7.9250  | Heikkinen, Miss. Laina                            | 0     | 3      | female | 0     | STON/O2. 3101282 |
| 3 | train | 4           | 1        | 35  | C123  | S        | 53.1000 | Futrelle, Mrs. Jacques Heath (Lily May Peel)      | 0     | 1      | female | 1     | 113803           |
| 4 | train | 5           | 0        | 35  | NaN   | S        | 8.0500  | Allen, Mr. William Henry                          | 0     | 3      | male   | 0     | 373450           |
# convert sex to ints (male = 1, female = 0)
df['gender'] = df['sex'].apply(lambda x: 1 if x == 'male' else 0)
# extract title
df['title'] = df['name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
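# (e.g. 'Braund, Mr. Owen Harris' -> 'Mr')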
# fill missing age with mean by title
age_by_title = df.groupby('title')['age'].agg(np.mean).to_dict()
df['age'] = df.apply(lambda row: age_by_title.get(row['title'])
                     if pd.isnull(row['age']) else row['age'], axis=1)
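# (mean-by-title gives more sensible fills than a global mean: e.g. 'Master' marks young boys)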
# fill missing fare with median by pclass
# some people have a fare = 0, so only look at those > 0
fare_by_pclass = df[df['fare'] > 0].groupby('pclass')['fare'].agg(np.median).to_dict()
df['fare'] = df.apply(lambda r: r['fare'] if r['fare'] > 0
                      else fare_by_pclass.get(r['pclass']), axis=1)
# fill missing embarked with most common port
most_common_port = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(most_common_port)
# transform categorical embarked column to 1/0 dummy columns
dummies = pd.get_dummies(df['embarked'], prefix='port')
df = pd.concat([df, dummies], axis=1)
df.iloc[0]
data train
passengerid 1
survived 0
age 22
cabin NaN
embarked S
fare 7.25
name Braund, Mr. Owen Harris
parch 0
pclass 3
sex male
sibsp 1
ticket A/5 21171
gender 1
title Mr
port_C 0
port_Q 0
port_S 1
Name: 0, dtype: object
# drop unnecessary columns
drop_cols = ['cabin', 'name', 'sex', 'ticket', 'title', 'embarked']
df = df.drop(drop_cols, axis=1)
df.head()
|   | data  | passengerid | survived | age | fare    | parch | pclass | sibsp | gender | port_C | port_Q | port_S |
|---|-------|-------------|----------|-----|---------|-------|--------|-------|--------|--------|--------|--------|
| 0 | train | 1           | 0        | 22  | 7.2500  | 0     | 3      | 1     | 1      | 0      | 0      | 1      |
| 1 | train | 2           | 1        | 38  | 71.2833 | 0     | 1      | 1     | 0      | 1      | 0      | 0      |
| 2 | train | 3           | 1        | 26  | 7.9250  | 0     | 3      | 0     | 0      | 0      | 0      | 1      |
| 3 | train | 4           | 1        | 35  | 53.1000 | 0     | 1      | 1     | 0      | 0      | 0      | 1      |
| 4 | train | 5           | 0        | 35  | 8.0500  | 0     | 3      | 0     | 1      | 0      | 0      | 1      |
# survival rates across various columns
def plot_survive_rate(df, col, chart='barh'):
    survive_rates = df.groupby(col).agg({
        'survived': lambda x: sum(x) / len(x)
    })
    survive_rates.plot(kind=chart);

def bin_num(x, base=5):
    return int(x - (x % base))
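# (e.g. bin_num(23, base=10) -> 20, bin_num(87.5, base=25) -> 75)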
# get cleaned training data
dg = df[df['data'] == 'train'].copy()
# discretize age & fare
dg['age_bin'] = dg['age'].apply(lambda x: bin_num(x, 10))
dg['fare_bin'] = dg['fare'].apply(lambda x: bin_num(x, 25))
for col in ['pclass', 'gender', 'age_bin', 'fare_bin']:
    plot_survive_rate(dg, col)
# store the columns (features) to be used for classifying survival
x_cols = df.columns.values[3:]
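# (skips the first three columns: data, passengerid, survived)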
print '%d total features' % len(x_cols)
print x_cols
9 total features
['age' 'fare' 'parch' 'pclass' 'sibsp' 'gender' 'port_C' 'port_Q' 'port_S']
# prep training data for classifying
# (separating features and metric we're predicting)
df_train = df[df['data'] == 'train']
X = df_train[x_cols].as_matrix()
y = df_train['survived'].as_matrix()
print '%d rows, %d features' % (len(df_train), len(x_cols))
891 rows, 9 features
# cross validation to evaluate model
# (dividing training data into n chunks and training model n times
# with a different holdout chunk each time)
cv = StratifiedKFold(y, n_folds=6)
tot_correct, tot_obs = 0, 0
for train, test in cv:
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    model = RandomForestClassifier(random_state=321)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    correct, obs = (y_test == y_pred).sum(), len(y_test)
    tot_correct += correct
    tot_obs += obs
print 'accuracy: %f' % (tot_correct * 1.0 / tot_obs)
accuracy: 0.799102
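As an aside, scikit-learn can do this fold-and-score loop in a couple of lines via cross_val_score; here's a minimal sketch, assuming the same model and folds as above (note that the mean of per-fold accuracies can differ slightly from the pooled count when fold sizes are uneven):
from sklearn.cross_validation import cross_val_score
# mean accuracy across the same 6 stratified folds
scores = cross_val_score(RandomForestClassifier(random_state=321), X, y, cv=cv)
print 'accuracy: %f' % scores.mean()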
# evaluate model over entire training set
# and look at each feature's importance
model = RandomForestClassifier(random_state=321)
model.fit(X, y)
feature_rank = pd.Series(model.feature_importances_, index=x_cols).order(ascending=False)
feature_rank
fare 0.265518
age 0.264835
gender 0.259047
pclass 0.078497
sibsp 0.054447
parch 0.046165
port_S 0.011954
port_C 0.011904
port_Q 0.007634
dtype: float64
# model parameter tuning
# (a random forest has several tunable parameters; grid search automates
# trying combinations of them and picking the best-scoring values)
param_grid = {
"n_estimators": [100],
"criterion": ["gini", "entropy"],
'max_features': [0.5, 1.0, "sqrt"],
'max_depth': [4, 5, 6, 7, 8, None],
}
model = RandomForestClassifier(random_state=321)
grid_search = GridSearchCV(model, param_grid, cv=12, verbose=0)
grid_search.fit(X, y)
print grid_search.best_score_
print grid_search.best_params_
0.836139169473
{'max_features': 0.5, 'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 7}
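Side note: since GridSearchCV refits on the full training data by default, the winning model is already available as grid_search.best_estimator_, so re-specifying the parameters below is mostly for explicitness; a minimal sketch of the shortcut:
# equivalent to rebuilding the model with the best params and fitting on X, y
model = grid_search.best_estimator_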
# train model with best parameters from grid search
# and finally predict survival of people from test data
df_train = df[df['data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['survived'].as_matrix()
model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=0.5,
    max_depth=7,
    random_state=321,
)
model.fit(X_train, y_train)
df_test = df[df['data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()
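# survived became a float column in the combined df (NaNs in the test rows),
# so predictions come back as 0.0/1.0; cast back to ints for the submission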
y_pred = model.predict(X_test).astype(int)
df_test['survived'] = y_pred
final_df = df_test[['passengerid', 'survived']]
final_df.to_csv('../output/predicted.csv', index=False)
print 'boom.'
boom.