Titanic — Machine Learning from Disaster — Data Cleaning

src: TitanicShip

Introduction

Data Cleaning

# Read the training and test CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
test = pd.read_csv(filepath_or_buffer='../datasets/test.csv')
Null values in the train and test sets
# Draw one null-value heatmap per dataset, side by side on a shared figure.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
for axis, frame, title in zip(ax, (train, test), ('Train data', 'Test data')):
    # isnull() yields a boolean mask, so missing cells show up as a
    # contrasting colour in the 'viridis' map.
    sns.heatmap(frame.isnull(), yticklabels=False, ax=axis, cbar=False, cmap='viridis')
    axis.set_title(title)
Heatmap to visualize the null values before cleaning
# Inspect how often each Embarked category occurs before imputing.
train.Embarked.value_counts()

from collections import Counter

# Fill missing Embarked values with the most frequent category. NaNs are
# dropped before counting so a missing value can never win the count.
most_common_embarked = Counter(train['Embarked'].dropna()).most_common(1)[0][0]
train['Embarked'] = train['Embarked'].fillna(most_common_embarked)
# Mean fare among training rows whose Pclass is 3.
class3_mean = train.loc[train['Pclass'] == 3, 'Fare'].mean()
# Substitute that mean wherever Fare is missing in the test set.
# NOTE(review): assumes the rows with a missing Fare are third class — confirm.
test['Fare'] = test['Fare'].fillna(class3_mean)
def impute_age(age_pclass):
    """Return a passenger's age, substituting a per-class default when missing.

    Args:
        age_pclass: Sequence whose first two items are ``Age`` and ``Pclass``,
            e.g. one row of ``df[['Age', 'Pclass']]`` as supplied by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original age when it is not null; otherwise 38 for class 1,
        30 for class 2, and 25 for any other class value.
    """
    # Unpack by position instead of using ``age_pclass[0]``: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    age, pclass = tuple(age_pclass)[:2]

    if not pd.isnull(age):
        return age

    # NOTE(review): presumably typical ages per class — confirm where these
    # constants come from.
    default_age_by_class = {1: 38, 2: 30}
    return default_age_by_class.get(pclass, 25)
# Run the custom impute_age function row by row over the Age/Pclass columns
# of both datasets, overwriting Age with the result.
for frame in (train, test):
    frame['Age'] = frame[['Age', 'Pclass']].apply(impute_age, axis=1)
# Turn Cabin into a 0/1 indicator in both datasets: 1 when a cabin value is
# recorded, 0 when it is missing. Building the flag straight from the null
# mask avoids writing integers into a text column and then replacing NaN.
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].notnull().astype(int)
Heatmap to visualize the null values after cleaning

Looks great! No more missing values :)

Feature Engineering

# Train: one-hot encode Sex and Embarked. drop_first=True drops the first
# category of each column, so one fewer dummy column is produced per feature.
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)

# Append the dummy columns, remove the original text columns, and give the
# dummies descriptive names.
train = (
    pd.concat([train, sex, embark], axis=1)
    .drop(['Sex', 'Embarked'], axis=1)
    .rename(columns={"male": "sex_male", "Q": "Embarked_Q", "S": "Embarked_S"})
)
# Repeat the same encoding steps for the test set

Model Preparation

# Columns left out of the feature matrix; 'Survived' is the target.
features_drop = ['PassengerId', 'Name', 'Ticket', 'Survived']
# Every remaining column, in its original order, is used as a feature.
selected_features = [
    column for column in train.columns if column not in features_drop
]
y_train = train['Survived']
X_train = train[selected_features]

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store