Titanic Solution

r
titanic_survival
randomforest

#1

Hi All,

Below is a step-by-step solution to the Titanic Survival challenge in R. I have used a random forest to predict passenger survival.

Reading Data

# Set the working directory; relative paths used later in the script
# (e.g. the submission file) resolve against it.
# NOTE(review): this path and the two below are machine-specific - adjust
# them before running on another machine.
setwd("C:\\Users\\manish\\Desktop\\RData")

# Load the training and test sets, keeping text columns as character vectors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
train <- read.csv(
  "C:\\Users\\manish\\Desktop\\Titanic\\train1.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)
test <- read.csv(
  "C:\\Users\\manish\\Desktop\\Titanic\\test.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)

Data Exploration

# Inspect the column types and a preview of the training data
str(train)

# Target variable: raw counts first, then the share of each outcome
table(train$Survived)
prop.table(table(train$Survived))

# The overall survival rate is 38.38%.
# Next, survival by Sex; margin = 1 makes every row (one sex) sum to 1
prop.table(table(train$Sex, train$Survived), margin = 1)

# Females survive far more often (74.20%) than males (18.89%).
# Now look at Age
summary(train$Age)

# Age has 177 missing values; they are imputed later in the script.
# Histogram of the age distribution
hist(train$Age, breaks = 20)

# The distribution differs for young children, older children and adults,
# so encode an age band:
#   0 = age up to 5, 1 = over 5 and up to 15, 2 = over 15 or age missing
train$Child <- ifelse(
  is.na(train$Age) | train$Age > 15,
  2,
  ifelse(train$Age > 5, 1, 0)
)
prop.table(table(train$Child, train$Survived), margin = 1)

# The age bands show a significant difference in survival rate.
# Next predictor to explore: Fare.
summary(train$Fare)
hist(train$Fare, breaks = 20)

# The fare distribution is skewed, so bin it into four bands.
# '30+' is the default; the narrower bands overwrite it where they apply.
train$Fare2 <- '30+'
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- '20-30'
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- '10-20'
train$Fare2[train$Fare < 10] <- '<10'

# Survival rate within each fare band (rows sum to 1)
prop.table(table(train$Fare2, train$Survived), 1)

# Same breakdown for passenger class
prop.table(table(train$Pclass, train$Survived), 1)

# The same kind of exploration can be repeated for the other variables.
# With that done, fit a basic classification tree.
library(rpart)

# Baseline tree using rpart's default control settings
fit <- rpart(
  formula = Survived ~ Pclass + Sex + Fare + Age + SibSp + Parch + Embarked,
  data = train,
  method = 'class'
)
plot(fit)
text(fit)

# Refit the same formula with explicit controls (minsplit = 2, cp = 0);
# this replaces the baseline tree stored in `fit`
fit <- rpart(
  formula = Survived ~ Pclass + Sex + Fare + Age + SibSp + Parch + Embarked,
  data = train,
  method = 'class',
  control = rpart.control(minsplit = 2, cp = 0)
)

Feature Engineering

# train already has the engineered Child and Fare2 columns. Build the same
# two columns on test so both frames can be stacked before the salutation
# is extracted from the names.

# Age band: 0 = age up to 5, 1 = over 5 and up to 15, 2 = over 15 or missing
test$Child <- ifelse(
  is.na(test$Age) | test$Age > 15,
  2,
  ifelse(test$Age > 5, 1, 0)
)

# Fare band; a missing fare ends up in the default '30+' band
test$Fare2 <- ifelse(
  is.na(test$Fare) | test$Fare >= 30,
  '30+',
  ifelse(
    test$Fare >= 20,
    '20-30',
    ifelse(test$Fare >= 10, '10-20', '<10')
  )
)

# rbind() on data frames needs the same set of columns on both sides, so
# give test a placeholder Survived column before stacking
test$Survived <- NA
combi <- rbind(train, test)


# Make sure Name is a character vector. This is a no-op when the data was
# read with stringsAsFactors = FALSE, but is needed if Name is a factor.
combi$Name <- as.character(combi$Name)

# Worked example on the first name: split on ',' and '.'
strsplit(combi$Name[1], split = '[,.]')
# strsplit() returns a list with one element per input string;
# [[1]] extracts the character vector of pieces for the first name
strsplit(combi$Name[1], split = '[,.]')[[1]]
# The second piece is the salutation (e.g. ' Mr', still with a leading space)
strsplit(combi$Name[1], split = '[,.]')[[1]][2]

# Apply the same logic to every row. vapply() guarantees exactly one string
# per name, and USE.NAMES = FALSE stops the full names being attached as
# element names of the Title column.
combi$Title <- vapply(
  combi$Name,
  function(x) strsplit(x, split = '[,.]')[[1]][2],
  character(1),
  USE.NAMES = FALSE
)
# Drop the leading space left over from the split
combi$Title <- sub(' ', '', combi$Title)
table(combi$Title)

# Merge equivalent or rare titles into a single level each
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combi$Title <- factor(combi$Title)

# Family size = the passenger + siblings/spouses + parents/children aboard
combi$FamilySize <- 1 + combi$SibSp + combi$Parch

# Two-level family category: sizes up to 2 are 'Small', everything else
# (including an unknown size) is 'Large'
combi$FamilyID <- ifelse(
  !is.na(combi$FamilySize) & combi$FamilySize <= 2,
  'Small',
  'Large'
)
prop.table(table(combi$FamilyID, combi$Survived), margin = 1)

# Age still contains missing values at this point
summary(combi$Age)

# Fill the gaps with a predictive model: a regression tree (method = 'anova')
# fitted on the rows where Age is known ...
Agefit <- rpart(
  formula = Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ],
  method = 'anova'
)
# ... and used to predict Age for the rows where it is missing
combi$Age[is.na(combi$Age)] <- predict(
  Agefit,
  newdata = combi[is.na(combi$Age), ]
)

# Embarked: blank strings mark a missing port of embarkation. Locate the rows
# instead of hard-coding their positions so this works on any copy of the data.
embarked_missing <- which(combi$Embarked == '')
embarked_missing
combi$Embarked[embarked_missing] <- "S" # replace with S, the most frequent port
combi$Embarked <- factor(combi$Embarked)

# Fare: fill missing values with the median of the known fares
summary(combi$Fare)
fare_missing <- which(is.na(combi$Fare))
fare_missing
combi$Fare[fare_missing] <- median(combi$Fare, na.rm = TRUE)

# Fare2 was binned while these fares were still NA, which left the rows in
# the default '30+' band. Re-bin them from the imputed fare so Fare and Fare2
# agree. Intervals are closed on the left: [.,10), [10,20), [20,30), [30,.)
if ("Fare2" %in% names(combi)) {
  combi$Fare2[fare_missing] <- as.character(cut(
    combi$Fare[fare_missing],
    breaks = c(-Inf, 10, 20, 30, Inf),
    labels = c('<10', '10-20', '20-30', '30+'),
    right = FALSE
  ))
}

combi$Sex <- as.factor(combi$Sex)

# Recode blank cabins as "0" and keep the recorded cabin otherwise.
# The column name is case-sensitive: combi$cabin does not exist (NULL),
# which made the original ifelse() fail for every non-blank cabin.
combi$Cabin <- ifelse(combi$Cabin == "", "0", combi$Cabin)

Modeling

# The engineered text columns are still character vectors; convert them to
# factors for the forest. Doing this before the split gives train and test
# the same factor levels.
combi$Fare2 <- factor(combi$Fare2)
combi$FamilyID <- factor(combi$FamilyID)

# Break the combined data set apart again. Rows from the test file are the
# ones whose Survived was filled with the NA placeholder before stacking.
is_train <- !is.na(combi$Survived)
train <- combi[is_train, ]
test <- combi[!is_train, ]

# cforest() and cforest_unbiased() come from the 'party' package, not from
# 'randomForest'. Install it only when it is missing.
if (!requireNamespace("party", quietly = TRUE)) {
  install.packages("party")
}
library(party)

# Fix the random seed so the forest is reproducible from run to run;
# otherwise each run would grow a different set of trees
set.seed(415)

# Conditional inference forest: 2000 trees, 3 candidate variables per split
fit <- cforest(
  as.factor(Survived) ~ Pclass + Sex + Age + Fare2 + Child + SibSp + Parch +
    Fare + Embarked + Title + FamilySize + FamilyID,
  data = train,
  controls = cforest_unbiased(ntree = 2000, mtry = 3)
)

Prediction <- predict(fit, newdata = test, OOB = TRUE)

# Write the submission file: one row per test passenger
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = 'randomforestintitanic1.csv', row.names = FALSE)

Hope this helps!

Regards,
Manish


#2

Thanks @Manish for sharing this; it will definitely help newcomers get off to a good start. I would also suggest adding some insights after the feature exploration.

Thx,
Manuel


#3

Manish, I can’t thank you enough for the article and forum post. Please continue to break down this method of using R to its fullest potential. I can program in R pretty well, but I can’t do the stats yet : /. I appreciate the tutorial very much.