# Single XGBoost model in R (machine: Intel i5, 3 GB RAM, Windows 7). Leaderboard score: about 2487.
# XGBoost in R (local CV ~2485)
# Setup: working directory, packages and raw data ----

# Machine-specific project folder. Forward slashes are used because a
# backslash inside an R string starts an escape sequence ("\U" is not a valid
# one), so the Windows-style path cannot be written with single backslashes.
setwd("C:/Users/19501/Documents/AnalyticsVidhya/BFData")

library(readr)
library(sqldf)

# Both files are read relative to the working directory set above.
train <- read_csv("train.csv")
test <- read_csv("test.csv")

# Replace every missing cell with the sentinel value -999.
train[is.na(train)] <- -999
test[is.na(test)] <- -999
# Some Basic Feature Engineering ----
# All count features are computed from `train` only and then left-joined onto
# both the training and the test rows.

# Number of distinct products bought by each user.
prod_count <- sqldf(
  "select User_ID, count(distinct Product_ID) as Product_Count
   from train group by User_ID"
)

# Number of distinct users who bought each product.
cust_count <- sqldf(
  "select Product_ID, count(distinct User_ID) as User_Count
   from train group by Product_ID"
)

train_new <- sqldf(
  "select a.*, b.Product_Count
   from train a left join prod_count b on a.User_ID = b.User_ID"
)
test_new <- sqldf(
  "select a.*, b.Product_Count
   from test a left join prod_count b on a.User_ID = b.User_ID"
)

train_new2 <- sqldf(
  "select a.*, b.User_Count
   from train_new a left join cust_count b on a.Product_ID = b.Product_ID"
)
test_new2 <- sqldf(
  "select a.*, b.User_Count
   from test_new a left join cust_count b on a.Product_ID = b.Product_ID"
)

# Per-user number of distinct values in each product-category column, ignoring
# the -999 missing-value sentinel.
prod_cat1_count <- sqldf(
  "select User_ID, count(distinct Product_Category_1) as Product_Count_1
   from train where Product_Category_1 <> '-999' group by User_ID"
)
prod_cat2_count <- sqldf(
  "select User_ID, count(distinct Product_Category_2) as Product_Count_2
   from train where Product_Category_2 <> '-999' group by User_ID"
)
prod_cat3_count <- sqldf(
  "select User_ID, count(distinct Product_Category_3) as Product_Count_3
   from train where Product_Category_3 <> '-999' group by User_ID"
)

# NOTE(review): merge() defaults to an inner join, so a user missing from any
# one of the three tables is dropped from `new_feat_prod` and ends up with
# -999 for all three counts below. Confirm this is intended (all = TRUE would
# keep the partial counts).
new_feat_prod <- merge(prod_cat1_count, prod_cat2_count, by = "User_ID")
new_feat_prod <- merge(new_feat_prod, prod_cat3_count, by = "User_ID")

# The count columns are selected by name instead of `b.*` so that the result
# carries a single, unambiguous User_ID column.
train.new <- sqldf(
  "select a.*, b.Product_Count_1, b.Product_Count_2, b.Product_Count_3
   from train_new2 a left join new_feat_prod b on a.User_ID = b.User_ID"
)
test.new <- sqldf(
  "select a.*, b.Product_Count_1, b.Product_Count_2, b.Product_Count_3
   from test_new2 a left join new_feat_prod b on a.User_ID = b.User_ID"
)

# Rows without a match in the joins above get the same -999 sentinel.
train.new[is.na(train.new)] <- -999
test.new[is.na(test.new)] <- -999
# Columns fed to the model: the raw attributes plus the engineered count
# features. The order here is the column order of the model matrix.
feature.names <- c(
  "Product_ID",
  "Gender",
  "Age",
  "Occupation",
  "City_Category",
  "Stay_In_Current_City_Years",
  "Marital_Status",
  "Product_Category_1",
  "Product_Category_2",
  "Product_Category_3",
  "Product_Count",
  "User_Count",
  "User_ID",
  "Product_Count_1",
  "Product_Count_2",
  "Product_Count_3"
)
# Ordinal encoding of Age and Stay_In_Current_City_Years ----
#
# Every label is translated through a lookup table in one pass and the column
# is replaced by a numeric vector. Replacing the labels one value at a time
# inside a character column does not work: the number is stored as text, so
# writing 1 over "0" yields "1", which the following "1" -> 2 replacement then
# rewrites again, and so on down the chain.

# Upper bound of each age bracket (65 stands in for the open-ended "55+").
age_codes <- c(
  "0-17" = 17, "18-25" = 25, "26-35" = 35, "36-45" = 45,
  "46-50" = 50, "51-55" = 55, "55+" = 65
)

# Years in the current city, shifted by one; "4+" is encoded as 10.
stay_codes <- c("0" = 1, "1" = 2, "2" = 3, "3" = 4, "4+" = 10)

# Translate the labels in `x` into numbers.
#
# x       Vector of labels (coerced with as.character()).
# codes   Named numeric vector; names are labels, values are their codes.
# missing Code given to NA and to any label that has no entry in `codes`.
#
# Returns an unnamed numeric vector of the same length as `x`.
encode_ordinal <- function(x, codes, missing = -999) {
  encoded <- unname(codes[as.character(x)])
  encoded[is.na(encoded)] <- missing
  encoded
}

train.new$Age <- encode_ordinal(train.new$Age, age_codes)
train.new$Stay_In_Current_City_Years <-
  encode_ordinal(train.new$Stay_In_Current_City_Years, stay_codes)

test.new$Age <- encode_ordinal(test.new$Age, age_codes)
test.new$Stay_In_Current_City_Years <-
  encode_ordinal(test.new$Stay_In_Current_City_Years, stay_codes)
# Label-encode every feature that is still stored as text. The level set is
# the union of the training and test values, so a given label receives the
# same integer code in both data sets. Codes follow order of first appearance.
for (f in feature.names) {
  if (is.character(train.new[[f]])) {
    shared_levels <- unique(c(train.new[[f]], test.new[[f]]))
    train.new[[f]] <- as.integer(
      factor(train.new[[f]], levels = shared_levels)
    )
    test.new[[f]] <- as.integer(
      factor(test.new[[f]], levels = shared_levels)
    )
  }
}

# Keep only the model features. Note that `test` is re-bound here from the raw
# test data to the encoded feature table.
tra <- train.new[, feature.names]
test <- test.new[, feature.names]
# Custom evaluation metric for xgb.train(): root mean squared error.
#
# preds  Predictions for the rows of `dtrain`.
# dtrain Object from which getinfo(dtrain, "label") returns the true values.
#
# Returns list(metric = "RMSE", value = <root mean squared error>).
RMSE <- function(preds, dtrain) {
  actual <- as.numeric(getinfo(dtrain, "label"))
  predicted <- as.numeric(preds)
  list(metric = "RMSE", value = sqrt(mean((predicted - actual)^2)))
}
# XGBOOST ----
library(xgboost)

# Hold out 10,000 randomly chosen rows for validation; the fixed seed makes
# the split reproducible.
set.seed(100)
h <- sample(nrow(tra), 10000)
dval <- xgb.DMatrix(
  data = data.matrix(tra[h, ]),
  label = train.new$Purchase[h]
)
dtrain <- xgb.DMatrix(
  data = data.matrix(tra[-h, ]),
  label = train.new$Purchase[-h]
)

# The validation set is listed LAST on purpose: per the xgboost R
# documentation, early stopping tracks the last evaluation column unless a
# metric name is given. With the training set last, the monitored error would
# keep falling and early stopping would never trigger.
watchlist <- list(train = dtrain, val = dval)

param <- list(
  # NOTE(review): newer xgboost releases name this objective
  # "reg:squarederror"; confirm against the installed version.
  objective = "reg:linear",
  # booster = "gblinear",
  eta = 0.15,
  max_depth = 8,
  subsample = 0.7,
  colsample_bytree = 0.7,
  # NOTE(review): scale_pos_weight is documented for unbalanced
  # classification; presumably it has no useful effect for regression.
  scale_pos_weight = 0.8,
  min_child_weight = 10
)

clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 830,
  verbose = 1,
  # Stop when the validation RMSE has not improved for 100 rounds.
  early_stopping_rounds = 100,
  watchlist = watchlist,
  maximize = FALSE,
  feval = RMSE
)

# Score the test set and write the submission file to the working directory.
pred1 <- predict(clf, data.matrix(test[, feature.names]))
submission <- data.frame(
  User_ID = test.new$User_ID,
  Product_ID = test.new$Product_ID,
  Purchase = pred1
)
submission_adjust <- submission$Purchase
write_csv(submission, "XgbNew.csv")