H2o.glm for regression problem


#1

I am trying to run the code below. It executes successfully up to h2o.performance(regressionmodel), but I get an error when I call h2o.predict(). Kindly help me out here. The error message is:

Error: DistributedException from localhost/127.0.0.1:54321, caused by java.lang.ArrayIndexOutOfBoundsException: 60

# Load the training and test sets; each file is picked interactively.
# NOTE: no rm(list = ls()) here -- wiping the global environment from inside
# a script destroys unrelated objects in the caller's session.
train <- read.csv(file.choose())
test <- read.csv(file.choose())

# Give the test frame a placeholder response so that it has a `y` column and
# can be stacked under `train` (presumably the test file ships without `y` --
# confirm). Stacking lets both sets be encoded together further down.
test$y <- 1
combi <- rbind(train, test)
head(combi)
str(combi)

names(train)
summary(train)

# Categorical attributes

# Names of the columns treated as categorical. Plain ASCII double quotes are
# required: typographic (curly) quotes are not string delimiters in R.
cat_attr <- c("X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8")

# Categorical columns of the stacked train + test data.
cat_data <- combi[, cat_attr]
names(cat_data)
head(cat_data)

# Numerical attributes

# Every column of `train` that is not listed in `cat_attr` is handled as
# numeric; the matching columns are pulled from the stacked data.
num_attr <- setdiff(
  names(train),
  cat_attr
)
num_data <- combi[, num_attr]

# Quick look at the numeric part.
head(num_data)
str(num_data)

# Separate categorical variables from binary/numeric ones and convert each
# group to the appropriate type.
#
# lapply() is used instead of sapply(): sapply() simplifies its result to a
# matrix, which turns factors into plain character data and collapses a
# one-row frame into a vector. lapply() always returns one element per
# column, so the column types survive.
cat_data <- as.data.frame(lapply(cat_data, as.factor))
num_data <- as.data.frame(lapply(num_data, as.numeric))

# Recombine and count missing values in the result.
finaldata <- cbind(cat_data, num_data)
sum(is.na(finaldata))

# Dummy variables
#
# Expand each categorical column into indicator columns. The column list is
# taken from `cat_attr` rather than repeated, so the two cannot drift apart.

library(dummies)
finaldata1 <- dummy.data.frame(data = finaldata, names = cat_attr)

# Force every column to numeric. lapply() keeps one element per column;
# sapply() would simplify to a matrix (or a vector for a one-row frame).
finaldata1 <- as.data.frame(lapply(finaldata1, as.numeric))

names(finaldata1)
str(finaldata1)

# Drop the row identifier; it is not a predictor.
finaldata1 <- finaldata1[, setdiff(names(finaldata1), "ID")]

# Undo the earlier rbind(train, test): the first nrow(train) rows are the
# training set and the rows after them are the test set. Taking rows
# 1:nrow(test) for the test set would silently reuse training rows.
n_train <- nrow(train)
n_test <- nrow(test)
train_a <- finaldata1[seq_len(n_train), ]
test_a <- finaldata1[n_train + seq_len(n_test), ]

# Remove the placeholder response from the test set.
test_a <- test_a[, setdiff(names(test_a), "y")]

# Install h2o only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("h2o", quietly = TRUE)) {
  install.packages("h2o")
}
library(h2o)

# Start (or connect to) a local H2O instance; nthreads = -1 asks H2O to use
# all available CPUs.
localh2o <- h2o.init(nthreads = -1)

# Upload both data frames to the H2O cluster.
trainh2o <- as.h2o(train_a)
testh2o <- as.h2o(test_a)

names(trainh2o)

# Gaussian GLM (linear regression) of `y` on every other column.
regressionmodel <- h2o.glm(
  y = "y",
  x = setdiff(names(trainh2o), "y"),
  training_frame = trainh2o,
  family = "gaussian"
)

# Training metrics, then predictions for the test frame.
h2o.performance(regressionmodel)
predict.reg <- h2o.predict(regressionmodel, newdata = testh2o)


#2

Hi @sirishan, just check whether the column names are the same in ‘trainh2o’ and ‘testh2o’.