Hi,

I have tried building a model on a weather dataset. When I build my models I get the error below, which is due to the presence of highly correlated variables. To work around the problem, I iteratively use the VIF function and keep removing the variables with a very high VIF. At the end I am left with very few variables, all of them with a VIF of less than 2, but I am still getting the error. Please advise how to deal with this.

R code is below

# Point the session at the folder that holds the data file.
# Straight double quotes are required by R, and forward slashes are used
# because "\U" inside an R string starts a Unicode escape, so the backslash
# form of this Windows path does not parse.
setwd("C:/Users/user/Desktop/Blogs")

# Load the weather data; the rest of the script refers to it as `mydata`.
mydata <- read.csv("weather.csv")

# Build `Final_Summary`: one row per column of `mydata` holding its class,
# max/min/sd/mean (numeric columns only, otherwise 0), the number of distinct
# non-missing values (categorical columns only, otherwise 0) and the number of
# missing values.
variable <- names(mydata)

# First class label of every column; taking element 1 keeps this a single
# string even for columns that carry several classes.
lists <- vapply(
  mydata,
  function(col) class(col)[1],
  character(1),
  USE.NAMES = FALSE
)

# is.numeric() is TRUE for both integer and double columns.
is_num <- vapply(mydata, is.numeric, logical(1), USE.NAMES = FALSE)

# Factor columns, plus character columns: read.csv() keeps text as character
# unless stringsAsFactors = TRUE, so checking for "factor" alone can miss
# every text column.
is_cat <- vapply(
  mydata,
  function(col) is.factor(col) || is.character(col),
  logical(1),
  USE.NAMES = FALSE
)

# Apply `stat` (called with na.rm = TRUE) to every numeric column; all other
# columns keep the placeholder value 0.
numeric_stat <- function(stat) {
  out <- numeric(length(variable))
  out[is_num] <- vapply(
    mydata[is_num],
    function(col) as.numeric(stat(col, na.rm = TRUE)),
    numeric(1),
    USE.NAMES = FALSE
  )
  out
}

mins <- numeric_stat(min)
maxs <- numeric_stat(max)
average <- numeric_stat(mean)
# Standard deviation: sd(), not mean(), so this column no longer duplicates
# `Average`.
SD <- numeric_stat(sd)

# Number of distinct non-missing values per categorical column. NAs are
# removed by subsetting because unique() has no na.rm argument.
u <- numeric(length(variable))
u[is_cat] <- vapply(
  mydata[is_cat],
  function(col) as.numeric(length(unique(col[!is.na(col)]))),
  numeric(1),
  USE.NAMES = FALSE
)

missingvalues <- colSums(is.na(mydata))

Final_Summary <- data.frame(
  Variable = variable,
  datatype = lists,
  Max = maxs,
  Min = mins,
  SD = SD,
  Average = average,
  Unique = u,
  Missing = missingvalues
)

#### Missing value imputation using kNN

# Install VIM only when it is not already available, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM", dependencies = TRUE)
}

library(VIM)

# Which variables have missing values
na_cols <- colnames(mydata)[colSums(is.na(mydata)) > 0]
na_cols

# Impute only the columns that actually contain NAs, using 5 neighbours.
mydata_imputed <- kNN(mydata, variable = na_cols, k = 5)

# Confirm that no missing values remain after imputation.
colSums(is.na(mydata_imputed))

# Keep only the original columns of `mydata`. Selecting by name replaces a
# hard-coded column count of 24, so the script still works if the input gains
# or loses a column. The columns dropped here are whatever kNN() appended
# (with default settings, presumably its imputation-indicator columns; see the
# VIM documentation for `imp_var`).
mydata_imputed <- mydata_imputed[, names(mydata), drop = FALSE]

str(mydata_imputed)

## Find variables with outliers

# Column names of all variables that are numeric
nums <- vapply(mydata_imputed, is.numeric, logical(1))

# Take the names from the full data frame and keep the subset a data frame
# (drop = FALSE), so a single numeric column does not collapse to a bare
# vector with no names.
y <- names(mydata_imputed)[nums]

mydata_numeric <- mydata_imputed[, nums, drop = FALSE]

length(y)

# Close the current graphics device, if any. A bare dev.off() raises an error
# when only the null device (device 1) is open.
if (dev.cur() > 1) {
  dev.off()
}

# Introduce NAs for outliers so they can be replaced with kNN predictions.
# An outlier is any value that boxplot() reports in its `out` component.
for (i in seq_along(y)) {
  col_name <- y[i]
  bp <- boxplot(
    mydata_numeric[[col_name]],
    main = col_name,
    col = i,
    horizontal = TRUE
  )
  is_outlier <- mydata_numeric[[col_name]] %in% bp$out
  mydata_numeric[is_outlier, col_name] <- NA
}

colSums(is.na(mydata_numeric))

# Imputing missing values
outlier_cols <- colnames(mydata_numeric)[colSums(is.na(mydata_numeric)) > 0]
outlier_cols

mydata_numeric <- kNN(mydata_numeric, variable = outlier_cols, k = 5)

colSums(is.na(mydata_numeric))

# Check for outliers again after imputation
for (i in seq_along(y)) {
  boxplot(
    mydata_numeric[[y[i]]],
    main = y[i],
    col = i,
    horizontal = TRUE
  )
}

# minimal outliers

# NOTE(review): the models fitted later in this script read `mydata_imputed`,
# not `mydata_numeric`, so the outlier-cleaned values produced here do not
# reach them unless they are copied back. Confirm whether that is intended.

# Building model
#
# Fits a sequence of logistic regressions of RainTomorrow on all remaining
# columns, dropping more columns (by position) at each step, and prints the
# variance inflation factors after every fit.

library(caret)
library(car)

set.seed(1234)

# 75/25 partition on the response.
Index <- createDataPartition(
  mydata_imputed$RainTomorrow,
  p = 0.75,
  list = FALSE
)

Train <- mydata_imputed[Index, ]
Test <- mydata_imputed[-Index, ]

# NOTE(review): `Train` and `Test` are created but every model below is still
# fitted on the full `mydata_imputed`. Switch `data =` to a subset of `Train`
# if out-of-sample evaluation on `Test` is intended.

# Column positions referenced by the index vectors below.
names(mydata_imputed)

# NOTE(review): none of the models below drops column 23. If this is the
# rattle `weather` data, column 23 is presumably RISK_MM (the amount of rain
# recorded on the following day), which determines RainTomorrow on its own.
# Such a predictor causes complete separation in glm(), a problem that VIF
# screening cannot detect. Confirm against the names printed above and drop
# the column if so.

# The family must be given in straight quotes; typographic quotes are a
# syntax error in R.
LogM1 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1, 2)],
  family = "binomial"
)
car::vif(LogM1)

LogM2 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 16:17, 20:21)],
  family = "binomial"
)
car::vif(LogM2)

LogM3 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 12, 15:17, 20:21)],
  family = "binomial"
)
car::vif(LogM3)

LogM4 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 12, 15:17, 20:22)],
  family = "binomial"
)
car::vif(LogM4)

LogM5 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 12, 13, 15:17, 20:22)],
  family = "binomial"
)
car::vif(LogM5)

LogM6 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 6:7, 12, 13, 15:17, 20:22)],
  family = "binomial"
)
car::vif(LogM6)

LogM7 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 6:7, 9, 12, 13, 15:17, 20:22)],
  family = "binomial"
)
car::vif(LogM7)

LogM8 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 6:7, 9, 12, 13, 15:17, 19:22)],
  family = "binomial"
)
car::vif(LogM8)

LogM9 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 6:7, 9, 12, 13, 15:18, 19:22)],
  family = "binomial"
)
car::vif(LogM9)

LogM10 <- glm(
  RainTomorrow ~ .,
  data = mydata_imputed[, -c(1:4, 6:7, 9, 12:18, 19:22)],
  family = "binomial"
)
car::vif(LogM10)