Final Step to Missing Value imputation in R?

r
missing_values

#1

Hi All,

In my attempt to treat missing values, i referred to an older AV article, " Tutorial on 5 Powerful R Packages used for imputing missing values". The article is great and very helpful. However I am not being able to have a dataset with the imputed values alogwith the original dataset. How to do that. However I do not get a dataset with no missing values. How to get there.
I guess a step is missing.

Use of missForest

mydata=read.csc(weather.csv) ##[Link to download the file = https://www.biz.uiowa.edu/faculty/jledolter/DataMining/dataexercises.html]###
library(missForest)

mydata.imp=missForest::missForest(mydata[,-1])
$OOBerror
NRMSE PFC
0.007603105 0.383018377

However I do not get a dataset with no missing values. How to get there.


#2

try the another solution with k means clustering


#3

Thanks I am using library VIM to impute missing values through knn.

However it would still be interesting to know how to use missForest. If someone can help, it will be useful.


#4

Here is my code to impute missing values using knn. The missing values are imputed but 6 new variables created which I am deleting. Would anyone know why 6 new variables (variants of 6 existing variables) got created.

install.packages(“VIM”, dependencies = TRUE)
library(VIM)

#Which variables have missing values
colnames(mydata)[colSums(is.na(mydata)) > 0]
mydata_imputed=kNN(mydata,variable = colnames(mydata)[colSums(is.na(mydata)) > 0],k=5)
colSums(is.na(mydata_imputed))


#5

Hi,

I have tried building a model on weather dataset. As i build my models i get the below error which is due to the presence of highly correlated variables. In order to beat the problem I iteratively use the VIF function and keep removing the variables with very high VIF. At the end I am left with a very few variables and with VIF less than 2 for all. But still I am getting the error. Please advise how to deal with this.

R code is below

setwd(“C:\Users\user\Desktop\Blogs”)
mydata=read.csv(“weather.csv”)
variable=names(mydata)
lists=1
mins=1
maxs=1
average=1
SD=1
u=1
for (i in 1:length(variable)) {
x=variable[i]
y=mydata[,x]
lists[i]=class(y)
mins[i]=ifelse(class(y)==“numeric”|class(y)==“integer”,min(y,na.rm = TRUE),0)
maxs[i]=ifelse(class(y)==“numeric”|class(y)==“integer”,max(y,na.rm = TRUE),0)
average[i]=ifelse(class(y)==“numeric”|class(y)==“integer”,mean(y,na.rm = TRUE),0)
SD[i]=ifelse(class(y)==“numeric”|class(y)==“integer”,mean(y,na.rm = TRUE),0)
u[i]=ifelse(class(y)==“factor”,unique(y,na.rm = TRUE),0)
print(i)
}

missingvalues=colSums(is.na(mydata))

Final_Summary=data.frame(Variable=variable, datatype=lists, Max=maxs,Min=mins,SD=SD, Average=average,Unique=u, Missing=missingvalues)

####Missing value Imputation using kNN

install.packages(“VIM”, dependencies = TRUE)
library(VIM)

#Which variables have missing values
colnames(mydata)[colSums(is.na(mydata)) > 0]
mydata_imputed=kNN(mydata,variable = colnames(mydata)[colSums(is.na(mydata)) > 0],k=5)
colSums(is.na(mydata_imputed))

mydata_imputed=mydata_imputed[,1:24]
str(mydata_imputed)

##Find variables with outliers
#Column names of all variables that are numeric
nums <- unlist(lapply(mydata_imputed, is.numeric))
y=names(mydata_imputed[,nums])
mydata_numeric=mydata_imputed[,nums]
length(y)
dev.off()

#introduce NAs for outliers for replacing predicted values using knn
for (i in 1:length(y) ){
x=boxplot(mydata_numeric[,y[i]],main=y[i], col = i, horizontal = TRUE)
index=which(mydata_numeric[,y[i]]%in%x$out)
mydata_numeric[index,y[i]]=NA
rm(x)
}
colSums(is.na(mydata_numeric))

#imputing missing values
colnames(mydata_numeric)[colSums(is.na(mydata_numeric)) > 0]
mydata_numeric=kNN(mydata_numeric,variable = colnames(mydata_numeric)[colSums(is.na(mydata_numeric)) > 0],k=5)
colSums(is.na(mydata_numeric))

#Check for outliers
for (i in 1:length(y) ){
x=boxplot(mydata_numeric[,y[i]],main=y[i], col = i, horizontal = TRUE)
rm(x)
}
#minimal outliers

#Building model

library(caret)
set.seed(1234)
Index=createDataPartition(mydata_imputed$RainTomorrow,p=0.75,list = FALSE)
Train=mydata_imputed[Index,]
Test=mydata_imputed[-Index,]
unique(mydata_imputed)
LogM1=glm(RainTomorrow~., data = mydata_imputed[,-c(1,2)], family = “binomial”)

library(car)
car::vif(LogM1)

LogM2=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,16:17,20:21)], family = “binomial”)
names(mydata_imputed)
car::vif(LogM2)

LogM3=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,12,15:17,20:21)], family = “binomial”)
car::vif(LogM3)

LogM4=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,12,15:17,20:22)], family = “binomial”)
car::vif(LogM4)

LogM5=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,12,13,15:17,20:22)], family = “binomial”)
car::vif(LogM5)

LogM6=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,6:7,12,13,15:17,20:22)], family = “binomial”)
car::vif(LogM6)

LogM7=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,6:7,9,12,13,15:17,20:22)], family = “binomial”)
car::vif(LogM7)
names(mydata_imputed)

LogM8=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,6:7,9,12,13,15:17,19:22)], family = “binomial”)
car::vif(LogM8)
names(mydata_imputed)

LogM9=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,6:7,9,12,13,15:18,19:22)], family = “binomial”)
car::vif(LogM9)
names(mydata_imputed)

LogM10=glm(RainTomorrow~., data = mydata_imputed[,-c(1:4,6:7,9,12:18,19:22)], family = “binomial”)
car::vif(LogM10)
names(mydata_imputed)