Hello all,

I am new to Python and have built a random forest regression model in Python.

However, I don't think my code is well optimized. Please review it and point out where I have deviated from best practices.

Overview of the data I have:

The data has response columns and predictor columns. There is also a column ‘TestOrTrainingDataRandom’ that marks each row as test or training data. (There are also columns such as index, Timestamp, etc. that have to be removed.)

The predictor columns start at ‘3000’ and end at ‘3680’ in steps of 5 (i.e. there are 137 predictor columns in total).

However, some predictor columns are missing, so the missing columns are filled in by interpolation.

-----------CODE-------------

```
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
# Column labels read from the CSV header.
RESPONSE_COLUMN = 'H2S'
SPLIT_COLUMN = 'TestOrTrainingDataRandom'

# Positions of the columns dropped before modelling, leaving only the
# predictor columns.  NOTE(review): these positions are tied to the layout
# of this particular CSV -- confirm them whenever the file format changes.
NON_PREDICTOR_POSITIONS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 118, 119, 120]


def fill_missing_columns(spectra, start=3000, step=5):
    """Return a copy of *spectra* with the gaps in its column grid filled.

    The column labels must be integers (or integer-like strings) lying on
    the grid ``start, start + step, start + 2 * step, ...``.  Every grid
    point between the first and the last column that is present but has no
    column of its own gets a new column, labelled ``str(wavelength)``,
    whose values are linearly interpolated between the nearest present
    column on each side.  For a single missing column this is the plain
    average of its two neighbours.

    Existing columns are passed through untouched; the result is ordered
    by ascending wavelength and keeps the row index of *spectra*.

    Raises:
        ValueError: if a label is not an integer, if the ``start`` column
            is absent (there is no left neighbour to interpolate from), or
            if a label does not lie on the grid.
    """
    # Map numeric wavelength -> original label so lookups keep working even
    # if a label is not exactly ``str(wavelength)`` (e.g. stray whitespace).
    label_of = {int(label): label for label in spectra.columns}
    present = sorted(label_of)

    if not present or present[0] != start:
        raise ValueError(
            "column %r is required as the first predictor column" % str(start))
    off_grid = [w for w in present if (w - start) % step]
    if off_grid:
        raise ValueError(
            "columns %r are not on the %d-step grid" % (off_grid, step))

    names = [label_of[start]]
    pieces = [spectra[label_of[start]]]
    # Walk consecutive *present* columns; anything between them is a gap.
    for low, high in zip(present, present[1:]):
        left = spectra[label_of[low]]
        right = spectra[label_of[high]]
        for wavelength in range(low + step, high, step):
            # float() keeps true division on Python 2 as well.
            weight = float(wavelength - low) / (high - low)
            names.append(str(wavelength))
            pieces.append(left + (right - left) * weight)
        names.append(label_of[high])
        pieces.append(right)

    return pd.concat(pieces, axis=1, keys=names)


def split_by_flag(frame, flags, train_flag=1):
    """Split *frame* row-wise into ``(train, test)``.

    Rows whose entry in *flags* equals *train_flag* go to the training
    part; every other row (including rows with a missing flag) goes to the
    test part.  *flags* is matched to *frame* by position, so both must
    have the same number of rows.
    """
    is_train = (flags == train_flag).values
    return frame[is_train], frame[~is_train]


def main(csv_path="combined_spectra_and_gas_params_3.csv"):
    """Fit a random forest on the training rows and report the test RMSE.

    Reads *csv_path*, keeps the predictor columns, fills missing predictor
    columns, splits the rows on ``TestOrTrainingDataRandom`` and fits a
    ``RandomForestRegressor`` to predict ``H2S``.  Prints and returns the
    root-mean-squared error on the test rows.
    """
    data = pd.read_csv(csv_path)

    predictors = data.drop(data.columns[NON_PREDICTOR_POSITIONS], axis=1)
    predictors = fill_missing_columns(predictors)

    # One boolean mask replaces the row-by-row copy into pre-allocated
    # frames and keeps the numeric dtypes intact.
    flags = data[SPLIT_COLUMN]
    train_x, test_x = split_by_flag(predictors, flags)
    train_y, actuals = split_by_flag(data[RESPONSE_COLUMN], flags)

    print("building model")
    # For classification use RandomForestClassifier instead.
    rf = RandomForestRegressor(n_estimators=500, max_features=15)
    # ``.values`` replaces ``as_matrix``, which newer pandas no longer has.
    rf.fit(train_x.values, train_y.values)
    results = rf.predict(test_x.values)

    rmse = sqrt(mean_squared_error(actuals.values, results))
    # Single-argument form so the output is the same on Python 2 and 3.
    print("RMSE :  %s" % rmse)
    return rmse


if __name__ == "__main__":
    main()
```