Connecting predicted values with test values

Good day everyone. I am currently trying to predict class labels for test data. The problem is converting the predicted labels back to their original forms (Good, Poor, Neutral). Before applying a classification algorithm, I encoded my features with a LabelEncoder. How do I achieve this? Please find my source code below.

# Load the full dataset plus the pre-split train/test sheets.
# (The original paste used curly quotes, which are a SyntaxError in Python.)
total = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_cat.xlsx')
train = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_train.xlsx')
# NOTE(review): drop() is not in-place by default, so this call discards its
# result and 'support' survives here. Left as a no-op on purpose: the column
# is actually removed later when building `training`/`testing`.
train.drop(['support'], axis=1)

test = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_test.xlsx')
test.drop(['support'], axis=1)

Preprocessing

Mean imputation

Label encoding of categorical variables

# Columns to label-encode. The test frame gets a dummy target so both
# frames can be stacked and encoded with a shared vocabulary.
col = ['sequence', 'cat']
test['cat'] = 0
# DataFrame.append is deprecated/removed in modern pandas; concat is equivalent.
combi = pd.concat([train, test])

# NOTE(review): this is an alias, not a copy — de_combi mutates together with
# combi below. Use combi.copy() if an un-encoded backup was intended.
de_combi = combi
number = LabelEncoder()
for i in col:
    # Cast to str first so mixed/NaN values encode without raising.
    combi[i] = number.fit_transform(combi[i].astype('str'))
    combi[i] = combi[i].astype('int')

# Split back into the original train/test partitions by row count.
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]
# Rebind instead of inplace-dropping on a slice of combi, which would
# trigger SettingWithCopyWarning.
test = test.drop('cat', axis=1)

# Build the model-ready frames: features without the id-like 'support'
# column, and the encoded target pulled out of the training features.
training = train.drop(['support'], axis=1)
testing = test.drop(['support'], axis=1)
y_train = training['cat']
training.drop('cat', axis=1, inplace=True)
print("training set is \n", training)
print("testing set is \n", testing)
print("y train is: \n", y_train)

features = training.columns
target = 'cat'
X_train, X_test = training, testing

print("Starting with different supervised learning algorithm, let us check which algorithm gives us the best results.")
print("==================================================================================================================")

from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
#from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Candidate regressors for the baseline comparison; everything except KNN
# is commented out after earlier runs.
model_factory = [
    KNeighborsRegressor(),
    # RandomForestRegressor(),
    # XGBRegressor(nthread=1),
    # MLPRegressor(),
    # Ridge(),
    # BayesianRidge(),
    # ExtraTreesRegressor(),
    # ElasticNet(),
    # GradientBoostingRegressor(),
]

# Baseline: k-fold cross-validated RMSE for each candidate model.
for model in model_factory:
    model.seed = 42
    num_folds = 3

    scores = cross_val_score(model, X_train, y_train, cv=num_folds,
                             scoring='neg_mean_squared_error')
    # scores are negative MSE; negate before the square root to get RMSE.
    score_description = " %0.2f (+/- %0.2f)" % (np.sqrt(scores.mean() * -1), scores.std() * 2)

    # The paste lost the dunder underscores: model.class.name is really
    # model.__class__.__name__. The fold count is interpolated instead of
    # the hard-coded (and wrong) "CV-5".
    print('{model:25} CV-{folds} RMSE: {score}'.format(
        model=model.__class__.__name__,
        folds=num_folds,
        score=score_description))

print(" implement Pseudo-labelling, for this purpose I will be using test data as the unlabelled data.")
print("===============================================================================================")

from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin

class PseudoLabeler(BaseEstimator, RegressorMixin):
    """Sklearn-compatible regressor that augments its training set with
    pseudo-labels predicted on unlabelled data before fitting.

    Parameters
    ----------
    model : regressor implementing fit/predict
    unlabled_data : DataFrame of unlabelled rows (name kept for
        backward compatibility with existing callers)
    features : list of feature column names
    target : name of the target column
    sample_rate : fraction of the unlabelled data to pseudo-label (0..1)
    seed : random seed propagated to the wrapped model and sampling
    """

    # NOTE(review): the original paste read "def init" — markdown stripped
    # the double underscores from __init__ (and from __create_augmented_train,
    # predict's indentation, etc.).
    def __init__(self, model, unlabled_data, features, target, sample_rate=0.2, seed=42):
        # Validate with an exception rather than assert (asserts vanish
        # under python -O); also check the lower bound the message promises.
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError('Sample_rate should be between 0.0 and 1.0.')
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        self.model.seed = seed
        self.unlabled_data = unlabled_data
        self.features = features
        self.target = target

    def get_params(self, deep=True):
        # Explicit params so sklearn's clone()/GridSearchCV can rebuild us.
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "unlabled_data": self.unlabled_data,
            "features": self.features,
            "target": self.target,
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        """Fit the wrapped model on X/y augmented with pseudo-labelled rows."""
        augemented_train = self.__create_augmented_train(X, y)
        self.model.fit(
            augemented_train[self.features],
            augemented_train[self.target],
        )
        return self

    def __create_augmented_train(self, X, y):
        """Return shuffle(X+y plus a pseudo-labelled sample of the unlabelled data)."""
        num_of_samples = int(len(self.unlabled_data) * self.sample_rate)
        # Train the model and create the pseudo-labels.
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.unlabled_data[self.features])
        # Attach the pseudo-labels to a copy of the unlabelled set.
        pseudo_data = self.unlabled_data.copy(deep=True)
        pseudo_data[self.target] = pseudo_labels
        # Sample deterministically — the original used an unseeded sample()
        # despite exposing a seed parameter.
        sampled_pseudo_data = pseudo_data.sample(n=num_of_samples, random_state=self.seed)
        temp_train = pd.concat([X, y], axis=1)
        augemented_train = pd.concat([sampled_pseudo_data, temp_train])
        return shuffle(augemented_train)

    def predict(self, X):
        """Predict with the wrapped (already fitted) model."""
        return self.model.predict(X)

    def get_model_name(self):
        """Class name of the wrapped model, for report labels."""
        return self.model.__class__.__name__

# Compare the plain KNN baseline against the pseudo-labelled variant.
model_factory = [
    KNeighborsRegressor(),
    # XGBRegressor(nthread=1),
    PseudoLabeler(
        KNeighborsRegressor(),
        # XGBRegressor(nthread=1),
        test,
        features,
        target,
        sample_rate=0.3,
    ),
]

# Evaluate baseline vs. pseudo-labelled model with 8-fold CV.
for model in model_factory:
    model.seed = 42
    num_folds = 8

    scores = cross_val_score(model, X_train, y_train, cv=num_folds,
                             scoring='neg_mean_squared_error', n_jobs=8)
    # sqrt of the (negated) mean squared error is RMSE — the original
    # label said "MSE" for a root-mean-squared value.
    score_description = "RMSE: %0.4f (+/- %0.4f)" % (np.sqrt(scores.mean() * -1), scores.std() * 2)
    print('{model:25} CV-{num_folds} {score_cv}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score_cv=score_description,
    ))
    print()

# print("added labels are: \n", ps.predict(testing))

# PseudoLabeler indexes unlabled_data[features] by COLUMN NAME, so features
# must be a list of labels — the original passed testing['sequence'] (a
# Series of values), which would mis-index the frame.
features = ['sequence']
ps = PseudoLabeler(model=KNeighborsRegressor(), unlabled_data=test, features=features, target='cat')
print(test, ps)
# print("ps is: \n", ps)

# ps.fit(X_train[features], y_train); ps.predict(testing[features])

@oluwande,
I suppose you are looking for this.

labelencoder.inverse_transform()

You can get the original class labels back from the encoded values.
Eg:

from sklearn.preprocessing import LabelEncoder

# Fit assigns integer codes alphabetically: Good->0, Neutral->1, Poor->2.
encoder = LabelEncoder()
encoder.fit_transform(["Good", "Poor", "Neutral"])
# array([0, 2, 1])

# inverse_transform maps the integer codes back to the original strings.
print(encoder.inverse_transform([0, 1, 2]))
# ['Good' 'Neutral' 'Poor']
© Copyright 2013-2020 Analytics Vidhya