Good day everyone. I am currently trying to predict class labels for test data. The problem is converting the predicted labels back to their original form (Good, Poor, Neutral). Before applying a classification algorithm, I encoded my features with LabelEncoder. How do I achieve this? Please find my source code below.
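For context, here is my mental model of the round trip I am after, as a toy sketch using scikit-learn's LabelEncoder (the values are made up):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
codes = le.fit_transform(['Good', 'Poor', 'Neutral', 'Good'])
print(le.classes_)                  # ['Good' 'Neutral' 'Poor'] (sorted alphabetically)
print(codes)                        # [0 2 1 0]
print(le.inverse_transform(codes))  # ['Good' 'Poor' 'Neutral' 'Good']

What I cannot figure out is where this inverse step fits in my pipeline below.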
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

total = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_cat.xlsx')
train = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_train.xlsx')
test = pd.read_excel('C:/Users/user/Desktop/my datasets/participant 1/participant 1_sequences_test.xlsx')
# note: drop() returns a copy, so a bare train.drop(['support'], axis=1) does
# nothing; the 'support' column is actually dropped further down
# preprocessing:
# - mean imputation
# - label encoding of the categorical variables
col = ['sequence', 'cat']
test['cat'] = 0  # placeholder so train and test share the same columns
# (caveat: after astype('str') this 0 becomes its own encoded class, '0')
combi = pd.concat([train, test], ignore_index=True)  # .append() is deprecated in newer pandas
de_combi = combi  # note: this is a reference to combi, not an independent copy
# keep one fitted LabelEncoder per column so each column can be decoded later
encoders = {}
for i in col:
    encoders[i] = LabelEncoder()
    combi[i] = encoders[i].fit_transform(combi[i].astype('str'))
    combi[i] = combi[i].astype('int')
train = combi[:train.shape[0]].copy()
test = combi[train.shape[0]:].copy()
test.drop('cat', axis=1, inplace=True)
training = train.drop(['support'], axis=1)
testing = test.drop(['support'], axis=1)
y_train = training['cat']
training.drop('cat', axis=1, inplace=True)
print("training set is \n", training)
print("testing set is \n", testing)
print("y train is: \n", y_train)
features = training.columns
target = 'cat'
X_train, X_test = training, testing
print("Starting with different supervised learning algorithms, let us check which algorithm gives us the best results.")
print("==================================================================================================================")
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
#from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
model_factory = [
    # RandomForestRegressor(),
    # XGBRegressor(nthread=1),
    # MLPRegressor(),
    # Ridge(),
    # BayesianRidge(),
    # ExtraTreesRegressor(),
    # ElasticNet(),
    KNeighborsRegressor(),
    # GradientBoostingRegressor(),
]
for model in model_factory:
    model.seed = 42
    num_folds = 3
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error')
    score_description = "%0.2f (+/- %0.2f)" % (np.sqrt(scores.mean() * -1), scores.std() * 2)
    print('{model:25} CV-{folds} RMSE: {score}'.format(
        model=model.__class__.__name__,
        folds=num_folds,
        score=score_description
    ))
print(" implement Pseudo-labelling, for this purpose I will be using test data as the unlabelled data.")
print("===============================================================================================")
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin

class PseudoLabeler(BaseEstimator, RegressorMixin):
    def __init__(self, model, unlabeled_data, features, target, sample_rate=0.2, seed=42):
        assert 0.0 <= sample_rate <= 1.0, 'sample_rate should be between 0.0 and 1.0.'
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        self.model.seed = seed
        self.unlabeled_data = unlabeled_data
        self.features = features
        self.target = target

    def get_params(self, deep=True):
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "unlabeled_data": self.unlabeled_data,
            "features": self.features,
            "target": self.target
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        augmented_train = self.__create_augmented_train(X, y)
        self.model.fit(
            augmented_train[self.features],
            augmented_train[self.target]
        )
        return self

    def __create_augmented_train(self, X, y):
        num_of_samples = int(len(self.unlabeled_data) * self.sample_rate)
        # Train the model and create the pseudo-labels
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.unlabeled_data[self.features])
        # Add the pseudo-labels to the test set
        pseudo_data = self.unlabeled_data.copy(deep=True)
        pseudo_data[self.target] = pseudo_labels
        # Take a subset of the test set with pseudo-labels and append it onto
        # the training set (seeded so the sampling is reproducible)
        sampled_pseudo_data = pseudo_data.sample(n=num_of_samples, random_state=self.seed)
        temp_train = pd.concat([X, y], axis=1)
        augmented_train = pd.concat([sampled_pseudo_data, temp_train])
        return shuffle(augmented_train, random_state=self.seed)

    def predict(self, X):
        return self.model.predict(X)

    def get_model_name(self):
        return self.model.__class__.__name__
model_factory = [
    KNeighborsRegressor(),
    # XGBRegressor(nthread=1),
    PseudoLabeler(
        KNeighborsRegressor(),
        # XGBRegressor(nthread=1),
        test,
        features,
        target,
        sample_rate=0.3
    ),
]
for model in model_factory:
    model.seed = 42
    num_folds = 8
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error', n_jobs=8)
    score_description = "RMSE: %0.4f (+/- %0.4f)" % (np.sqrt(scores.mean() * -1), scores.std() * 2)
    print('{model:25} CV-{num_folds} {score_cv}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score_cv=score_description
    ))
    print()
# print("added labels are: \n", ps.predict(testing))  # (ps is defined below)
features = ['sequence']  # must be a list of column names, not the column itself
ps = PseudoLabeler(model=KNeighborsRegressor(), unlabeled_data=test, features=features, target='cat')
print(test, ps)
# print("ps is: \n", ps)
# note: predict() only takes X; model, data, features and target are set in the constructor
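Based on my reading of the scikit-learn docs, I think the decoding step should look like the sketch below. It assumes the per-column encoders dict from the preprocessing above (my own restructuring; the original code reused a single encoder), and it rounds the predictions because KNeighborsRegressor returns floats rather than integer class codes:

ps.fit(X_train, y_train)
preds = ps.predict(testing)
# a regressor outputs floats, so round and clip into the range of valid codes
codes = np.clip(np.rint(preds).astype(int), 0, len(encoders['cat'].classes_) - 1)
print("decoded labels:\n", encoders['cat'].inverse_transform(codes))
# caveat: the 0 placeholder written into test['cat'] was encoded as the string
# class '0', so some decoded values may be '0' rather than Good/Poor/Neutral

Does this look like the right way to recover the original labels, or is there a cleaner approach?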