KeyError in Big Mart Sales Problem

big_mart_sales

#1

I was solving the Big Mart Sales problem, but I ended up with the error below. Please help. I am following the solution provided on the website!

#Determine average visibility of a product
visibility_avg = data.pivot_table(values='Item_Visibility', index='Item_Identifier')

#Impute 0 values with mean visibility of that product:
miss_bool = (data['Item_Visibility'] == 0)

print('Number of 0 values initially: %d'%sum(miss_bool))
data.loc[miss_bool,'Item_Visibility'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: visibility_avg[x])
print('Number of 0 values after modification: %d'%sum(data['Item_Visibility'] == 0))

Number of 0 values initially: 879
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
C:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'FDX07'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-20-a51b4136f1a2> in <module>()
      6 
      7 print('Number of 0 values initially: %d'%sum(miss_bool))
----> 8 data.loc[miss_bool,'Item_Visibility'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: visibility_avg[x])
      9 print('Number of 0 values after modification: %d'%sum(data['Item_Visibility'] == 0))

C:\Anaconda\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   2549             else:
   2550                 values = self.asobject
-> 2551                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2552 
   2553         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()

<ipython-input-20-a51b4136f1a2> in <lambda>(x)
      6 
      7 print('Number of 0 values initially: %d'%sum(miss_bool))
----> 8 data.loc[miss_bool,'Item_Visibility'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: visibility_avg[x])
      9 print('Number of 0 values after modification: %d'%sum(data['Item_Visibility'] == 0))

C:\Anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

C:\Anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

C:\Anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

C:\Anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

C:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'FDX07'

#2

Hey - @AnnaList,

You may have missed a step that needs to be run before this code. Please take another look at the article and run each code block step by step.


#3

Hey @jalFaizy,

I’m following the same article and I am still getting a similar error:

    Original #missing: 2439
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-14-0a8079487a22> in <module>()
      7 #Impute data and check #missing values before and after imputation to confirm
      8 print 'Orignal #missing: %d'% sum(miss_bool)
----> 9 data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])
     10 print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2549             else:
   2550                 values = self.asobject
-> 2551                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2552 
   2553         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()

<ipython-input-14-0a8079487a22> in <lambda>(x)
      7 #Impute data and check #missing values before and after imputation to confirm
      8 print 'Orignal #missing: %d'% sum(miss_bool)
----> 9 data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])
     10 print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2137             return self._getitem_multilevel(key)
   2138         else:
-> 2139             return self._getitem_column(key)
   2140 
   2141     def _getitem_column(self, key):

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2144         # get column
   2145         if self.columns.is_unique:
-> 2146             return self._get_item_cache(key)
   2147 
   2148         # duplicate columns & possible reduce dimensionality

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1840         res = cache.get(item)
   1841         if res is None:
-> 1842             values = self._data.get(item)
   1843             res = self._box_item_values(item, values)
   1844             cache[item] = res

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3841 
   3842             if not isna(item):
-> 3843                 loc = self.items.get_loc(item)
   3844             else:
   3845                 indexer = np.arange(len(self.items))[isna(self.items)]

/Applications/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2525                 return self._engine.get_loc(key)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 
   2529         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'FDP10'

#4

Exactly the same error!!
No matter how many times I run the exact same code from the start.
Please help, someone!


#5

Hi @AnnaList @trmenchen

I faced the same problem while working on this dataset. One possible reason why it worked for item_weight and not item_visibility is that the values in item_visibility are not missing, instead they are filled as 0.00. (I am not sure about this, just a guess)

Coming back to the problem, the item visibility cannot be 0 for an item that is sold, so I filled the values with the median value of this column. (I did not use mode since it was zero, and because most values are zero, using mean will not be appropriate).


#6

Can anybody provide a solution for the same KeyError: ‘FDP10’?


#7

Did you solve it?


#8

HI @digvijay.vyas @anudeep_reddy,

In order to impute the missing values in Item_weight column with the mean, you can use the following code:

df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

#9

Hey, this won’t solve the problem, as you are substituting the mean of the complete column rather than the mean for each item.


#10

Kunal Sir, please reply to this particular thread. We are unable to resolve this.
Also, can you explain how loc and iloc work?


#11

import numpy
df['Item_Visibility'].replace(0,numpy.NaN,inplace=True)

df.groupby('Item_Identifier').transform(lambda x: x.fillna(x.mean()))

I solved it using the above code. Hope it helps!

But I don’t yet know how to fix the error in this code:

print('Number of 0 values initially: %d'%sum(miss_bool))
data.loc[miss_bool,'Item_Visibility'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: visibility_avg[x])
print('Number of 0 values after modification: %d'%sum(data['Item_Visibility'] == 0))


#12

I have a problem solving this error.
set.seed(1237)

my_control = trainControl(method="cv", number=5) # 5-fold CV
tgrid = expand.grid(
  .mtry = c(3:10),
  .splitrule = "variance",
  .min.node.size = c(10,15,20)
)

rf_mod = train(x = Traindata[, -c("Item_Identifier", "Item_Outlet_Sales")],
               y = Traindata$Item_Outlet_Sales,
               method='ranger',
               trControl= my_control,
               tuneGrid = tgrid,
               num.trees = 400,
               importance = "permutation")
    

Warning message:
model fit failed for Fold5: mtry= 3, splitrule=variance, min.node.size=15 Error in ranger::ranger(dependent.variable.name = ".outcome", data = x, :
User interrupt or internal error.


#13

@rrao6925 the code is working fine at my end.


#14

Has anybody found a solution for this? I am getting the same error. The lookup on the “pivot_table” result is not working for the key FDP10. Please HELP!!

#########CODE

#Determine the average weight per item:
item_avg_weight = data.pivot_table(values='Item_Weight', index='Item_Identifier')

#Get a boolean variable specifying missing Item_Weight values
miss_bool = data['Item_Weight'].isnull() 

#Impute data and check #missing values before and after imputation to confirm
print 'Orignal #missing: %d'% sum(miss_bool)
data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])
print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

########Output

Orignal #missing: 2439
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-0a8079487a22> in <module>()
      7 #Impute data and check #missing values before and after imputation to confirm
      8 print 'Orignal #missing: %d'% sum(miss_bool)
----> 9 data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])
     10 print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2353             else:
   2354                 values = self.asobject
-> 2355                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2356 
   2357         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/src\inference.pyx in pandas._libs.lib.map_infer()

<ipython-input-8-0a8079487a22> in <lambda>(x)
      7 #Impute data and check #missing values before and after imputation to confirm
      8 print 'Orignal #missing: %d'% sum(miss_bool)
----> 9 data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])
     10 print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
   1962             return self._getitem_multilevel(key)
   1963         else:
-> 1964             return self._getitem_column(key)
   1965 
   1966     def _getitem_column(self, key):

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
   1969         # get column
   1970         if self.columns.is_unique:
-> 1971             return self._get_item_cache(key)
   1972 
   1973         # duplicate columns & possible reduce dimensionality

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
   1643         res = cache.get(item)
   1644         if res is None:
-> 1645             values = self._data.get(item)
   1646             res = self._box_item_values(item, values)
   1647             cache[item] = res

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\internals.pyc in get(self, item, fastpath)
   3588 
   3589             if not isnull(item):
-> 3590                 loc = self.items.get_loc(item)
   3591             else:
   3592                 indexer = np.arange(len(self.items))[isnull(self.items)]

C:\Users\5002171\AppData\Local\Continuum\anaconda2\lib\site-packages\pandas\core\indexes\base.pyc in get_loc(self, key, method, tolerance)
   2442                 return self._engine.get_loc(key)
   2443             except KeyError:
-> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2445 
   2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'FDP10'

#15

I’m also getting KeyError: 'FDP10'

Edit:
Found another way to impute the average weight per item:

data.set_index('Item_Identifier',inplace=True)
data['Item_Weight'].fillna(item_avg_weight.Item_Weight,inplace=True)
data.reset_index(inplace=True)

#16

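This worked for me.
Change the lambda expression so that it searches the index for the Item_Identifier and gets the value. (`item_avg_weight` is a DataFrame, so `item_avg_weight[x]` looks for a column named `x` rather than a row, which is what raises the KeyError.)
Change from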

data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])

to

data.loc[miss_bool,'Item_Weight']  = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight.at[x,'Item_Weight'])

#17

I had the same problem with Item_Weight. The answer is that you need to add .loc to item_avg_weight to get the row out of it:
data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight.loc[x])


#18

Does anybody know why this code doesn’t work?
#Determine another variable with means ratio
data['Item_Visibility_MeanRatio'] = data.apply(lambda x: x['Item_Visibility']/visibility_avg[x['Item_Identifier']], axis=1)
print data['Item_Visibility_MeanRatio'].describe()


#19

I’m having the same issue and it’s pretty frustrating. KeyError: ‘FDP10’


#20

@ujjwalJain18, did you try the solution provided by @franksalas ?