@johnkeefe absolutely! I also updated the code a bit more, as I wasn't quite satisfied. This version now has a progress bar showing which variable (out of the total number) you're on, along with an extra column giving each variable's type (something I found quite confusing to cross-reference before).
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from fastai.basic_train import Learner  # fastai v1

def feature_importance(learn:Learner):
    # based on: https://medium.com/@mp.music93/neural-networks-feature-importance-with-fastai-5c393cf65815
    data = learn.data.train_ds.x
    cat_names = data.cat_names
    cont_names = data.cont_names
    # baseline: mean loss over the validation set with no columns shuffled
    loss0 = np.array([learn.loss_func(learn.pred_batch(batch=(x, y.to('cpu'))), y.to('cpu'))
                      for x, y in iter(learn.data.valid_dl)]).mean()
    fi = dict()
    types = [cat_names, cont_names]
    with tqdm(total=len(data.col_names)) as pbar:
        for j, t in enumerate(types):  # loop over cat_names, then cont_names
            for i, c in enumerate(t):
                loss = []
                for x, y in iter(learn.data.valid_dl):  # for every batch in the validation set
                    col = x[j][:,i]  # select this variable's column of tensors
                    idx = torch.randperm(col.nelement())  # random permutation of the column's indices
                    x[j][:,i] = col.view(-1)[idx].view(col.size())  # shuffle the column in place
                    y = y.to('cpu')
                    loss.append(learn.loss_func(learn.pred_batch(batch=(x, y)), y))
                pbar.update(1)
                fi[c] = np.array(loss).mean() - loss0  # importance = rise in loss after shuffling
    d = sorted(fi.items(), key=lambda kv: kv[1], reverse=True)
    df = pd.DataFrame({'Variable': [l for l, v in d],
                       'Importance': np.log1p([v for l, v in d])})
    # tag each variable with its type in one pass instead of assigning row by row,
    # which avoids pandas' chained-assignment warning entirely
    df['Type'] = ['categorical' if v in cat_names else 'continuous' for v in df['Variable']]
    return df
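
In case it helps anyone trying this out, here's a minimal usage sketch (assuming you already have a trained tabular learn; the bar chart at the end is just one quick way to eyeball the result, not part of the function itself):

fi_df = feature_importance(learn)
print(fi_df.head(10))  # the ten most important variables, with their types
# quick horizontal bar chart (uses matplotlib, which fastai already pulls in)
fi_df.set_index('Variable')['Importance'].plot.barh(figsize=(8, 10))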