@muellerzr @sgugger Have you been able to find a workaround for the padding issues causing decreased performance with get_preds() versus predict()?
For my use case of multi-label classification on text data, I’m noticing that most of the records not able to be categorized by get_preds() are correctly predicted using predict(). However, runtime with predict() is too long to use in production - it takes around 7 hours for 50k records compared to ~1.5min for get_preds().
Here’s my inference code for both approaches (you can ignore the MLflow parts).
Using predict():
input_df = pd.read_csv(filename, encoding="ISO-8859-1", dtype='object').dropna(subset=['text'])


def predict_labels(new_data):
    """Tag each row of *new_data* via per-record learn.predict() calls.

    Loads the best run's fastai model and MultiLabelBinarizer from MLflow,
    runs inference over new_data['text'] one record at a time, and returns
    the frame with an aggregated 'Predictions' string column plus one
    binary indicator column per label class.

    NOTE(review): per-row predict() is the slow path (~7h for 50k records);
    kept for accuracy comparison against the batched get_preds() version.
    """
    # Update this with the production Experiment ID
    experiment_id = '4105035027016985'
    # Find the 'best' run (highest weighted-avg precision) and load model components
    top_run = mlflow.search_runs(
        experiment_ids=experiment_id,
        order_by=["metrics.`weighted avg precision` DESC"],
    ).loc[0, ['run_id', 'metrics.weighted avg f1-score',
              'metrics.weighted avg precision', 'metrics.weighted avg recall']]
    model_uri = '/dbfs/databricks/mlflow/' + experiment_id + '/' + top_run['run_id'] + '/artifacts/fastai_model'
    learn = load_learner(model_uri, 'fastai.pkl')
    # Fixed: original had a duplicated assignment target ('mlb_uri = mlb_uri = ...').
    mlb_uri = 'dbfs:/databricks/mlflow/' + experiment_id + '/' + top_run['run_id'] + '/artifacts/MultiLabelBinarizer'
    mlb = mlflow.sklearn.load_model(mlb_uri)

    # Closure accumulator: one binary label vector per record, in row order.
    predictions = []

    def predict_apply(text):
        # learn.predict returns (decoded_labels, label_tensor, probs);
        # keep the decoded labels as a string and stash the binary vector.
        results = learn.predict(text)
        predictions.append(results[1].numpy().tolist())
        return str(results[0])

    # No lambda wrapper needed: Series.apply takes the function directly.
    new_data['Predictions'] = new_data['text'].apply(predict_apply)
    # Strip the 'lbl_' prefix the training data used for label columns.
    colnames = [c.replace('lbl_', '') for c in mlb.classes_.tolist()]
    predictions_df = pd.DataFrame(predictions, columns=colnames)
    new_data = new_data.join(predictions_df)
    return new_data


input_df = predict_labels(input_df)
input_df.head(5)
input_df.to_csv(path + 'results_predict.csv', index=False)
Using get_preds():
input_df = pd.read_csv(filename, encoding="ISO-8859-1", dtype='object').dropna(subset=['text'])


def predict_labels(new_data):
    """Tag all rows of *new_data* in one batch via learn.get_preds().

    Loads the best run's fastai model and MultiLabelBinarizer from MLflow,
    adds new_data['text'] as a test set, scores it in a single batched
    pass, and returns the frame with one binary indicator column per label
    plus an aggregated 'Predictions' string column ('Not Categorized' when
    no label clears the threshold).
    """
    # Update this with the production Experiment ID
    experiment_id = '4105035027016985'
    # Find the 'best' run (highest weighted-avg precision) and load model components
    top_run = mlflow.search_runs(
        experiment_ids=experiment_id,
        order_by=["metrics.`weighted avg precision` DESC"],
    ).loc[0, ['run_id', 'metrics.weighted avg f1-score',
              'metrics.weighted avg precision', 'metrics.weighted avg recall']]
    model_uri = '/dbfs/databricks/mlflow/' + experiment_id + '/' + top_run['run_id'] + '/artifacts/fastai_model'
    learn = load_learner(model_uri, 'fastai.pkl')
    # Fixed: original had a duplicated assignment target ('mlb_uri = mlb_uri = ...').
    mlb_uri = 'dbfs:/databricks/mlflow/' + experiment_id + '/' + top_run['run_id'] + '/artifacts/MultiLabelBinarizer'
    mlb = mlflow.sklearn.load_model(mlb_uri)

    # Score the whole column at once; ordered=True keeps row alignment
    # with new_data so the join below is valid.
    learn.data.add_test(new_data['text'])
    predictions, _ = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
    # Decision threshold of 0.4 per label — presumably tuned on validation
    # data; confirm it matches the threshold used by the predict() path.
    PROB_THRESHOLD = 0.4
    predictions = np.where(predictions.numpy() > PROB_THRESHOLD, 1, 0)

    # Strip the 'lbl_' prefix the training data used for label columns.
    colnames = [c.replace('lbl_', '') for c in mlb.classes_.tolist()]
    # Use column names to build the array into a DataFrame
    predictions_df = pd.DataFrame(predictions, columns=colnames)
    # Add aggregated tagging column: comma-joined label names, or a
    # placeholder when no label was predicted.
    predictions_df['Predictions'] = mlb.inverse_transform(predictions)
    predictions_df['Predictions'] = [', '.join(map(str, labels)) for labels in predictions_df['Predictions']]
    predictions_df['Predictions'].replace('', 'Not Categorized', inplace=True)
    new_data = new_data.join(predictions_df)
    return new_data


input_df = predict_labels(input_df)
input_df.to_csv(path + 'results_getpreds.csv', index=False)