[Solved] Reproducibility: Where is the randomness coming in?

Stephen - thanks for responding, and sorry that my issue was not clear. The goal is to get a single deterministic measure when providing the same inputs, rather than a distribution of measures that varies by 1%. Even reloading the same initial model weights yields varying results unless cudnn's deterministic flag is set to True, benchmark to False, and num_workers to zero.
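
For reference, a minimal sketch of the two settings this refers to (PyTorch's cuDNN flags plus single-process data loading; where exactly you pass num_workers depends on the fastai version):

import torch

# cuDNN's autotuner (benchmark mode) selects kernels nondeterministically,
# so force deterministic algorithms and disable the autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ...and pass num_workers=0 when building the DataBunch, so all data loading
# runs in the main process with a single, controllable RNG state.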

1 Like

Oops! Now I see that this advice is already given in the fastai docs.

3 Likes

For anybody else wondering like me:

https://docs.fast.ai/dev/test.html#getting-reproducible-results

7 Likes

Ha… at least this is a function rather than the list of code in the docs. How about the fastai library having a callable function or setting somewhere to do this? I think it's a must-have when experimenting.
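
For what it's worth, fastai2 (which comes up later in this thread) does ship such a helper; a minimal usage sketch, assuming the fastai2 set_seed API:

from fastai.torch_core import set_seed

# seeds Python's random, NumPy and PyTorch (incl. CUDA); reproducible=True
# also sets the cudnn deterministic/benchmark flags
set_seed(42, reproducible=True)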

As an addendum, you also need to watch out for seeds used in generating your train/validation split. Mine was non-reproducible even after the above, due to my use of the pandas sample function to create my validation set:

df.sample(frac=0.3)

You can fix this by passing a seed there too:

df.sample(frac=0.3, random_state=42)

Now I finally have reproducible results. Yay!

2 Likes

Hi @Pomo, I saw your very helpful answer and implemented it, but I'm still not getting reproducible results. I'm not sure if I'm setting num_workers in the right place, but I'm loading the datasets already split, so that can't be causing the problem @blissweb mentioned. Here is how I'm creating the DataBunch, setting the seeds and running the learner:

def random_seed(seed_value):
    import random
    random.seed(seed_value)        # Python RNG
    import numpy as np
    np.random.seed(seed_value)     # NumPy RNG (CPU)
    import torch
    torch.manual_seed(seed_value)  # PyTorch CPU RNG

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)     # GPU RNGs
        torch.backends.cudnn.deterministic = True  # needed for deterministic kernels
        torch.backends.cudnn.benchmark = False
random_seed(0)

dep_var = 'NumberOfSales'
df = train_df[cat_vars + cont_vars + [dep_var]].copy()

path = "c:/Benchmarking/testBench.csv"
data = (TabularList.from_df(df, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var, label_cls=FloatList, log=False)
                .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
                .databunch(num_workers=0))

# x = best.x  # I'm using scikit-optimize to find the best parameters, but then can't reproduce the results.
x = [500, 500, 100, 0.0005, 0.4, 8]
print(x)
learn3 = tabular_learner(data, layers=[x[0], x[1], x[2]], ps=[0.09, 0.5, 0.5], emb_drop=0.04,
                         y_range=y_range, metrics=mae)
learn3.fit_one_cycle(1, x[3], wd=x[4], div_factor=x[5])
3 Likes

Hi Rodrigo,

More work has since been done on this question by me and others. It looks like random seeds need to be set before creating the DataBunch and before the first fit(), and maybe before creating the Learner. Please see this thread:

https://forums.fast.ai/t/lesson1-reproducible-results-setting-seed-not-working/37921
Also,
https://forums.fast.ai/t/help-debug-reproducable-results-solved/48839/2?u=pomo

But I am now quite out of touch with the current “SOTA” in fastai reproducibility. (I was using it to isolate the effects of hyperparameters.) It would be a service if you could combine these posts, do your own experiments, and summarize your conclusions here. I would certainly appreciate it!

1 Like

Ok, I finally got it to work. To detail the instructions a bit more:

  1. You have to run random_seed(0) before the first fit;
  2. You have to run it before creating the DataBunch;
  3. And you have to call it again before every subsequent call to fit.

I was calling it once, before creating the DataBunch, and assuming the seed would stay set. So besides the code above, this solved it for me:

random_seed(0)  # need to set the seed here again, right before creating the learner and calling fit
x = [500, 500, 100, 0.0005, 0.4, 8]
learn3 = tabular_learner(data, layers=[x[0], x[1], x[2]], ps=[0.09, 0.5, 0.5], emb_drop=0.04,
                         y_range=y_range, metrics=mae)
learn3.fit_one_cycle(1, x[3], wd=x[4], div_factor=x[5])
10 Likes

Thanks! Your efforts will be helpful to me and others.

To get reproducible results between kernel restarts, run your script or jupyter with a fixed PYTHONHASHSEED:

env PYTHONHASHSEED=42 python train.py
or
env PYTHONHASHSEED=42 jupyter notebook

Note that setting PYTHONHASHSEED inside the notebook or training script doesn't help. Hope this helps!
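
A quick way to verify (a sketch; run it in two separate interpreter sessions): Python randomizes string hashing per process unless PYTHONHASHSEED is fixed, so a bare hash() call is only stable across restarts when the variable was set before launch.

import os

print(os.environ.get('PYTHONHASHSEED'))  # '42' if set before launch, else None
print(hash('reproducibility'))           # stable across restarts only when seeded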

2 Likes

I'm using this to seed, as suggested here:

import random
import numpy as np
import torch

def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)     # NumPy RNG (CPU)
    torch.manual_seed(seed_value)  # PyTorch CPU RNG
    random.seed(seed_value)        # Python RNG
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)     # GPU RNGs
        torch.backends.cudnn.deterministic = True  # needed
        torch.backends.cudnn.benchmark = False

but I'm not able to reproduce the values. What am I missing?
Thanks in advance. :slight_smile:

@barnacl make sure you pass a seed into your RandomSplitter too; you may be missing one there because everything was already split before you set the seed.
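
For example (a sketch, assuming the fastai2 RandomSplitter API):

from fastai.data.transforms import RandomSplitter

# seed the split itself, not just the global RNGs, so the same items land in
# the validation set on every run
splitter = RandomSplitter(valid_pct=0.2, seed=42)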

1 Like

Oops! I should have been more careful.
I made that change but am still missing something. Here is a copy of your notebook, @muellerzr, with the changes I added: https://colab.research.google.com/drive/1Ur6ftKvOjXgukHlmhiPa7AUcMZCK0hTQ
fastcore just got updated and is breaking some things, I think.
Will report back.

@barnacl another issue could be your environment setup. If you look at the most recent notebooks, I just do a pip install fastai2. No need for torch etc. to be on specific versions :slight_smile:

Ah ok, let me check that too. Thank you.

I pinned fastcore to 0.1.12 (0.1.13 was complaining about as_item missing), but I'm still not able to get rid of the randomness.

I grabbed your random function and tried it with the MNIST example from the walkthrough; still random :frowning:

Using only PyTorch (not fastai in this case, but no less amazing: https://github.com/qubvel/segmentation_models.pytorch), I was having the same problem on Jupyter: results were reproducible on "run all cells" (without a restart) using all the seed/force-deterministic operations listed above, but between kernel restarts the results were always different :frowning:

Note: all results (splits, augmentation, pre-training validation epoch) were equal until torch training started. During training something is affected that I could only fix by setting the PYTHONHASHSEED env variable, as mentioned, before starting Jupyter.

After doing this, I can fully reproduce results between restarts. Finally!
A really tricky issue and hard to detect. Probably a lot of people think they have reproducible results when they haven't?.. (much like people assume they have valid backups :slight_smile: )

Next step: check container restarts :), host restarts, different VMs, and cloud providers… who knows? :slight_smile:

(Note: as mentioned by @esingildinov, PYTHONHASHSEED has to be set prior to jupyter/kernel start; setting the env var inside the notebook doesn't work.)

2 Likes

Any tips on how to do this with Colab?

Here is an example of reproducibility in fastai2:

from fastai.vision.all import *

def is_cat(x): return x[0].isupper()
path = untar_data(URLs.PETS)/'images'

# first run: seed everything, then build the DataLoaders and train
set_seed(42, True)
dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2, seed=42,
    label_func=is_cat, item_tfms=Resize(224))
learn = cnn_learner(dls, resnet34, metrics=error_rate)
learn.fit(1)

# second run: reseed AND recreate the DataLoaders -> identical results
set_seed(42, True)
dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2, seed=42,
    label_func=is_cat, item_tfms=Resize(224))
learn = cnn_learner(dls, resnet34, metrics=error_rate)
learn.fit(1)

Please note that you must set the seed before the DataLoaders are created, and recreate the DataLoaders when setting a new seed.

The DataLoader keeps an internal random number generator that is seeded with a random number at creation time. That seed is not updated by set_seed, which is why you have to recreate the DataLoaders.
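
Here is a small sketch of that behavior (an assumption based on fastai2's DataLoader seeding its internal rng from Python's random module at creation; the printed orders are what I'd expect, not verified output):

from fastai.data.load import DataLoader
import random

random.seed(42)
dl = DataLoader(list(range(10)), bs=5, shuffle=True)
print(list(dl))   # some shuffled order

random.seed(42)   # reseeds the global RNG only; dl's internal rng has advanced
print(list(dl))   # a different order

random.seed(42)
dl = DataLoader(list(range(10)), bs=5, shuffle=True)  # recreated -> rng reseeded
print(list(dl))   # matches the first print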

5 Likes