I’m working on a machine with a V100 that has 16 GB of on-board memory, so it is very strange that I’m getting a CUDA out-of-memory error on a tabular dataset, even with a batch size of 1.
Here are the details of my data:
len(train_df), len(test_df)
(1333192, 148469)
train_df.head()
train_id item_condition_id category_name brand_name price shipping main_cat sub_cat1 sub_cat2
0 0 3 Men/Tops/T-shirts NaN 10.0 1 Men Tops T-shirts
1 1 3 Electronics/Computers & Tablets/Components & P... Razer 52.0 0 Electronics Computers & Tablets Components & Parts
2 2 1 Women/Tops & Blouses/Blouse Target 10.0 1 Women Tops & Blouses Blouse
3 3 1 Home/Home Décor/Home Décor Accents NaN 35.0 1 Home Home Décor Home Décor Accents
4 4 1 Women/Jewelry/Necklaces NaN 44.0 0 Women Jewelry Necklaces
cat_vars = ['item_condition_id', 'category_name', 'brand_name', 'shipping', 'main_cat', 'sub_cat1', 'sub_cat2']
dep_var = ['price']
data_str = (TabularList.from_df(train_df, path=path, cat_names=cat_vars, cont_names=[], procs=[Categorify])
.split_by_idx(get_rdm_idx(train_df))
.label_from_df(cols=dep_var, label_cls=FloatList, log=True)
.add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=[]))
.databunch(bs=1))
Notice the bs=1 argument. While I’m able to manually load mini-batches from data_str.train_dl even with bs=128, I get a CUDA out-of-memory error with bs=1.
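For reference, this is roughly what that manual check looks like (a minimal sketch; the attribute names follow fastai v1’s DataBunch API, and the printed shapes are just illustrative):

# Pull a single mini-batch directly from the train DataLoader to confirm
# that batching itself works.
xb, yb = next(iter(data_str.train_dl))
# For a TabularList batch, xb is a (categoricals, continuous) pair of tensors;
# the continuous tensor is empty here since cont_names=[].
print(xb[0].shape, xb[1].shape, yb.shape)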
Here is my model:
max_log_y = np.log(train_df['price'].max())*1.2
y_range = torch.tensor([np.log(3), max_log_y], device=defaults.device)
learn = tabular_learner(data_str, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics=rmsle)
learn.model
TabularModel(
(embeds): ModuleList(
(0): Embedding(6, 4)
(1): Embedding(1263, 87)
(2): Embedding(4446, 177)
(3): Embedding(3, 3)
(4): Embedding(11, 6)
(5): Embedding(114, 23)
(6): Embedding(860, 70)
)
(emb_drop): Dropout(p=0.04)
(bn_cont): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(layers): Sequential(
(0): Linear(in_features=370, out_features=1000, bias=True)
(1): ReLU(inplace)
(2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): Dropout(p=0.001)
(4): Linear(in_features=1000, out_features=500, bias=True)
(5): ReLU(inplace)
(6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Dropout(p=0.01)
(8): Linear(in_features=500, out_features=1, bias=True)
)
)
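For scale, a quick parameter count (plain PyTorch, nothing fastai-specific) suggests the whole model, embeddings included, is only a few MB of weights, which is why the OOM is so surprising:

# Rough sanity check on model size: count parameters and estimate fp32 footprint.
n_params = sum(p.numel() for p in learn.model.parameters())
print(f'{n_params:,} parameters, ~{n_params * 4 / 1e6:.1f} MB in fp32')
# Even the largest embedding table (4446 x 177) is well under a million floats,
# so the full model should be nowhere near 16 GB.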
When I run lr_find(), I get the CUDA out-of-memory error:
learn.lr_find()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-d81c6bd29d71> in <module>
----> 1 learn.lr_find()
~/fastai/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
29 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
30 a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 31 learn.fit(a, start_lr, callbacks=[cb], **kwargs)
32
33 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:
~/fastai/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
164 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
165 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 166 callbacks=self.callbacks+callbacks)
167
168 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
~/fastai/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
94 raise e
---> 95 finally: cb_handler.on_train_end(exception)
96
97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
~/fastai/fastai/callback.py in on_train_end(self, exception)
255 def on_train_end(self, exception:Union[bool,Exception])->None:
256 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 257 self('train_end', exception=exception)
258
259 class AverageMetric(Callback):
~/fastai/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
186 "Call through to all of the `CallbakHandler` functions."
187 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 188 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
189
190 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
~/fastai/fastai/callback.py in <listcomp>(.0)
186 "Call through to all of the `CallbakHandler` functions."
187 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 188 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
189
190 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
~/fastai/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
43 # restore the valid_dl we turned off on `__init__`
44 self.data.valid_dl = self.valid_dl
---> 45 self.learn.load('tmp')
46 if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
47 print('LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.')
~/fastai/fastai/basic_train.py in load(self, name, device, strict, with_opt)
211 "Load model and optimizer state (if `with_opt`) `name` from `self.model_dir` using `device`."
212 if device is None: device = self.data.device
--> 213 state = torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device)
214 if set(state.keys()) == {'model', 'opt'}:
215 self.model.load_state_dict(state['model'], strict=strict)
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in load(f, map_location, pickle_module)
365 f = open(f, 'rb')
366 try:
--> 367 return _load(f, map_location, pickle_module)
368 finally:
369 if new_fd:
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in _load(f, map_location, pickle_module)
536 unpickler = pickle_module.Unpickler(f)
537 unpickler.persistent_load = persistent_load
--> 538 result = unpickler.load()
539
540 deserialized_storage_keys = pickle_module.load(f)
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in persistent_load(saved_id)
502 if root_key not in deserialized_objects:
503 deserialized_objects[root_key] = restore_location(
--> 504 data_type(size), location)
505 storage = deserialized_objects[root_key]
506 if view_metadata is not None:
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in restore_location(storage, location)
385 elif isinstance(map_location, torch.device):
386 def restore_location(storage, location):
--> 387 return default_restore_location(storage, str(map_location))
388 else:
389 def restore_location(storage, location):
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in default_restore_location(storage, location)
111 def default_restore_location(storage, location):
112 for _, _, fn in _package_registry:
--> 113 result = fn(storage, location)
114 if result is not None:
115 return result
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in _cuda_deserialize(obj, location)
93 if location.startswith('cuda'):
94 device = validate_cuda_device(location)
---> 95 return obj.cuda(device)
96
97
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
494 # We need this method only for lazy init, so we can remove it
495 del _CudaBase.__new__
--> 496 return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
497
498
RuntimeError: CUDA error: out of memory
Could this be because of the cardinality of the embeddings? I have a language model training on another GPU (same specs: V100, 16 GB) without any problems, and it is using nearly 15 GB of memory. But this tabular job is only using around 1 GB of GPU memory and still gives an out-of-memory error:
Every 5.0s: nvidia-smi Sat Dec 15 16:04:37 2018
Sat Dec 15 16:04:37 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79 Driver Version: 410.79 CUDA Version: 10.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... Off | 00000000:1A:00.0 Off | 0 |
| N/A 66C P0 269W / 300W | 16127MiB / 16130MiB | 95% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2... Off | 00000000:1C:00.0 Off | 0 |
| N/A 31C P0 41W / 300W | 11MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla V100-SXM2... Off | 00000000:1D:00.0 Off | 0 |
| N/A 33C P0 53W / 300W | 1086MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla V100-SXM2... Off | 00000000:1E:00.0 Off | 0 |
| N/A 66C P0 217W / 300W | 15044MiB / 16130MiB | 94% Default |
+-------------------------------+----------------------+----------------------+
GPUs 0 and 1 are running language models. I’m using GPU 2 for this dataset. Looking for help debugging this issue.
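For completeness, here is a minimal sketch of the kind of diagnostic I can run to confirm which device fastai/PyTorch is actually targeting (assuming fastai v1, where defaults lives in fastai.torch_core; this is just a check, not output from the failing run above):

import torch
from fastai.torch_core import defaults

# defaults.device is what y_range was placed on earlier.
print(defaults.device)
# Index of the GPU PyTorch currently targets, and memory allocated on it.
print(torch.cuda.current_device())
print(torch.cuda.memory_allocated() / 1e6, 'MB allocated')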
Thanks.