I’m working on a machine with a V100 that has 16 GB of on-board memory, so it is very strange that I’m getting a CUDA out-of-memory error on a tabular dataset, even with a batch size of 1.
Here are the details of my data:
len(train_df), len(test_df)
(1333192, 148469)
train_df.head()
train_id item_condition_id category_name brand_name price shipping main_cat sub_cat1 sub_cat2
0 0 3 Men/Tops/T-shirts NaN 10.0 1 Men Tops T-shirts
1 1 3 Electronics/Computers & Tablets/Components & P... Razer 52.0 0 Electronics Computers & Tablets Components & Parts
2 2 1 Women/Tops & Blouses/Blouse Target 10.0 1 Women Tops & Blouses Blouse
3 3 1 Home/Home Décor/Home Décor Accents NaN 35.0 1 Home Home Décor Home Décor Accents
4 4 1 Women/Jewelry/Necklaces NaN 44.0 0 Women Jewelry Necklaces
cat_vars = ['item_condition_id', 'category_name', 'brand_name', 'shipping', 'main_cat', 'sub_cat1', 'sub_cat2']
dep_var = ['price']
data_str = (TabularList.from_df(train_df, path=path, cat_names=cat_vars, cont_names=[], procs=[Categorify])
.split_by_idx(get_rdm_idx(train_df))
.label_from_df(cols=dep_var, label_cls=FloatList, log=True)
.add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=[]))
.databunch(bs=1))
Notice the bs=1 argument. While I’m able to manually load mini-batches from data_str.train_dl even with bs=128, I get a CUDA out-of-memory error with bs=1.
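For reference, this is roughly what that manual check looks like (a minimal sketch; the attribute names follow fastai v1’s DataBunch API, and the printed shapes are just illustrative):

# Pull a single mini-batch directly from the train DataLoader to confirm
# that batching itself works.
xb, yb = next(iter(data_str.train_dl))
# For a TabularList batch, xb is a (categoricals, continuous) pair of tensors;
# the continuous tensor is empty here since cont_names=[].
print(xb[0].shape, xb[1].shape, yb.shape)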
Here is my model:
max_log_y = np.log(train_df['price'].max())*1.2
y_range = torch.tensor([np.log(3), max_log_y], device=defaults.device)
learn = tabular_learner(data_str, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics=rmsle)
learn.model
TabularModel(
(embeds): ModuleList(
(0): Embedding(6, 4)
(1): Embedding(1263, 87)
(2): Embedding(4446, 177)
(3): Embedding(3, 3)
(4): Embedding(11, 6)
(5): Embedding(114, 23)
(6): Embedding(860, 70)
)
(emb_drop): Dropout(p=0.04)
(bn_cont): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(layers): Sequential(
(0): Linear(in_features=370, out_features=1000, bias=True)
(1): ReLU(inplace)
(2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): Dropout(p=0.001)
(4): Linear(in_features=1000, out_features=500, bias=True)
(5): ReLU(inplace)
(6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Dropout(p=0.01)
(8): Linear(in_features=500, out_features=1, bias=True)
)
)
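For scale, a quick parameter count (plain PyTorch, nothing fastai-specific) suggests the whole model, embeddings included, is only a few MB of weights, which is why the OOM is so surprising:

# Rough sanity check on model size: count parameters and estimate fp32 footprint.
n_params = sum(p.numel() for p in learn.model.parameters())
print(f'{n_params:,} parameters, ~{n_params * 4 / 1e6:.1f} MB in fp32')
# Even the largest embedding table (4446 x 177) is well under a million floats,
# so the full model should be nowhere near 16 GB.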
When I run lr_find(), I get the CUDA out-of-memory error:
learn.lr_find()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-d81c6bd29d71> in <module>
----> 1 learn.lr_find()
~/fastai/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
29 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
30 a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 31 learn.fit(a, start_lr, callbacks=[cb], **kwargs)
32
33 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:
~/fastai/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
164 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
165 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 166 callbacks=self.callbacks+callbacks)
167
168 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
~/fastai/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
94 raise e
---> 95 finally: cb_handler.on_train_end(exception)
96
97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
~/fastai/fastai/callback.py in on_train_end(self, exception)
255 def on_train_end(self, exception:Union[bool,Exception])->None:
256 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 257 self('train_end', exception=exception)
258
259 class AverageMetric(Callback):
~/fastai/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
186 "Call through to all of the `CallbakHandler` functions."
187 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 188 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
189
190 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
~/fastai/fastai/callback.py in <listcomp>(.0)
186 "Call through to all of the `CallbakHandler` functions."
187 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 188 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
189
190 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
~/fastai/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
43 # restore the valid_dl we turned off on `__init__`
44 self.data.valid_dl = self.valid_dl
---> 45 self.learn.load('tmp')
46 if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
47 print('LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.')
~/fastai/fastai/basic_train.py in load(self, name, device, strict, with_opt)
211 "Load model and optimizer state (if `with_opt`) `name` from `self.model_dir` using `device`."
212 if device is None: device = self.data.device
--> 213 state = torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device)
214 if set(state.keys()) == {'model', 'opt'}:
215 self.model.load_state_dict(state['model'], strict=strict)
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in load(f, map_location, pickle_module)
365 f = open(f, 'rb')
366 try:
--> 367 return _load(f, map_location, pickle_module)
368 finally:
369 if new_fd:
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in _load(f, map_location, pickle_module)
536 unpickler = pickle_module.Unpickler(f)
537 unpickler.persistent_load = persistent_load
--> 538 result = unpickler.load()
539
540 deserialized_storage_keys = pickle_module.load(f)
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in persistent_load(saved_id)
502 if root_key not in deserialized_objects:
503 deserialized_objects[root_key] = restore_location(
--> 504 data_type(size), location)
505 storage = deserialized_objects[root_key]
506 if view_metadata is not None:
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in restore_location(storage, location)
385 elif isinstance(map_location, torch.device):
386 def restore_location(storage, location):
--> 387 return default_restore_location(storage, str(map_location))
388 else:
389 def restore_location(storage, location):
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in default_restore_location(storage, location)
111 def default_restore_location(storage, location):
112 for _, _, fn in _package_registry:
--> 113 result = fn(storage, location)
114 if result is not None:
115 return result
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/serialization.py in _cuda_deserialize(obj, location)
93 if location.startswith('cuda'):
94 device = validate_cuda_device(location)
---> 95 return obj.cuda(device)
96
97
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78
/net/vaosl01/opt/NFS/sw/anaconda3/envs/mer/lib/python3.7/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
494 # We need this method only for lazy init, so we can remove it
495 del _CudaBase.__new__
--> 496 return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
497
498
RuntimeError: CUDA error: out of memory
Could this be because of the cardinality of the embeddings? I have a language model training on another GPU (same specs: V100, 16 GB) without any problems, and it is using nearly 15 GB of memory. But this tabular job is only using around 1 GB of GPU memory and still gives an out-of-memory error:
Every 5.0s: nvidia-smi Sat Dec 15 16:04:37 2018
Sat Dec 15 16:04:37 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79 Driver Version: 410.79 CUDA Version: 10.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... Off | 00000000:1A:00.0 Off | 0 |
| N/A 66C P0 269W / 300W | 16127MiB / 16130MiB | 95% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2... Off | 00000000:1C:00.0 Off | 0 |
| N/A 31C P0 41W / 300W | 11MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla V100-SXM2... Off | 00000000:1D:00.0 Off | 0 |
| N/A 33C P0 53W / 300W | 1086MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla V100-SXM2... Off | 00000000:1E:00.0 Off | 0 |
| N/A 66C P0 217W / 300W | 15044MiB / 16130MiB | 94% Default |
+-------------------------------+----------------------+----------------------+
GPUs 0 and 1 are running language models. I’m using GPU 2 for this dataset. Looking for help debugging this issue.
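For completeness, here is a minimal sketch of the kind of diagnostic I can run to confirm which device fastai/PyTorch is actually targeting (assuming fastai v1, where defaults lives in fastai.torch_core; this is just a check, not output from the failing run above):

import torch
from fastai.torch_core import defaults

# defaults.device is what y_range was placed on earlier.
print(defaults.device)
# Index of the GPU PyTorch currently targets, and memory allocated on it.
print(torch.cuda.current_device())
print(torch.cuda.memory_allocated() / 1e6, 'MB allocated')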
Thanks.