Hi,
How would you use Learner.split() to split a transformer?
My method is below, but I get the following error trying it. Any help or advice would be greatly appreciated. Thanks
> ---------------------------------------------------------------------------
> AttributeError Traceback (most recent call last)
> /usr/local/lib/python3.6/dist-packages/IPython/core/formatters.py in __call__(self, obj)
> 697 type_pprinters=self.type_printers,
> 698 deferred_pprinters=self.deferred_printers)
> --> 699 printer.pretty(obj)
> 700 printer.flush()
> 701 return stream.getvalue()
5 frames
/usr/local/lib/python3.6/dist-packages/fastai/core.py in func_args(func)
278 def func_args(func)->bool:
279 "Return the arguments of `func`."
--> 280 code = func.__code__
281 return code.co_varnames[:code.co_argcount]
282
AttributeError: 'method-wrapper' object has no attribute '__code__'
Here’s my model which uses bert from Huggingface’s Transformer library:
class BertForSequenceClassification(nn.Module):
    """Pretrained BERT encoder followed by a stack of linear layers.

    The pooled [CLS] output of `bert-base-uncased` (768-d) is passed through
    dropout and four linear layers down to a `num_labels`-way classifier.
    Loss is computed externally (e.g. by fastai's `loss_func`), so `forward`
    returns raw logits.

    Args:
        num_labels: number of output classes for the final classifier layer.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.lin1 = nn.Linear(768, 512)
        self.lin2 = nn.Linear(512, 256)
        self.lin3 = nn.Linear(256, 64)
        self.lin4 = nn.Linear(64, 32)
        # Was hard-coded to 7 while `num_labels` went unused; wire it up so
        # the head matches the requested class count (unchanged for 7).
        self.classifier = nn.Linear(32, num_labels)
        # Xavier-init every trainable head layer in one pass.
        for layer in (self.lin1, self.lin2, self.lin3, self.lin4, self.classifier):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return logits of shape (batch, num_labels).

        NOTE(review): `labels` is accepted but ignored — kept only for
        interface compatibility; the loss is computed by the caller.
        """
        # Pass by keyword: in HuggingFace transformers the signature is
        # BertModel.forward(input_ids, attention_mask, token_type_ids, ...),
        # so the original positional call swapped the mask and segment ids.
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        x = self.lin1(pooled_output)
        x = self.lin2(x)
        x = self.lin3(x)
        x = self.lin4(x)
        logits = self.classifier(x)
        return logits
# Build the classification model, wrap it in a fastai Learner and switch
# the learner to mixed-precision (fp16) training.
custom_transformer_model = BertForSequenceClassification(7)
learner = Learner(databunch, custom_transformer_model, loss_func=loss_func).to_fp16()
I’m trying to split it as such:
# Layer groups for discriminative learning rates / gradual unfreezing:
# embeddings, each of the 12 encoder blocks, then pooler and the head.
# Unpack the ModuleList instead of hand-writing encoder.layer[0..11].
list_layers = [
    learner.model.bert.embeddings,
    *learner.model.bert.encoder.layer,  # all 12 BertLayer blocks
    learner.model.bert.pooler,
    learner.model.dropout,
    learner.model.lin1,
    learner.model.lin2,
    learner.model.lin3,
    learner.model.lin4,
    learner.model.classifier,
]
# NOTE(review): the AttributeError in the traceback comes from IPython
# pretty-printing the cell's result, not from split() itself — ending the
# cell with `learner.split(list_layers);` (or not displaying the return
# value) should suppress it. TODO confirm against the fastai version used.
learner.split(list_layers)
Also here’s the Model if that helps:
> BertForSequenceClassification(
> (bert): BertModel(
> (embeddings): BertEmbeddings(
> (word_embeddings): Embedding(30522, 768, padding_idx=0)
> (position_embeddings): Embedding(512, 768)
> (token_type_embeddings): Embedding(2, 768)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (encoder): BertEncoder(
> (layer): ModuleList(
> (0): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (1): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (2): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (3): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (4): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (5): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (6): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (7): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (8): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (9): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (10): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (11): BertLayer(
> (attention): BertAttention(
> (self): BertSelfAttention(
> (query): Linear(in_features=768, out_features=768, bias=True)
> (key): Linear(in_features=768, out_features=768, bias=True)
> (value): Linear(in_features=768, out_features=768, bias=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> (output): BertSelfOutput(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> (intermediate): BertIntermediate(
> (dense): Linear(in_features=768, out_features=3072, bias=True)
> )
> (output): BertOutput(
> (dense): Linear(in_features=3072, out_features=768, bias=True)
> (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
> (dropout): Dropout(p=0.1, inplace=False)
> )
> )
> )
> )
> (pooler): BertPooler(
> (dense): Linear(in_features=768, out_features=768, bias=True)
> (activation): Tanh()
> )
> )
> (dropout): Dropout(p=0.1, inplace=False)
> (lin1): Linear(in_features=768, out_features=512, bias=True)
> (lin2): Linear(in_features=512, out_features=256, bias=True)
> (lin3): Linear(in_features=256, out_features=64, bias=True)
> (lin4): Linear(in_features=64, out_features=32, bias=True)
> (classifier): Linear(in_features=32, out_features=7, bias=True)
> )