Learner.split()

Hi,

How would you use Learner.split() to split a transformer?
My attempt is below, but I get the following error when I try it. Any help or advice would be greatly appreciated. Thanks!

>     ---------------------------------------------------------------------------
>     AttributeError                            Traceback (most recent call last)
>     /usr/local/lib/python3.6/dist-packages/IPython/core/formatters.py in __call__(self, obj)
>         697                 type_pprinters=self.type_printers,
>         698                 deferred_pprinters=self.deferred_printers)
>     --> 699             printer.pretty(obj)
>         700             printer.flush()
>         701             return stream.getvalue()

>     5 frames
>     /usr/local/lib/python3.6/dist-packages/fastai/core.py in func_args(func)
>         278 def func_args(func)->bool:
>         279     "Return the arguments of `func`."
>     --> 280     code = func.__code__
>         281     return code.co_varnames[:code.co_argcount]
>         282 
> 
>     AttributeError: 'method-wrapper' object has no attribute '__code__'

Here’s my model, which uses BERT from Hugging Face’s Transformers library:
import torch.nn as nn
from transformers import BertModel  # BERT from the Hugging Face library mentioned above

class BertForSequenceClassification(nn.Module):

    def __init__(self, num_labels):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.lin1 = nn.Linear(768, 512)
        self.lin2 = nn.Linear(512, 256)
        self.lin3 = nn.Linear(256, 64)
        self.lin4 = nn.Linear(64, 32)
        self.classifier = nn.Linear(32, 7)
        nn.init.xavier_normal_(self.lin1.weight)
        nn.init.xavier_normal_(self.lin2.weight)
        nn.init.xavier_normal_(self.lin3.weight)
        nn.init.xavier_normal_(self.lin4.weight)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = self.dropout(pooled_output)
        x = self.lin1(pooled_output)
        x = self.lin2(x)
        x = self.lin3(x)
        x = self.lin4(x)
        logits = self.classifier(x)
        return logits

custom_transformer_model = BertForSequenceClassification(7)

learner = Learner(
    databunch, custom_transformer_model,
    loss_func=loss_func, 
)

learner = learner.to_fp16()

I’m trying to split it as follows:

list_layers = [learner.model.bert.embeddings,
               learner.model.bert.encoder.layer[0],
               learner.model.bert.encoder.layer[1],
               learner.model.bert.encoder.layer[2],
               learner.model.bert.encoder.layer[3],
               learner.model.bert.encoder.layer[4],
               learner.model.bert.encoder.layer[5],
               learner.model.bert.encoder.layer[6],
               learner.model.bert.encoder.layer[7],
               learner.model.bert.encoder.layer[8],
               learner.model.bert.encoder.layer[9],
               learner.model.bert.encoder.layer[10],
               learner.model.bert.encoder.layer[11],
               learner.model.bert.pooler,
               learner.model.dropout,
               learner.model.lin1,
               learner.model.lin2,
               learner.model.lin3,
               learner.model.lin4,
               learner.model.classifier
]
               
learner.split(list_layers)
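
(I believe Learner.split can also take a function that returns layer groups, so another option I was considering is something like the sketch below. The grouping into three sets and the name bert_layer_groups are just my own illustration, not something from the fastai docs:)

def bert_layer_groups(model):
    # Group 1: embeddings plus the first half of the encoder layers.
    # Group 2: the second half of the encoder layers plus the pooler.
    # Group 3: the new classification head.
    return [[model.bert.embeddings] + list(model.bert.encoder.layer[:6]),
            list(model.bert.encoder.layer[6:]) + [model.bert.pooler],
            [model.lin1, model.lin2, model.lin3, model.lin4, model.classifier]]

learner.split(bert_layer_groups)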

Also, here’s the full model printout in case that helps:

> BertForSequenceClassification(
>   (bert): BertModel(
>     (embeddings): BertEmbeddings(
>       (word_embeddings): Embedding(30522, 768, padding_idx=0)
>       (position_embeddings): Embedding(512, 768)
>       (token_type_embeddings): Embedding(2, 768)
>       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>       (dropout): Dropout(p=0.1, inplace=False)
>     )
>     (encoder): BertEncoder(
>       (layer): ModuleList(
>         (0): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (1): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (2): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (3): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (4): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (5): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (6): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (7): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (8): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (9): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (10): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>         (11): BertLayer(
>           (attention): BertAttention(
>             (self): BertSelfAttention(
>               (query): Linear(in_features=768, out_features=768, bias=True)
>               (key): Linear(in_features=768, out_features=768, bias=True)
>               (value): Linear(in_features=768, out_features=768, bias=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>             (output): BertSelfOutput(
>               (dense): Linear(in_features=768, out_features=768, bias=True)
>               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>               (dropout): Dropout(p=0.1, inplace=False)
>             )
>           )
>           (intermediate): BertIntermediate(
>             (dense): Linear(in_features=768, out_features=3072, bias=True)
>           )
>           (output): BertOutput(
>             (dense): Linear(in_features=3072, out_features=768, bias=True)
>             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
>             (dropout): Dropout(p=0.1, inplace=False)
>           )
>         )
>       )
>     )
>     (pooler): BertPooler(
>       (dense): Linear(in_features=768, out_features=768, bias=True)
>       (activation): Tanh()
>     )
>   )
>   (dropout): Dropout(p=0.1, inplace=False)
>   (lin1): Linear(in_features=768, out_features=512, bias=True)
>   (lin2): Linear(in_features=512, out_features=256, bias=True)
>   (lin3): Linear(in_features=256, out_features=64, bias=True)
>   (lin4): Linear(in_features=64, out_features=32, bias=True)
>   (classifier): Linear(in_features=32, out_features=7, bias=True)
> )

Hello @bluteaur,

I tried to reproduce your error, but it’s working perfectly on my side.
Did you resolve the problem?

@maroberti

So I figured out that the problem is specific to Jupyter notebooks. It works perfectly as a script, so that’s where I continued testing.
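
In case it helps anyone else: the traceback goes through IPython's formatter (printer.pretty(obj)), so my guess (not verified) is that the notebook only fails when it tries to auto-display whatever learner.split() returns. Since split updates the learner in place, the return value isn't needed, and suppressing the cell output should sidestep the repr:

# Guess: only the auto-display of the returned object triggers the error,
# so keep it out of the cell output. split() modifies the learner in place.
learner.split(list_layers);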
