class XResNet(nn.Sequential):
    "XResNet backbone: a 3-conv stem, max-pool, residual stages, then a pooled linear head."
    def __init__(self, expansion, layers, c_in=3, c_out=1000, sa=False, sym=False, act_cls=defaults.activation):
        # Stem channel progression; wider first conv when the input already has 3+ channels.
        stem_szs = [c_in,16,32,64] if c_in<3 else [c_in,32,64,64]
        stem = [ConvLayer(stem_szs[i], stem_szs[i+1], stride=2 if i==0 else 1, act_cls=act_cls)
                for i in range(3)]
        # Per-stage channel counts; extra stages beyond four get 256 filters each.
        block_szs = [64//expansion,64,128,256,512] +[256]*(len(layers)-4)
        blocks = []
        for i, depth in enumerate(layers):
            blocks.append(self._make_layer(
                expansion, block_szs[i], block_szs[i+1], depth,
                1 if i==0 else 2,
                # self-attention only on this one stage (when sa=True)
                sa=sa if i == (len(layers)-4) else False,
                sym=sym, act_cls=act_cls))
        super().__init__(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *blocks,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(block_szs[-1]*expansion, c_out),
        )
        init_cnn(self)

    def _make_layer(self, expansion, ni, nf, blocks, stride, sa, sym, act_cls):
        "Stack `blocks` ResBlocks: `ni`/`stride` apply to the first, `sa` to the last."
        stage = []
        for i in range(blocks):
            stage.append(ResBlock(
                expansion, ni if i==0 else nf, nf,
                stride if i==0 else 1,
                sa if i == (blocks-1) else False,
                sym=sym, act_cls=act_cls))
        return nn.Sequential(*stage)
The main changes are the filter sizes (32,64,64), changing the self-attention layer conditions, and including the activation function in the ConvLayer.
We could get relatively stable results. Let me know if you'd rather I put in a PR to the repo.
Thinking about it, at layer 2, 32 filters actually seems too low. The receptive field is effectively 5x5 from the input, and with 3 channels that's 5x5x3=75; 64 still seems like a lot. I wonder if 48 is actually the right number — i.e. 32,48,64. Maybe try that too?
Right as I finished! I was still able to get 72%(ish) with those filters. Still not the 75-76% we can get by directly porting over the code. I wonder if perhaps you could look at something for me. They look close to the exact same, but are they both essentially equivalent?
class ConvLayer(nn.Sequential):
    "Create a sequence of convolutional (`ni` to `nf`), ReLU (if `use_activ`) and `norm_type` layers."
    def __init__(self, ni, nf, ks=3, stride=1, padding=None, bias=None, ndim=2, norm_type=NormType.Batch, bn_1st=True,
                 act_cls=defaults.activation, transpose=False, init=nn.init.kaiming_normal_, xtra=None, **kwargs):
        # Default to "same"-style padding for regular convs, none for transposed ones.
        if padding is None:
            padding = 0 if transpose else (ks-1)//2
        use_bn = norm_type in (NormType.Batch, NormType.BatchZero)
        # With batchnorm the conv bias is redundant, so drop it unless asked for.
        if bias is None:
            bias = not use_bn
        conv_func = _conv_func(ndim, transpose=transpose)
        conv = init_default(
            conv_func(ni, nf, kernel_size=ks, bias=bias, stride=stride, padding=padding, **kwargs),
            init)
        if norm_type==NormType.Weight:
            conv = weight_norm(conv)
        elif norm_type==NormType.Spectral:
            conv = spectral_norm(conv)
        layers = [conv]
        # Activation and norm; `bn_1st` puts the norm before the activation.
        post = [] if act_cls is None else [act_cls()]
        if use_bn:
            post.append(BatchNorm(nf, norm_type=norm_type, ndim=ndim))
        if bn_1st:
            post.reverse()
        layers += post
        if xtra: layers.append(xtra)
        super().__init__(*layers)
Vs
def conv1d(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False):
    "Create and initialize a `nn.Conv1d` layer with spectral normalization."
    layer = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)
    # Kaiming init before wrapping, so spectral norm sees the initialized weight.
    nn.init.kaiming_normal_(layer.weight)
    if bias:
        layer.bias.data.zero_()
    return spectral_norm(layer)
def conv(ni, nf, ks=3, stride=1, bias=False):
    "Plain 2d conv from `ni` to `nf` channels with `ks//2` padding (spatial size preserved at stride 1)."
    return nn.Conv2d(ni, nf, kernel_size=ks, padding=ks//2, stride=stride, bias=bias)
def noop(x):
    "Identity: return `x` unchanged."
    return x
def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    "conv -> batchnorm (gamma zeroed when `zero_bn`) -> optional `act_fn`."
    norm = nn.BatchNorm2d(nf)
    # Zero-init gamma makes a residual branch start as identity.
    nn.init.constant_(norm.weight, 0. if zero_bn else 1.)
    parts = [conv(ni, nf, ks, stride=stride), norm]
    if act:
        parts.append(act_fn)
    return nn.Sequential(*parts)
In other news, I'm considering redoing Imagenette and Imagewoof to make the train/val split 50/50 (but using the same total size of train+val). The idea being that this would largely avoid the need for averaging multiple runs (bigger val set), and we can experiment with data augmentation usefully with fewer epochs (smaller train set). It would mean creating a new leaderboard, but I think it's worth that one-time cost, personally… Any other reasons this might be a bad idea?
Seems like a great idea to me! I'd still (personally) expect some repeated runs (maybe 3-4) just to see how the variance is, but I expect it to be much lower than what we've been seeing so far.
They are in fact the exact same. So now we've figured out that the architectures are indeed the same, and those changes to the architecture are it. We're going back to the optimizer now to be sure; otherwise we'll keep scratching our heads.
What we found is that if we just plug our architecture into Less's code we are able to get ~75/76% (that uses fastai v1 and his implementation of Ranger). I got results that are even a little bit better than the current leaderboard. That means the problem is not in the arch anymore.
My current suspects are the optimizer itself (one subtle difference is that Less is using a RAdam threshold of 5; I've made a PR for that) and the RRC transform, which is always dropping my accuracy compared to a simple resize.
As I mentioned on the PR, this doesn't change anything apart from one iteration in training, so I don't think this has any link with your results. Feel free to use a separate implementation to compare, but theoretically it doesn't really make sense to me that the difference would come from that.
I'm happy to investigate where the difference could come from; just give me two minimal implementations that show it (one getting to the 75/76% in v1 and one in v2).