I am having trouble understanding how to label my data.
My inputs are spectrogram images of length 240 (frames), and my desired output is a length-240 array of float values, one per frame. Let’s assume I am trying to predict the amplitude of the spectrogram at each frame.
I have created a model with a custom head:
Sequential(
(0): AdaptiveConcatPool2d(
(ap): AdaptiveAvgPool2d(output_size=1)
(mp): AdaptiveMaxPool2d(output_size=1)
)
(1): Flatten()
(2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(3): Dropout(p=0.25, inplace=False)
(4): Linear(in_features=1024, out_features=256, bias=True)
(5): ReLU(inplace=True)
(6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(7): Dropout(p=0.5, inplace=False)
(8): Linear(in_features=256, out_features=240, bias=True)
)
I have a df containing columns [Audio File, Amplitude]. Each cell in Amplitude contains a list of float values.
I have an AudioList:
class AudioList(ItemList):
    """`ItemList` whose items are audio clips, served as `AudioItem` objects.

    Each entry of `self.items` is either a path (`Path` or `str`) to an audio
    file, or a `(data, sr)` pair held in a tuple or numpy array.
    """

    _bunch = AudioDataBunch

    # TODO: __REPR__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get(self, i):
        """Return the `i`-th item wrapped in an `AudioItem`.

        Paths are loaded with `AudioData.load`; `(data, sr)` pairs are passed
        straight to the `AudioData` constructor.

        Raises:
            RuntimeError: if the stored item is neither a path nor a
                `(data, sr)` pair.
        """
        item = self.items[i]
        if isinstance(item, (Path, str)):
            return AudioItem(AudioData.load(str(item)))
        if isinstance(item, (tuple, np.ndarray)):  # data,sr
            return AudioItem(AudioData(item[0], item[1]))
        # A bare `raise` with no active exception only produces
        # "RuntimeError: No active exception to reraise"; raise the same
        # exception type explicitly so the message names the real problem.
        raise RuntimeError(
            f'Format not supported! Got item of type {type(item).__name__}'
        )

    def reconstruct(self, t:Tensor):
        """Rebuild a displayable item from tensor `t` (dims 1 and 2 swapped)."""
        return Image(t.transpose(1,2))  # FIXME!! No Image here

    def hear_xys(self, xs, ys, **kwargs):
        """Call `x.hear` on every input, using the paired `y` as its title."""
        for x, y in zip(xs, ys):
            x.hear(title=y, **kwargs)

    # TODO: example with from_folder
    @classmethod
    def from_folder(cls, path:PathOrStr='.', extensions:Collection[str]=None, **kwargs)->ItemList:
        """Build the list from the files under `path`.

        When `extensions` is None, `AUDIO_EXTENSIONS` is used instead; all
        other keyword arguments are forwarded to the parent `from_folder`.
        """
        extensions = ifnone(extensions, AUDIO_EXTENSIONS)
        return super().from_folder(path=path, extensions=extensions, **kwargs)
I am trying to create a DataBunch with the data block API:
def label_func(audio_file):
    """Return the target values stored in `df` for one audio file.

    Args:
        audio_file: path of the audio file (a string or path-like object);
            only its final component is used for the lookup.

    Returns:
        The 'jawTrans_ty' cell of the first row of the module-level `df`
        whose 'Audio File' entry contains the file name.

    Raises:
        IndexError: if no row of `df` mentions the file name.
    """
    import os  # function-scope import keeps this block self-contained

    # str() lets Path objects through as well as plain strings.
    filename = os.path.basename(str(audio_file))
    # regex=False: a file name is a literal, so '.' must not match any char;
    # na=False: rows with a missing file name simply do not match.
    mask = df['Audio File'].str.contains(filename, regex=False, na=False)
    matches = df.loc[mask, 'jawTrans_ty']
    if matches.empty:
        raise IndexError(f'no row in df matches audio file {filename!r}')
    # The original computed this value but never returned it, so every
    # label came back as None.
    return matches.values[0]
# Assemble the training data with the data block API, in four chained steps:
#   1. take the items from the 'Audio File' column of `df`, with `audio_data`
#      passed as the base path;
#   2. split the items randomly into training and validation sets
#      (NOTE(review): no arguments are passed, so the library defaults for
#      the split fraction and seed apply — confirm these are what you want);
#   3. label each item by calling `label_func` on it, storing the labels in a
#      `FloatList` (i.e. treating the task as regression);
#   4. bundle everything into a databunch with batch size `bs`.
data = (AudioList.from_df(df, path=audio_data, cols='Audio File')
.split_by_rand_pct()
.label_from_func(label_func, label_cls=FloatList)
.databunch(bs=bs))
This does not work, apparently because FloatList expects a single float value for each image to regress to.
How can I label my data so that the model is trained against a target with one value per spectrogram frame (240 floats per image)?