In PyTorch I can write:
nn.Linear(emb, emb * heads, bias=False)
Any idea how to do this in S4TF?
Thank you
It looks like their Dense layer (and other layer types, such as convolutional layers) currently always incorporates a bias. Perhaps they will change this in a later version. In the meantime, you could take their Dense layer and create your own version without a bias, like this:
/// A densely-connected layer without a bias term, computing
/// `activation(matmul(input, weight))`.
public struct DenseNoBias<Scalar: TensorFlowFloatingPoint>: Layer {
    /// The element-wise activation function type.
    public typealias Activation = @differentiable (Tensor<Scalar>) -> Tensor<Scalar>

    /// The weight matrix.
    public var weight: Tensor<Scalar>
    /// The element-wise activation function.
    @noDerivative public let activation: Activation
    /// Indicates whether this is a batched dense layer (i.e. the weight has rank 3).
    @noDerivative internal let batched: Bool

    /// Creates a layer from the given weight and activation function.
    ///
    /// - Parameters:
    ///   - weight: The weight tensor; rank 2, or rank 3 for the batched case.
    ///   - activation: The element-wise activation function.
    public init(
        weight: Tensor<Scalar>,
        activation: @escaping Activation
    ) {
        precondition(weight.rank <= 3, "The rank of the 'weight' tensor must be less than 4.")
        self.weight = weight
        self.activation = activation
        self.batched = weight.rank == 3
    }

    /// Returns the output obtained from applying the layer to the given input.
    ///
    /// - Parameter input: The input to the layer.
    /// - Returns: The output.
    @differentiable
    public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        if batched {
            // Batched case: lift the input to rank 3, multiply, then drop the
            // inserted dimension again before applying the activation.
            let projected = matmul(input.expandingShape(at: 1), weight)
            return activation(projected.squeezingShape(at: 1))
        } else {
            return activation(matmul(input, weight))
        }
    }
}
public extension DenseNoBias {
    /// Creates a `DenseNoBias` layer with the specified input size, output size, and
    /// element-wise activation function. The weight matrix is created with shape
    /// `[inputSize, outputSize]`; no bias vector is created.
    ///
    /// - Parameters:
    ///   - inputSize: The dimensionality of the input space.
    ///   - outputSize: The dimensionality of the output space.
    ///   - activation: The activation function to use. The default value is `identity(_:)`.
    ///   - weightInitializer: Initializer to use for `weight`.
    init(
        inputSize: Int,
        outputSize: Int,
        activation: @escaping Activation = identity,
        weightInitializer: ParameterInitializer<Scalar> = glorotUniform()
    ) {
        self.init(
            weight: weightInitializer([inputSize, outputSize]),
            activation: activation)
    }
}
Thank you. This is very helpful.
I tried to implement this as a special case of Dense, but that didn’t work, because things like
public var bias: Tensor<Scalar>?
are not differentiable.
If you want a general one, you could do it this way:
/// A densely-connected layer that computes `activation(matmul(input, weight) + bias)`,
/// with an option to skip the bias term entirely.
public struct Dense<Scalar: TensorFlowFloatingPoint>: Layer {
    /// The element-wise activation function type.
    public typealias Activation = @differentiable (Tensor<Scalar>) -> Tensor<Scalar>

    /// The weight matrix.
    public var weight: Tensor<Scalar>
    /// The bias vector.
    public var bias: Tensor<Scalar>
    /// The element-wise activation function.
    @noDerivative public let activation: Activation
    /// Indicates whether this is a batched dense layer (i.e. the weight has rank 3).
    @noDerivative internal let batched: Bool
    /// Indicates whether to use bias or not.
    @noDerivative public let useBias: Bool

    /// Creates a layer from the given weight, bias, and activation function.
    ///
    /// - Parameters:
    ///   - weight: The weight tensor; rank 2, or rank 3 for the batched case.
    ///   - bias: The bias tensor; rank at most 2.
    ///   - activation: The element-wise activation function.
    ///   - useBias: Whether `bias` is added to the result of the matrix product.
    public init(
        weight: Tensor<Scalar>,
        bias: Tensor<Scalar>,
        activation: @escaping Activation,
        useBias: Bool = true
    ) {
        precondition(weight.rank <= 3, "The rank of the 'weight' tensor must be less than 4.")
        precondition(bias.rank <= 2, "The rank of the 'bias' tensor must be less than 3.")
        self.weight = weight
        self.bias = bias
        self.activation = activation
        self.batched = weight.rank == 3
        self.useBias = useBias
    }

    /// Returns the output obtained from applying the layer to the given input.
    ///
    /// - Parameter input: The input to the layer.
    /// - Returns: The output.
    @differentiable
    public func callAsFunction(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        let projected: Tensor<Scalar>
        if batched {
            // Batched case: lift the input to rank 3 for the rank-3 weight,
            // then drop the inserted dimension again.
            projected = matmul(input.expandingShape(at: 1), weight)
                .squeezingShape(at: 1)
        } else {
            projected = matmul(input, weight)
        }
        if useBias {
            return activation(projected + bias)
        }
        return activation(projected)
    }
}
public extension Dense {
    /// Creates a `Dense` layer with the specified input size, output size, and element-wise
    /// activation function. The weight matrix is created with shape `[inputSize, outputSize]` and
    /// the bias vector is created with shape `[outputSize]`.
    ///
    /// - Parameters:
    ///   - inputSize: The dimensionality of the input space.
    ///   - outputSize: The dimensionality of the output space.
    ///   - activation: The activation function to use. The default value is `identity(_:)`.
    ///   - weightInitializer: Initializer to use for `weight`.
    ///   - biasInitializer: Initializer to use for `bias`.
    ///   - useBias: Whether the bias is added in the forward pass. Note that the bias
    ///     tensor is still allocated even when this is `false`.
    init(
        inputSize: Int,
        outputSize: Int,
        activation: @escaping Activation = identity,
        weightInitializer: ParameterInitializer<Scalar> = glorotUniform(),
        biasInitializer: ParameterInitializer<Scalar> = zeros(),
        useBias: Bool = true
    ) {
        self.init(
            weight: weightInitializer([inputSize, outputSize]),
            bias: biasInitializer([outputSize]),
            activation: activation,
            useBias: useBias)
    }
}
Yes, but this still allocates a tensor for the bias when useBias = false.
public extension Dense {
    /// Creates a `Dense` layer with the specified input size, output size, and element-wise
    /// activation function. The weight matrix is created with shape `[inputSize, outputSize]`.
    /// When `useBias` is `true` the bias vector is created with shape `[outputSize]`;
    /// otherwise a zero-element tensor is stored so no bias parameter is materialized.
    ///
    /// - Parameters:
    ///   - inputSize: The dimensionality of the input space.
    ///   - outputSize: The dimensionality of the output space.
    ///   - activation: The activation function to use. The default value is `identity(_:)`.
    ///   - weightInitializer: Initializer to use for `weight`.
    ///   - biasInitializer: Initializer to use for `bias` (ignored when `useBias` is `false`).
    ///   - useBias: Whether the bias is added in the forward pass.
    init(
        inputSize: Int,
        outputSize: Int,
        activation: @escaping Activation = identity,
        weightInitializer: ParameterInitializer<Scalar> = glorotUniform(),
        biasInitializer: ParameterInitializer<Scalar> = zeros(),
        useBias: Bool = true
    ) {
        // An empty (shape `[0]`) tensor avoids allocating an unused bias parameter,
        // which `biasInitializer([1])` would still do.
        let bias = useBias
            ? biasInitializer([outputSize])
            : Tensor<Scalar>(zeros: [0])
        self.init(
            weight: weightInitializer([inputSize, outputSize]),
            bias: bias,
            activation: activation,
            useBias: useBias)
    }
}
The above snippet may work as a workaround. I'm not sure whether it's possible to create an empty tensor in TF, but if so, that would be cleaner:
let bias = useBias ? biasInitializer([outputSize]) : Tensor<Scalar>(shape: [0], repeating: Scalar(0))
Excellent, thank you