# Copyright 2023 Calico LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================
import pdb
import sys
from typing import Optional, List
import numpy as np
import tensorflow as tf
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
tf.config.experimental.set_memory_growth(device, True)
#####################
# transfer learning #
#####################
class IA3(tf.keras.layers.Layer):
    # https://arxiv.org/pdf/2205.05638.pdf
    # IA3 module for attention layers: rescales the layer output.
def __init__(self, original_layer, trainable=False, **kwargs):
# keep the name of this layer the same as the original dense layer.
original_layer_config = original_layer.get_config()
name = original_layer_config["name"]
kwargs.pop("name", None)
super().__init__(name=name, trainable=trainable, **kwargs)
self.output_dim = original_layer_config["units"]
self.original_layer = original_layer
self.original_layer.trainable = False
        # IA3 weights, kept in a Dense layer so their trainability can be set independently
self._ia3_layer = tf.keras.layers.Dense(
units=self.output_dim,
use_bias=False,
kernel_initializer=tf.keras.initializers.Ones(),
trainable=True,
name="ia3",
)
    def call(self, inputs):
        original_output = self.original_layer(inputs)
        # passing a constant 1 through the bias-free "ia3" dense layer returns
        # its kernel row, i.e. the learned per-unit output scaling vector
        scaler = self._ia3_layer(tf.constant([[1]], dtype="float64"))[0]
        return original_output * scaler
def get_config(self):
config = super().get_config().copy()
config.update(
{
"size": self.output_dim,
}
)
return config
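
# Minimal usage sketch for IA3, assuming a standalone Dense layer (sizes and
# names below are illustrative). Only the 1 x units "ia3" kernel is trainable;
# the wrapped kernel/bias stay frozen, and since the scales start at one the
# wrapper initially reproduces the original layer.
def _ia3_example():
    dense = tf.keras.layers.Dense(units=8, name="attn_value")
    wrapped = IA3(dense, trainable=True)
    x = tf.random.normal((2, 16))
    return wrapped(x)  # dense(x), elementwise-rescaled by the learned vector
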
class IA3_ff(tf.keras.layers.Layer):
    # https://arxiv.org/pdf/2205.05638.pdf
    # IA3 module for the down-projection feed-forward layer: rescales the layer input.
def __init__(self, original_layer, trainable=False, **kwargs):
# keep the name of this layer the same as the original dense layer.
original_layer_config = original_layer.get_config()
name = original_layer_config["name"]
kwargs.pop("name", None)
super().__init__(name=name, trainable=trainable, **kwargs)
self.input_dim = original_layer.input_shape[-1]
self.original_layer = original_layer
self.original_layer.trainable = False
        # IA3 weights, kept in a Dense layer so their trainability can be set independently
self._ia3_layer = tf.keras.layers.Dense(
units=self.input_dim,
use_bias=False,
kernel_initializer=tf.keras.initializers.Ones(),
trainable=True,
name="ia3_ff",
)
    def call(self, inputs):
        # recover the learned per-feature scaling vector and rescale the input
        # before it enters the frozen down-projection layer
        scaler = self._ia3_layer(tf.constant([[1]], dtype="float64"))[0]
        return self.original_layer(inputs * scaler)
def get_config(self):
config = super().get_config().copy()
config.update({"size": self.input_dim})
return config
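
# Minimal usage sketch for IA3_ff (sizes and names are illustrative). The
# wrapped layer must be connected once, e.g. via the functional API, because
# __init__ reads original_layer.input_shape; the learned vector then rescales
# the layer input rather than its output.
def _ia3_ff_example():
    hidden = tf.keras.Input(shape=(16,))
    ff_down = tf.keras.layers.Dense(units=4, name="ff_down")
    ff_down(hidden)  # connect so that input_shape is defined before wrapping
    wrapped = IA3_ff(ff_down, trainable=True)
    x = tf.random.normal((2, 16))
    return wrapped(x)  # ff_down(x * learned_scale), scale starts at one
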
class Lora(tf.keras.layers.Layer):
# adapted from:
# https://arxiv.org/abs/2106.09685
# https://keras.io/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/
# https://github.com/Elvenson/stable-diffusion-keras-ft/blob/main/layers.py
def __init__(self, original_layer, rank=8, alpha=16, trainable=False, **kwargs):
# keep the name of this layer the same as the original dense layer.
original_layer_config = original_layer.get_config()
name = original_layer_config["name"]
kwargs.pop("name", None)
super().__init__(name=name, trainable=trainable, **kwargs)
self.output_dim = original_layer_config["units"]
        if rank > self.output_dim:
            raise ValueError(
                f"LoRA rank {rank} must be less than or equal to {self.output_dim}"
            )
self.rank = rank
self.alpha = alpha
self.scale = alpha / rank
self.original_layer = original_layer
self.original_layer.trainable = False
# Note: the original paper mentions that normal distribution was
# used for initialization. However, the official LoRA implementation
# uses "Kaiming/He Initialization".
self.down_layer = tf.keras.layers.Dense(
units=rank,
use_bias=False,
kernel_initializer=tf.keras.initializers.HeUniform(),
# kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1 / self.rank),
trainable=True,
name="lora_a",
)
self.up_layer = tf.keras.layers.Dense(
units=self.output_dim,
use_bias=False,
kernel_initializer=tf.keras.initializers.Zeros(),
trainable=True,
name="lora_b",
)
def call(self, inputs):
original_output = self.original_layer(inputs)
lora_output = self.up_layer(self.down_layer(inputs)) * self.scale
return original_output + lora_output
def get_config(self):
config = super().get_config().copy()
config.update({"rank": self.rank, "alpha": self.alpha})
return config
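
# Minimal usage sketch for Lora (sizes and names are illustrative). The wrapper
# computes W x + (alpha / rank) * B(A(x)), where A ("lora_a") projects to `rank`
# dimensions and B ("lora_b") projects back; because B is zero-initialized the
# wrapped layer starts out identical to the original.
def _lora_example():
    dense = tf.keras.layers.Dense(units=64, name="attn_query")
    wrapped = Lora(dense, rank=8, alpha=16, trainable=True)
    x = tf.random.normal((2, 32))
    return wrapped(x)  # dense(x) until "lora_b" departs from zero
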
class Locon(tf.keras.layers.Layer):
    # LoRA for convolution layers (LoCon), adapted from:
    # https://arxiv.org/pdf/2309.14859#page=23.84
    # https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/modules/locon.py
    # uses the default LoCon rank and alpha
def __init__(self, original_layer, rank=4, alpha=1, trainable=False, **kwargs):
# keep the name of this layer the same as the original conv layer.
original_layer_config = original_layer.get_config()
name = original_layer_config["name"]
kwargs.pop("name", None)
super().__init__(name=name, trainable=trainable, **kwargs)
self.input_dim = original_layer.input_shape[-1]
self.output_dim = original_layer_config["filters"]
        if rank > self.output_dim:
            raise ValueError(
                f"LoRA rank {rank} must be less than or equal to {self.output_dim}"
            )
self.rank = rank
self.alpha = alpha
self.scale = alpha / rank
self.original_layer = original_layer
self.original_layer.trainable = False
input_dim = original_layer.input_shape[-1]
output_dim = original_layer_config["filters"]
kernel_size = original_layer_config["kernel_size"][0]
stride = original_layer_config["strides"][0]
dilation_rate = original_layer_config["dilation_rate"][0]
# Note: the original paper mentions that normal distribution was
# used for initialization. However, the official LoRA implementation
# uses "Kaiming/He Initialization".
self.down_layer = tf.keras.layers.Conv1D(
filters=rank,
kernel_size=kernel_size,
strides=stride,
padding="same",
use_bias=False,
dilation_rate=dilation_rate,
kernel_initializer=tf.keras.initializers.HeUniform(),
name="locon_down",
)
self.up_layer = tf.keras.layers.Conv1D(
filters=output_dim,
kernel_size=1,
            strides=1,  # the down projection already applies the original stride
padding="same",
use_bias=False,
kernel_initializer=tf.keras.initializers.Zeros(),
name="locon_up",
)
def call(self, inputs):
original_output = self.original_layer(inputs)
lora_output = self.up_layer(self.down_layer(inputs)) * self.scale
return original_output + lora_output
def get_config(self):
config = super().get_config().copy()
config.update({"rank": self.rank, "alpha": self.alpha})
return config
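
# Minimal usage sketch for Locon (sizes and names are illustrative). The same
# low-rank idea applied to Conv1D: "locon_down" copies the original kernel
# size/stride/dilation and maps to `rank` filters, and "locon_up" is a 1x1 conv
# back to the original filter count. The conv must be connected once (here via
# the functional API) so that original_layer.input_shape is defined before wrapping.
def _locon_example():
    seq = tf.keras.Input(shape=(128, 4))
    conv = tf.keras.layers.Conv1D(filters=32, kernel_size=5, padding="same", name="conv1")
    conv(seq)  # connect so that input_shape is defined before wrapping
    wrapped = Locon(conv, rank=4, alpha=1, trainable=True)
    x = tf.random.normal((2, 128, 4))
    return wrapped(x)  # conv(x) until "locon_up" departs from zero
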
class AdapterHoulsby(tf.keras.layers.Layer):
# https://arxiv.org/abs/1902.00751
# adapted from: https://github.com/jain-harshil/Adapter-BERT
def __init__(self, latent_size, activation=tf.keras.layers.ReLU(), **kwargs):
super(AdapterHoulsby, self).__init__(**kwargs)
self.latent_size = latent_size
self.activation = activation
def build(self, input_shape):
self.down_project = tf.keras.layers.Dense(
units=self.latent_size,
activation="linear",
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3),
bias_initializer="zeros",
name="adapter_down",
)
self.up_project = tf.keras.layers.Dense(
units=input_shape[-1],
activation="linear",
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1e-3),
bias_initializer="zeros",
name="adapter_up",
)
def call(self, inputs):
projected_down = self.down_project(inputs)
activated = self.activation(projected_down)
projected_up = self.up_project(activated)
output = projected_up + inputs
return output
def get_config(self):
config = super().get_config().copy()
config.update({"latent_size": self.latent_size, "activation": self.activation})
return config
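
# Minimal usage sketch for AdapterHoulsby (sizes are illustrative). The adapter
# is a bottleneck (down-project -> nonlinearity -> up-project) with a residual
# connection; with the near-zero initialization above it starts close to the
# identity, so it can be inserted into a pretrained network without disturbing it.
def _adapter_example():
    adapter = AdapterHoulsby(latent_size=8)
    x = tf.random.normal((2, 64))
    return adapter(x)  # approximately equal to x at initialization
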