0925 use custom x_transformers for easier development

Amadeus/custom_attend.py (new file, 556 lines)
@@ -0,0 +1,556 @@
from __future__ import annotations

from functools import partial
from typing import Tuple, Callable

import torch
from torch.nn import Module, Parameter
from torch import cat, nn, einsum, Tensor
import torch.nn.functional as F

from collections import namedtuple
from functools import wraps
from packaging import version
from dataclasses import dataclass

from einops import rearrange, repeat, pack, unpack

# constants

@dataclass
class Intermediates:
    qk_similarities: Tensor | None = None
    pre_softmax_attn: Tensor | None = None
    post_softmax_attn: Tensor | None = None
    values: Tensor | None = None
    cached_kv: tuple[Tensor, Tensor] | None = None
    layer_type: str | None = None
    hybrid_hidden: Tensor | None = None

    def to_tuple(self):
        return (self.qk_similarities, self.pre_softmax_attn, self.post_softmax_attn)

# helpers

def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

def at_most_one_of(*bools):
    return sum([*map(int, bools)]) <= 1

def compact(arr):
    return [*filter(exists, arr)]

@torch.jit.script
def softclamp(t: Tensor, value: float):
    return (t / value).tanh() * value

def pack_one(t, pattern):
    return pack([t], pattern)

def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]

def once(fn):
    called = False

    @wraps(fn)
    def inner(x):
        nonlocal called
        if called:
            return
        called = True
        return fn(x)
    return inner

print_once = once(print)

# selective attention
# https://arxiv.org/abs/2410.02703 - section 3.3
# it is a technique to allow each token to prevent itself from being attended to by future tokens
# if sim_head_gate not supplied, will use the first head of the attention logits (sim in this framework)

def selective_attn(
    sim,
    sim_head_gate = None,
    no_mask_sos = True
):
    i, j, device = *sim.shape[-2:], sim.device
    sim_head_gate = default(sim_head_gate, sim[:, 0])

    gate = F.relu(sim_head_gate) # only positive

    if no_mask_sos:
        gate = gate.clone()
        gate[..., -i] = 0.

    eye = torch.eye(i, device = device)

    if j > i:
        eye = F.pad(eye, (j - i, 0), value = 1.)

    gate = (1. - eye) * gate
    gate = F.pad(gate, (0, 0, 1, -1), value = 0.) # only allow for masking the future
    gate = gate.cumsum(dim = -2)

    return sim - rearrange(gate, 'b i j -> b 1 i j')

# alternative distance functions

def qk_l2_dist_squared(q, k):
    if k.ndim == 3:
        k = repeat(k, 'b j d -> b h j d', h = q.shape[1])

    q, packed_shape = pack_one(q, '* i d')
    k, _ = pack_one(k, '* j d')

    l2_dist_squared = torch.cdist(q, k) ** 2
    return unpack_one(l2_dist_squared, packed_shape, '* i j')

# one-hot straight through softmax

def one_hot_straight_through(logits, temperature = 1.):
    one_hot_indices = logits.argmax(dim = -1, keepdim = True)
    one_hot = torch.zeros_like(logits).scatter(-1, one_hot_indices, 1.)

    soft_attn = (logits / temperature).softmax(dim = -1)
    return one_hot + soft_attn - soft_attn.detach()

# sparse topk attention - only keep topk attn logits for softmax
# optional straight through with masked out logits by setting `attn_sparse_topk_straight_through = True`

def sparse_topk_attn(
    logits,
    sparse_topk,
    temperature = 1.,
    straight_through = False
):
    orig_logits = logits

    mask_value = -torch.finfo(logits.dtype).max
    top_values, _ = logits.topk(sparse_topk, dim = -1)
    sparse_topk_mask = (logits >= top_values[..., -1:]) & (logits > mask_value)
    logits = logits.masked_fill(~sparse_topk_mask, mask_value)
    topk_attn = logits.softmax(dim = -1)

    if not straight_through:
        return topk_attn

    soft_attn = (orig_logits / temperature).softmax(dim = -1)
    return topk_attn.detach() + soft_attn - soft_attn.detach()

# functions for creating causal mask
# need a special one for onnx cpu (no support for .triu)

def create_causal_mask(i, j, device):
    return torch.ones((i, j), device = device, dtype = torch.bool).triu(j - i + 1)

def onnx_create_causal_mask(i, j, device):
    r = torch.arange(i, device = device)
    causal_mask = rearrange(r, 'i -> i 1') < rearrange(r, 'j -> 1 j')
    causal_mask = F.pad(causal_mask, (j - i, 0), value = False)
    return causal_mask

# main class

class Attend(Module):
    def __init__(
        self,
        *,
        dropout = 0.,
        causal = False,
        heads = None,
        pre_talking_heads = False,
        post_talking_heads = False,
        pre_scale_post_talking_heads = False,
        sparse_topk = None,
        sparse_topk_straight_through = False, # https://arxiv.org/abs/2505.22074
        scale = None,
        qk_norm = False,
        l2_distance = False,
        sigmoid = False,
        custom_attn_fn: Callable | None = None,
        flash = False,
        softclamp_logits = False,
        logit_softclamp_value = 50.,
        add_zero_kv = False,
        head_learned_sink = False,
        selective = False,
        hard = False,
        cope = None,
        onnxable = False,
        sdp_kwargs: dict = dict(
            enable_flash = True,
            enable_math = True,
            enable_mem_efficient = True
        )
    ):
        super().__init__()
        self.scale = scale

        # causal related

        self.causal = causal
        self.create_causal_mask = onnx_create_causal_mask if onnxable else create_causal_mask

        # attention type

        is_sparse_topk_attn = exists(sparse_topk)

        assert not (flash and sigmoid), 'sigmoid attention not available for flash'
        assert not (flash and hard), 'hard attention not available for flash'
        assert not (flash and is_sparse_topk_attn), 'topk attention not available for flash'

        assert at_most_one_of(sigmoid, hard, l2_distance, is_sparse_topk_attn)

        if exists(custom_attn_fn):
            self.attn_fn = custom_attn_fn
        elif sigmoid:
            self.attn_fn = F.sigmoid
        elif hard:
            self.attn_fn = one_hot_straight_through
        elif is_sparse_topk_attn:
            self.attn_fn = partial(sparse_topk_attn, sparse_topk = sparse_topk, straight_through = sparse_topk_straight_through)
        else:
            softmax_fn = partial(F.softmax, dim = -1)
            self.attn_fn = partial(softmax_fn, dtype = torch.float32) if not qk_norm else softmax_fn

        # dropouts

        self.dropout = dropout
        self.attn_dropout = nn.Dropout(dropout)

        # talking heads

        assert not (flash and (pre_talking_heads or post_talking_heads or pre_scale_post_talking_heads)), 'talking heads not compatible with flash attention'

        self.pre_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False) if pre_talking_heads else None
        self.post_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False) if post_talking_heads else None
        self.pre_scale_post_talking_heads = nn.Conv2d(heads, heads, 1, bias = False) if pre_scale_post_talking_heads else None

        if exists(self.pre_softmax_talking_heads):
            nn.init.dirac_(self.pre_softmax_talking_heads.weight)

        if exists(self.post_softmax_talking_heads):
            nn.init.dirac_(self.post_softmax_talking_heads.weight)

        if exists(self.pre_scale_post_talking_heads):
            # an improvisation where heads are combined pre-softmax attention, then used to scale post-softmax attention
            nn.init.dirac_(self.pre_scale_post_talking_heads.weight)

        # selective attention

        assert not (flash and selective), 'selective attention cannot work on flash attention'
        assert not (selective and not causal), 'selective attention is designed for autoregressive'
        self.selective = selective

        # l2 distance attention

        self.l2_distance = l2_distance

        # add a key / value token composed of zeros
        # in case this helps controlling outliers, proposed by https://www.evanmiller.org/attention-is-off-by-one.html

        self.add_zero_kv = add_zero_kv

        # learned sink concatted pre-softmax, working solution from gpt-oss

        assert not (head_learned_sink and flash), f'not supported for flash attention yet'

        self.head_learned_sink = head_learned_sink
        self.head_attn_sink = Parameter(torch.zeros(heads)) if head_learned_sink else None

        # soft clamp attention logit value

        if softclamp_logits:
            assert not flash, 'flash attention not compatible with logit softclamp value yet'
            assert logit_softclamp_value > 0.

        self.softclamp_logits = softclamp_logits
        self.logit_softclamp_value = logit_softclamp_value

        # contextual positional encoding

        self.cope = cope

        # flash attention

        self.flash = flash

        torch_version = version.parse(torch.__version__)
        assert not (flash and torch_version < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'

        # torch 2.3 uses new backend and context manager

        if self.flash:
            if torch_version >= version.parse('2.3'):
                from torch.nn.attention import SDPBackend

                str_to_backend = dict(
                    enable_flash = SDPBackend.FLASH_ATTENTION,
                    enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION,
                    enable_math = SDPBackend.MATH,
                    enable_cudnn = SDPBackend.CUDNN_ATTENTION
                )

                sdpa_backends = [str_to_backend[enable_str] for enable_str, enable in sdp_kwargs.items() if enable]

                self.sdp_context_manager = partial(torch.nn.attention.sdpa_kernel, sdpa_backends)
            else:
                self.sdp_context_manager = partial(torch.backends.cuda.sdp_kernel, **sdp_kwargs)

    def flash_attn(
        self,
        q, k, v,
        mask = None,
        attn_bias = None
    ):
        batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device

        # Recommended for multi-query single-key-value attention by Tri Dao
        # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])

        if k.ndim == 3:
            k = repeat(k, 'b ... -> b h ...', h = q.shape[1])

        if v.ndim == 3:
            v = repeat(v, 'b ... -> b h ...', h = q.shape[1])

        # handle maybe l2 distance

        if self.l2_distance:
            k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
            k = F.pad(k, (0, 1), value = -1.)
            k = cat((k, k_norm_sq), dim = -1)

            q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
            q = cat((2 * q, q_norm_sq), dim = -1)
            q = F.pad(q, (0, 1), value = -1.)

        # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention

        if exists(self.scale):
            default_scale = q.shape[-1] ** -0.5
            q = q * (self.scale / default_scale)

        # Check if mask exists and expand to compatible shape
        # The mask is B L, so it would have to be expanded to B H N L

        causal = self.causal

        # in the case of kv caching with one token (q_len == 1), just turn off causal masking
        # in speculative decoding, this may go up to 5-6, so right aligned causal mask will be needed there

        if q_len == 1 and causal:
            causal = False

        # expand key padding mask

        if exists(mask):
            assert mask.ndim == 4
            mask = mask.expand(batch, heads, q_len, k_len)

        # handle kv cache - this should be bypassable in updated flash attention 2

        if k_len > q_len and causal:
            causal_mask = self.create_causal_mask(q_len, k_len, device = device)
            if not exists(mask):
                mask = ~causal_mask
            else:
                mask = mask & ~causal_mask
            causal = False

        # manually handle causal mask, if another mask was given

        if exists(mask) and causal:
            causal_mask = self.create_causal_mask(q_len, k_len, device = device)
            mask = mask & ~causal_mask
            causal = False

        # protect against an entire row being masked out

        row_is_entirely_masked = None

        if exists(mask):
            row_is_entirely_masked = ~mask.any(dim = -1)

        # handle alibi positional bias
        # convert from bool to float

        if exists(attn_bias):
            attn_bias = attn_bias.expand(batch, heads, -1, -1)

            # if mask given, the mask would already contain the causal mask from above logic
            # otherwise, if no mask given but still causal, mask out alibi positional bias to a large negative number

            mask_value = -torch.finfo(q.dtype).max

            if exists(mask):
                attn_bias = attn_bias.masked_fill(~mask, mask_value // 2)
            elif causal:
                causal_mask = self.create_causal_mask(q_len, k_len, device = device)
                attn_bias = attn_bias.masked_fill(causal_mask, mask_value // 2)
                causal = False

            # scaled_dot_product_attention handles attn_mask either as bool or additive bias
            # make it an additive bias here

            mask = attn_bias

        # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale

        with self.sdp_context_manager():
            out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask = mask,
                dropout_p = self.dropout if self.training else 0.,
                is_causal = causal
            )

        # for a row that is entirely masked out, should zero out the output of that row token

        if exists(row_is_entirely_masked) and row_is_entirely_masked.any():
            out = out.masked_fill(row_is_entirely_masked[..., None], 0.)

        return out, Intermediates()

    def forward(
        self,
        q, k, v,
        mask = None,
        attn_bias = None,
        prev_attn = None
    ):
        """
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        """

        n, heads, kv_heads, device = q.shape[-2], q.shape[1], k.shape[1], q.device

        scale = default(self.scale, q.shape[-1] ** -0.5)

        causal = self.causal

        # handle key padding mask

        if exists(mask) and mask.ndim == 2:
            mask = rearrange(mask, 'b j -> b 1 1 j')

        # handle kv cached decoding

        if n == 1 and causal:
            causal = False

        # handle grouped multi-query attention

        if kv_heads == 1:
            k, v = tuple(rearrange(t, 'b 1 n d -> b n d') for t in (k, v))
        elif kv_heads < heads:
            k, v = tuple(repeat(t, 'b kvh n d -> b (r kvh) n d', r = heads // kv_heads) for t in (k, v))

        # handle zero kv, as means for allowing network to attend to nothing

        if self.add_zero_kv:
            k, v = tuple(F.pad(t, (0, 0, 1, 0), value = 0.) for t in (k, v))

            if exists(mask):
                mask = F.pad(mask, (1, 0), value = True)

            if exists(attn_bias):
                attn_bias = F.pad(attn_bias, (1, 0), value = 0.)

        if self.flash:
            assert not exists(prev_attn), 'residual attention not compatible with flash attention'
            return self.flash_attn(q, k, v, mask = mask, attn_bias = attn_bias)

        kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d'

        if not self.l2_distance:
            sim = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k)
        else:
            sim = -qk_l2_dist_squared(q, k)

        sim = sim * scale

        if exists(prev_attn):
            sim = sim + prev_attn

        qk_similarities = sim.clone()

        if exists(self.pre_scale_post_talking_heads):
            pre_to_post_scale = self.pre_scale_post_talking_heads(sim)

        if exists(self.pre_softmax_talking_heads):
            sim = sim + self.pre_softmax_talking_heads(sim)

        if exists(attn_bias):
            sim = sim + attn_bias

        if self.softclamp_logits:
            sim = softclamp(sim, self.logit_softclamp_value)

        i, j, dtype = *sim.shape[-2:], sim.dtype

        mask_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            sim = sim.masked_fill(~mask, mask_value)

        if causal:
            causal_mask = self.create_causal_mask(i, j, device = device)
            sim = sim.masked_fill(causal_mask, mask_value)

        row_is_entirely_masked = None

        if exists(mask):
            row_is_entirely_masked = ~mask.any(dim = -1)

        if exists(self.cope):
            sim = sim + self.cope(q, sim)

        if self.selective:
            sim = selective_attn(sim)

        if self.head_learned_sink:
            # add learned attention sink
            attn_sink = repeat(self.head_attn_sink, 'h -> b h i 1', b = sim.shape[0], i = sim.shape[2])
            sim = cat((attn_sink, sim), dim = -1)

        pre_softmax_attn = sim

        attn = self.attn_fn(sim)

        attn = attn.type(dtype)

        post_softmax_attn = attn

        if self.head_learned_sink:
            # remove attention sink
            attn = attn[..., 1:]

        attn = self.attn_dropout(attn)

        if exists(self.post_softmax_talking_heads):
            attn = self.post_softmax_talking_heads(attn)

        if exists(self.pre_scale_post_talking_heads):
            attn = attn * pre_to_post_scale

        out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v)

        intermediates = Intermediates(
            qk_similarities = qk_similarities,
            pre_softmax_attn = pre_softmax_attn,
            post_softmax_attn = post_softmax_attn
        )

        if exists(row_is_entirely_masked) and row_is_entirely_masked.any():
            out = out.masked_fill(row_is_entirely_masked[..., None], 0.)

        return out, intermediates
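
A minimal sketch of exercising the Attend module above on its own, assuming the Amadeus directory is importable as a package; the shapes and flags are illustrative only and not taken from the commit:

# standalone smoke test for Attend (illustrative shapes, hypothetical usage)
import torch
from Amadeus.custom_attend import Attend

attend = Attend(causal = True, dropout = 0.1)   # plain softmax attention path, no flash

q = torch.randn(2, 8, 128, 64)                  # (batch, heads, seq, dim_head)
k = torch.randn(2, 8, 128, 64)
v = torch.randn(2, 8, 128, 64)

out, intermediates = attend(q, k, v)            # out: (2, 8, 128, 64)
print(out.shape, intermediates.post_softmax_attn.shape)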

Amadeus/custom_wrapper.py (new file, 581 lines)
@@ -0,0 +1,581 @@
from __future__ import annotations

from math import ceil, log
from typing import Tuple, Callable

import torch
from torch import nn, tensor, Tensor
from torch.nn import Module
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from einops import rearrange, repeat, pack, unpack

def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

def identity(t, *args, **kwargs):
    return t

def join(arr, delimiter = ', '):
    return delimiter.join(arr)

def cast_tuple(t, length = 1):
    return t if isinstance(t, tuple) else (t,) * length

def eval_decorator(fn):
    def inner(self, *args, **kwargs):
        was_training = self.training
        self.eval()
        out = fn(self, *args, **kwargs)
        self.train(was_training)
        return out
    return inner

# gumbel topk

def log(t, eps = 1e-20):
    return t.clamp(min = eps).log()

def gumbel_noise(t):
    return -log(-log(torch.rand_like(t)))

# function for modifying all the cached key / values

def modify_cached_kv(cache, fn):
    for inter in cache.attn_intermediates:
        if inter.layer_type == 'a':
            inter.cached_kv = [fn(t) for t in inter.cached_kv]

# for variable lengthed prefixes

def pad_at_dim(t, pad: tuple[int, int], dim = -1, value = 0.):
    if pad == (0, 0):
        return t

    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
    zeros = ((0, 0) * dims_from_right)
    return F.pad(t, (*zeros, *pad), value = value)

def align_right(t, lens, pad_id = 0):
    batch, seq_len, device, dtype = *t.shape[:2], t.device, t.dtype

    assert lens.ndim == 1 and lens.shape[0] == batch
    assert lens.amax() <= seq_len

    pad_lens = seq_len - lens
    max_pad_len = pad_lens.amax()

    batch_arange = torch.arange(batch, device = device, dtype = torch.long)[..., None]
    prompt_len_arange = torch.arange(seq_len, device = device, dtype = torch.long)

    t = pad_at_dim(t, (max_pad_len, 0), value = pad_id, dim = 1)
    offset = max_pad_len - pad_lens

    aligned = t[batch_arange, prompt_len_arange + offset[..., None], ...]
    return aligned

# nucleus

def top_p(logits, thres = 0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending = True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim = -1), dim = -1)

    sorted_indices_to_remove = cum_probs > thres
    sorted_indices_to_remove = F.pad(sorted_indices_to_remove, (1, -1), value = False)

    sorted_logits[sorted_indices_to_remove] = float('-inf')
    return sorted_logits.scatter(1, sorted_indices, sorted_logits)

# topk

def top_k(logits, frac_num_tokens = 0.1, k = None):
    num_tokens = logits.shape[-1]

    k = default(k, ceil(frac_num_tokens * num_tokens))
    k = min(k, num_tokens)

    val, ind = torch.topk(logits, k)
    probs = torch.full_like(logits, float('-inf'))
    probs.scatter_(1, ind, val)
    return probs

# top_a

def top_a(logits, min_p_pow = 2.0, min_p_ratio = 0.02):
    probs = logits.softmax(dim = -1)
    max_probs = probs.amax(dim = -1, keepdim = True)
    limit = torch.pow(max_probs, min_p_pow) * min_p_ratio
    return torch.where(probs < limit, float('-inf'), logits)

# min_p
# https://arxiv.org/abs/2407.01082

def min_p(logits, min_p = 0.1):
    probs = logits.softmax(dim = -1)
    max_probs = probs.amax(dim = -1, keepdim = True)
    limit = min_p * max_probs
    return torch.where(probs < limit, float('-inf'), logits)

# filter logits functions dict[str -> Callable]

FILTER_LOGITS_FN = dict(
    top_p = top_p,
    top_k = top_k,
    top_a = top_a,
    min_p = min_p
)

# contrastive decoding function

def contrastive_decode_fn(
    expert_logits,
    amateur_logits,
    alpha = 0.1,
    beta = 0.5
):
    """
    Appendix A Algorithm 2
    https://arxiv.org/abs/2309.09117
    """

    cutoff = log(alpha) + expert_logits.amax(dim = -1, keepdim = True)
    diffs = (1 + beta) * expert_logits - beta * amateur_logits
    contrastive_decode_logits = diffs.masked_fill(expert_logits < cutoff, -torch.finfo(expert_logits.dtype).max)
    return contrastive_decode_logits

# autoregressive wrapper class

class AutoregressiveWrapper(Module):
    def __init__(
        self,
        net,
        ignore_index = -100,
        pad_value = 0,
        mask_prob = 0.,
        add_attn_z_loss = False,
        next_embed_loss_weight = 0.1
    ):
        super().__init__()
        self.pad_value = pad_value
        self.ignore_index = ignore_index

        self.net = net
        self.max_seq_len = net.max_seq_len

        # paper shows masking (MLM) in conjunction with autoregressive decoder-only training leads to big improvements https://arxiv.org/abs/2210.13432
        assert mask_prob < 1.
        self.mask_prob = mask_prob

        # whether to add router z-loss
        self.add_attn_z_loss = add_attn_z_loss

        # whether to add a continuous loss
        self.add_continuous_pred_head = net.add_continuous_pred_head
        self.next_embed_loss_weight = next_embed_loss_weight

    @torch.no_grad()
    @eval_decorator
    def beam_search(
        self,
        prompts,
        seq_len,
        beams = 4,
        return_beams_and_scores = False,
        eos_token = None,
        temperature = 1.,
        stochastic = False,
        prompt_lens: Tensor | None = None,
        filter_logits_fn: str | Callable = identity,
        restrict_to_max_seq_len = True,
        filter_kwargs: dict = dict(),
        cache_kv = True,
        **kwargs
    ):
        assert not exists(eos_token), 'eos token not supported yet'

        max_seq_len, greedy, device = self.max_seq_len, temperature == 0., prompts.device

        prompts, packed_shape = pack([prompts], '* n')

        batch, orig_seq_len = prompts.shape

        # handle filter logits fn given as string

        if isinstance(filter_logits_fn, str):
            assert filter_logits_fn in FILTER_LOGITS_FN, f"only {join(FILTER_LOGITS_FN.keys())} are available"

            filter_logits_fn = FILTER_LOGITS_FN[filter_logits_fn]

        # handle variable lengthed prompts (prefixes)

        seq_start_pos = None
        if exists(prompt_lens):
            prompts = align_right(prompts, prompt_lens, pad_id = self.pad_value)
            seq_start_pos = orig_seq_len - prompt_lens

        # output from which sampled tokens appended to

        out = prompts

        # kv caches

        cache = None

        should_cache = cache_kv and self.net.can_cache_kv

        # scores for the beams

        scores = torch.zeros((batch,), device = device)

        batch_arange = torch.arange(batch, device = device)

        # sampling up to seq_len

        for i in range(seq_len):
            is_first = i == 0

            if restrict_to_max_seq_len:
                max_len_exceeded = out.shape[-1] > max_seq_len

                assert not (cache_kv and max_len_exceeded and not self.net.can_cache_kv_outside_max_seq_len), 'the network cannot use cached key values when decoding outside the max sequence length. most likely because you are using absolute positional embedding. you can switch to rotary embeddings to resolve this issue'

                x = out[:, -max_seq_len:]

                if exists(cache):
                    modify_cached_kv(cache, lambda t: t[..., -(max_seq_len - 1):, :])

            logits, new_cache = self.net(
                x,
                return_intermediates = True,
                cache = cache,
                seq_start_pos = seq_start_pos,
                **kwargs
            )

            if should_cache:
                cache = new_cache

            logits = logits[:, -1]

            # to add to the scores

            log_probs = logits.log_softmax(dim = -1)

            # maybe filter by top_k, top_p (nucleus) for stochastic beam search

            if stochastic and not greedy:
                logits = filter_logits_fn(logits, **filter_kwargs)
                logits = (logits / temperature) + gumbel_noise(logits)

            # (gumbel) topk

            samples = logits.topk(beams, dim = -1).indices

            # get the scores for keeping track of beams

            next_scores = log_probs.gather(-1, samples)

            # expand beam times

            scores = repeat(scores, 'b -> b beams', beams = beams)
            scores = scores + next_scores

            out = repeat(out, 'b ... -> (b beams) ...', beams = beams)
            samples = rearrange(samples, 'b beams -> (b beams) 1')

            if should_cache and is_first:
                modify_cached_kv(cache, lambda t: repeat(t, 'b ... -> (b beams) ...', beams = beams))

            # concat sample

            out = torch.cat((out, samples), dim=-1)

            # sort by score and excise
            # excise out the beams

            scores = rearrange(scores, '(b prev_beams) next_beams -> b (prev_beams next_beams)', b = batch)
            curr_num_beams = scores.shape[-1]

            if curr_num_beams > beams:
                scores, sort_indices = scores.sort(dim = -1, descending = True)

                scores = scores[:, :beams]
                top_beams_indices = sort_indices[:, :beams]

                top_beams_indices = curr_num_beams * batch_arange[:, None] + top_beams_indices

                flattened_beam_indices = rearrange(top_beams_indices, 'b beams -> (b beams)')

                out = out[flattened_beam_indices]

            scores = rearrange(scores, 'b beams -> (b beams)')

            if not exists(eos_token):
                continue

            is_eos_tokens = (out == eos_token)

            if is_eos_tokens.any(dim = -1).all():
                break

        if exists(eos_token):
            # mask out everything after the eos tokens
            shifted_is_eos_tokens = F.pad(is_eos_tokens, (1, -1))
            mask = shifted_is_eos_tokens.float().cumsum(dim = -1) >= 1
            out = out.masked_fill(mask, self.pad_value)

        # select out the top beam

        out = rearrange(out, '(b beams) seq -> b beams seq', b = batch)

        out = out[..., orig_seq_len:]

        out, = unpack(out, packed_shape, '* beams n') # prompt may have no batch dimension

        if not return_beams_and_scores:
            return out[..., 0, :]

        scores = rearrange(scores, '(b beams) -> beams b', b = batch)
        out = rearrange(out, 'b beams n -> beams b n')

        return out, scores

    @torch.no_grad()
    @eval_decorator
    def generate(
        self,
        prompts: list[Tensor] | Tensor,
        seq_len,
        eos_token = None,
        temperature = 1.,
        prompt_lens: Tensor | None = None,
        filter_logits_fn: str | Callable = top_k,
        restrict_to_max_seq_len = True,
        amateur_model: Module | Tuple[Module] | None = None,
        filter_kwargs: dict = dict(),
        contrastive_decode_kwargs: dict | Tuple[dict] = dict(
            beta = 0.5,
            alpha = 0.1
        ),
        cache_kv = True,
        **kwargs
    ):
        max_seq_len, greedy = self.max_seq_len, temperature == 0.

        # handle prompts given as list of variable lengthed token ids

        if isinstance(prompts, list):
            assert len(prompts) > 0, 'prompts cannot be empty list'
            assert not exists(prompt_lens), '`prompt_len` will be auto derived if prompts are passed in as list of Tensors'

            prompt_lens = tensor([t.shape[0] for t in prompts], device = prompts[0].device)

            prompts = pad_sequence(prompts, batch_first = True)

        # pack maybe no batch

        prompts, ps = pack([prompts], '* n')

        b, t, device = *prompts.shape, prompts.device

        # handle filter logits fn given as string

        if isinstance(filter_logits_fn, str):
            assert filter_logits_fn in FILTER_LOGITS_FN, f"only {join(FILTER_LOGITS_FN.keys())} are available"

            filter_logits_fn = FILTER_LOGITS_FN[filter_logits_fn]

        # handle variable lengthed prompts (prefixes)

        seq_start_pos = None
        if exists(prompt_lens):
            prompts = align_right(prompts, prompt_lens, pad_id = self.pad_value)
            seq_start_pos = t - prompt_lens

        # output from which sampled tokens appended to

        out = prompts

        # kv caches

        cache = None

        # if doing contrastive decoding, turn off filter automatically

        if exists(amateur_model):
            amateur_model = cast_tuple(amateur_model)
            contrastive_decode_kwargs = cast_tuple(contrastive_decode_kwargs)

            assert len(amateur_model) == len(contrastive_decode_kwargs)

            amateur_caches = [None] * len(amateur_model)
            filter_logits_fn = identity

            for i, module in enumerate(amateur_model):
                if isinstance(module, AutoregressiveWrapper):
                    amateur_model[i] = module.net

                module.eval()

        # sampling up to seq_len

        for _ in range(seq_len):

            if restrict_to_max_seq_len:
                max_len_exceeded = out.shape[-1] > max_seq_len

                assert not (cache_kv and max_len_exceeded and not self.net.can_cache_kv_outside_max_seq_len), 'the network cannot use cached key values when decoding outside the max sequence length. most likely because you are using absolute positional embedding. you can switch to rotary embeddings to resolve this issue'

                x = out[:, -max_seq_len:]

                if exists(cache):
                    for inter in cache.attn_intermediates:
                        if inter.layer_type == 'a':
                            inter.cached_kv = [t[..., -(max_seq_len - 1):, :] for t in inter.cached_kv]

            logits, new_cache = self.net(
                x,
                return_intermediates = True,
                cache = cache,
                seq_start_pos = seq_start_pos,
                **kwargs
            )

            if cache_kv and self.net.can_cache_kv:
                cache = new_cache

            logits = logits[:, -1]

            # handle contrastive decoding, Li et al.
            # https://arxiv.org/abs/2210.15097

            if exists(amateur_model):
                for i, (amateur, amateur_cache, amateur_contrastive_decode_kwargs) in enumerate(zip(amateur_model, amateur_caches, contrastive_decode_kwargs)):
                    amateur_logits, next_amateur_cache = amateur(
                        x,
                        return_intermediates = True,
                        cache = amateur_cache,
                        seq_start_pos = seq_start_pos,
                        **kwargs
                    )

                    amateur_logits = amateur_logits[:, -1]

                    assert amateur_logits.shape == logits.shape, 'logits dimension are not the same between amateur and expert model'
                    logits = contrastive_decode_fn(logits, amateur_logits, **amateur_contrastive_decode_kwargs)

                    if cache_kv and amateur.can_cache_kv:
                        amateur_caches[i] = next_amateur_cache

            # filter by top_k, top_p (nucleus), top_a, or custom

            if greedy:
                sample = logits.argmax(dim = -1, keepdim = True)
            else:
                filtered_logits = filter_logits_fn(logits, **filter_kwargs)
                probs = F.softmax(filtered_logits / temperature, dim=-1)
                sample = torch.multinomial(probs, 1)

            # concat sample

            out = torch.cat((out, sample), dim=-1)

            if not exists(eos_token):
                continue

            is_eos_tokens = (out == eos_token)

            if is_eos_tokens.any(dim = -1).all():
                break

        if exists(eos_token):
            # mask out everything after the eos tokens
            shifted_is_eos_tokens = F.pad(is_eos_tokens, (1, -1))
            mask = shifted_is_eos_tokens.float().cumsum(dim = -1) >= 1
            out = out.masked_fill(mask, self.pad_value)

        out = out[:, t:]

        out, = unpack(out, ps, '* n')

        return out

    def forward(
        self,
        x,
        return_outputs = False,
        prepend_embeds = None,
        **kwargs
    ):
        seq, ignore_index, add_attn_z_loss, add_next_embed_loss = x.shape[1], self.ignore_index, self.add_attn_z_loss, self.add_continuous_pred_head

        inp, target = x, x[:, 1:]
        inp = torch.where(inp == ignore_index, self.pad_value, inp)

        if self.mask_prob > 0.:
            rand = torch.randn(inp.shape, device = x.device)
            rand[:, 0] = -torch.finfo(rand.dtype).max # first token should not be masked out
            num_mask = min(int(seq * self.mask_prob), seq - 1)
            indices = rand.topk(num_mask, dim = -1).indices
            mask = ~torch.zeros_like(inp).scatter(1, indices, 1.).bool()
            kwargs.update(self_attn_kv_mask = mask)

        out, cache = self.net(
            inp,
            return_intermediates = True,
            return_attn_z_loss = add_attn_z_loss,
            return_next_embed_pred = add_next_embed_loss,
            prepend_embeds = prepend_embeds,
            **kwargs
        )

        # destruct differently if doing continuous pred

        if add_next_embed_loss:
            logits, (next_embed_pred, init_embeds) = out
        else:
            logits = out

        # if there are prepended embeds, excise it out

        if exists(prepend_embeds):
            prepend_len = prepend_embeds.shape[1]
            logits = logits[:, prepend_len:]

        # take all tokens but the last

        logits = logits[:, :-1]

        # loss function

        loss_fn = F.cross_entropy if not self.net.output_is_log_prob else F.nll_loss

        # cross entropy loss

        loss = loss_fn(
            rearrange(logits, 'b n c -> b c n'),
            target,
            ignore_index = ignore_index
        )

        if add_attn_z_loss:
            loss = loss + cache.attn_z_loss

        if add_next_embed_loss:
            mask = target != ignore_index
            embed_pred = next_embed_pred[:, :-1]
            cont_targets = init_embeds[:, 1:].detach()

            cont_loss = F.l1_loss(embed_pred, cont_targets, reduction = 'none')
            cont_loss = cont_loss[mask].mean()

            loss = loss + cont_loss * self.next_embed_loss_weight

        if not return_outputs:
            return loss

        return loss, (logits, cache)
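
The wrapper follows the upstream x_transformers AutoregressiveWrapper API, so a typical training and sampling loop would look roughly like the sketch below. TransformerWrapper, Decoder and their keyword arguments are assumed to be provided by the vendored Amadeus.custom_x_transformers (its diff is suppressed above), including the attributes the wrapper reads (max_seq_len, can_cache_kv, add_continuous_pred_head, output_is_log_prob); everything here is illustrative, not taken from the commit:

# hypothetical training / sampling sketch under the assumptions stated above
import torch
from Amadeus.custom_x_transformers import TransformerWrapper, Decoder   # assumed interface
from Amadeus.custom_wrapper import AutoregressiveWrapper

net = TransformerWrapper(
    num_tokens = 512,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 768, depth = 8, heads = 12)
)
model = AutoregressiveWrapper(net, pad_value = 0, ignore_index = -100)

tokens = torch.randint(0, 512, (2, 1024))
loss = model(tokens)                       # next-token cross entropy over tokens[:, 1:]
loss.backward()

prompt = torch.randint(0, 512, (2, 64))
generated = model.generate(prompt, seq_len = 128, temperature = 0.9, filter_logits_fn = 'top_p')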

Amadeus/custom_x_transformers.py (new file, 3616 lines; diff suppressed because it is too large)

@@ -1,8 +1,8 @@
 defaults:
   # - nn_params: nb8_embSum_NMT
   # - nn_params: remi8
-  - nn_params: nb8_embSum_diff_t2m_150M_finetunning
-  # - nn_params: nb8_embSum_diff_t2m_150M_pretraining
+  # - nn_params: nb8_embSum_diff_t2m_150M_finetunning
+  - nn_params: nb8_embSum_diff_t2m_150M_pretrainingv2
   # - nn_params: nb8_embSum_subPararell
   # - nn_params: nb8_embSum_diff_t2m_150M

@@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: AmadeusModel
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerNewPretrainingDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0
input_embedder:
  num_layer: 1
  num_head: 8
main_decoder:
  dim_model: 768
  num_layer: 20
  num_head: 12
sub_decoder:
  decout_window_size: 1 # 1 means no previous decoding output added
  num_layer: 1
  feature_enricher_use: False
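
The new nn_params group is plain Hydra/OmegaConf-style YAML; a minimal sketch of reading it directly is shown below. The file path is hypothetical (the filename is not shown in this diff) and OmegaConf is only an assumed way to load it:

# hypothetical path; the actual location of the new nn_params yaml is not shown in the diff
from omegaconf import OmegaConf

cfg = OmegaConf.load('conf/nn_params/nb8_embSum_diff_t2m_150M_pretrainingv2.yaml')
assert cfg.main_decoder.dim_model == 768 and cfg.main_decoder.num_layer == 20
print(cfg.sub_decoder.decout_window_size)   # 1 -> no previous decoding output added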

@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn

-from x_transformers import Decoder, Encoder, PrefixDecoder, CrossAttender
+from Amadeus.custom_x_transformers import Decoder, Encoder, PrefixDecoder, CrossAttender
 from transformers import T5EncoderModel
 from data_representation.vocab_utils import LangTokenVocab