commit 80333dff74cfd162657532117e5df2c27e71aac2 Author: lingyu123-su <3307872825@qq.com> Date: Mon Sep 8 14:49:28 2025 +0800 first commit diff --git a/Amadeus/.DS_Store b/Amadeus/.DS_Store new file mode 100644 index 0000000..5df37c8 Binary files /dev/null and b/Amadeus/.DS_Store differ diff --git a/Amadeus/__init__.py b/Amadeus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Amadeus/__pycache__/__init__.cpython-310.pyc b/Amadeus/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..d9d05ca Binary files /dev/null and b/Amadeus/__pycache__/__init__.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc b/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc new file mode 100644 index 0000000..3ad2148 Binary files /dev/null and b/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/model_zoo.cpython-310.pyc b/Amadeus/__pycache__/model_zoo.cpython-310.pyc new file mode 100644 index 0000000..632bb87 Binary files /dev/null and b/Amadeus/__pycache__/model_zoo.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sampling_utils.cpython-310.pyc b/Amadeus/__pycache__/sampling_utils.cpython-310.pyc new file mode 100644 index 0000000..60d2121 Binary files /dev/null and b/Amadeus/__pycache__/sampling_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc b/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc new file mode 100644 index 0000000..c3826ba Binary files /dev/null and b/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc b/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc new file mode 100644 index 0000000..40dbe19 Binary files /dev/null and b/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/train_utils.cpython-310.pyc b/Amadeus/__pycache__/train_utils.cpython-310.pyc new file mode 100644 index 0000000..104f114 Binary files /dev/null and b/Amadeus/__pycache__/train_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/transformer_utils.cpython-310.pyc b/Amadeus/__pycache__/transformer_utils.cpython-310.pyc new file mode 100644 index 0000000..6332188 Binary files /dev/null and b/Amadeus/__pycache__/transformer_utils.cpython-310.pyc differ diff --git a/Amadeus/catsample.py b/Amadeus/catsample.py new file mode 100644 index 0000000..13f0ebe --- /dev/null +++ b/Amadeus/catsample.py @@ -0,0 +1,56 @@ +import torch +import torch.nn.functional as F + + +def gumbel_softmax(categorical_probs, hard=False, eps=1e-9): + logits = categorical_probs.clamp(min=1e-9).log() + return F.gumbel_softmax(logits, hard=hard) + + +def sample_categorical(categorical_probs, method="hard"): + if method == "hard": + gumbel_norm = 1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log() + return (categorical_probs / gumbel_norm).argmax(dim=-1) + else: + raise ValueError(f"Method {method} for sampling categorical variables is not valid.") + +def direct_sampling(logits): + probs = logits.softmax(dim=-1) + index = sample_categorical(probs.to(torch.float32)) + return index + + +def top_p_sampling(logits, p=0.9): + probs = logits.softmax(dim=-1) + + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + cumulative_probs = torch.cumsum(sorted_probs, dim=-1) + sorted_indices_to_remove = cumulative_probs > p + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + 
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + probs.masked_fill_(indices_to_remove, 0) + probs /= probs.sum(dim=-1).unsqueeze(-1) + index = sample_categorical(probs.to(torch.float32)) + + return index + + +def top_k_sampling(logits, k=400): + top_k_values, top_k_indices = torch.topk(logits, int(k)) + top_k_probs = top_k_values.softmax(dim=-1) + index = sample_categorical(top_k_probs.to(torch.float32)) + index = top_k_indices[torch.arange(index.size(0)), index] + + return index + +def sample_with_strategy(update_logits, strategy, para = None): + if strategy == "direct": + return direct_sampling(update_logits) + elif strategy == "top_p": + return top_p_sampling(update_logits, para) + elif strategy == "top_k": + return top_k_sampling(update_logits, para) + else: + raise ValueError(f"Strategy {strategy} is not valid.") \ No newline at end of file diff --git a/Amadeus/evaluation_utils.py b/Amadeus/evaluation_utils.py new file mode 100644 index 0000000..00aa52b --- /dev/null +++ b/Amadeus/evaluation_utils.py @@ -0,0 +1,533 @@ +from collections import defaultdict +from typing import Union +from math import log +from omegaconf import DictConfig +from pathlib import Path +import pickle +import json +import torch +from tqdm.auto import tqdm +from transformers import T5Tokenizer, T5EncoderModel + +from . import model_zoo +from .symbolic_encoding import data_utils +from .model_zoo import AmadeusModel +from .symbolic_encoding.data_utils import TuneCompiler +from .symbolic_encoding.compile_utils import shift_and_pad +from .symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from .symbolic_encoding import decoding_utils +from .train_utils import adjust_prediction_order +from data_representation import vocab_utils +from data_representation.vocab_utils import LangTokenVocab + +def wandb_style_config_to_omega_config(wandb_conf): + # remove wandb related config + for wandb_key in ["wandb_version", "_wandb"]: + if wandb_key in wandb_conf: + del wandb_conf[wandb_key] # wandb-related config should not be overrided! 
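+ # A wandb-exported config typically wraps each top-level entry as {'desc': ..., 'value': ...}; + # the loop below unwraps that wrapper, e.g. {'dataset': {'desc': None, 'value': 'SOD'}} becomes {'dataset': 'SOD'} (illustrative values).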
+ # print(wandb_conf) + # remove unnecessary fields such as desc and value + for key in wandb_conf: + # if 'desc' in wandb_conf[key]: + # del wandb_conf[key]['desc'] + if isinstance(wandb_conf[key], dict) and 'value' in wandb_conf[key]: + wandb_conf[key] = wandb_conf[key]['value'] + # handle entries that still carry a 'value' wrapper + try: + if 'value' in wandb_conf[key]: + wandb_conf[key] = wandb_conf[key]['value'] + except: + pass + return wandb_conf + +def get_dir_from_wandb_by_code(wandb_dir: Path, code:str) -> Path: + for dir in wandb_dir.iterdir(): + if dir.name.endswith(code): + return dir + print(f'No such code in wandb_dir: {code}') + return None + +def get_best_ckpt_path_and_config(wandb_dir, code): + dir = get_dir_from_wandb_by_code(wandb_dir, code) + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + # print all files in ckpt_dir + vocab_path = next(ckpt_dir.glob('vocab*')) + metadata_path = next(ckpt_dir.glob('*metadata.json')) + + # if there is a pt file ending with 'last', return it + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, metadata_path, vocab_path + +def prepare_model_and_dataset_from_config(config: DictConfig, metadata_path:str, vocab_path:str): + nn_params = config.nn_params + dataset_name = config.dataset + vocab_path = Path(vocab_path) + + if 'Encodec' in dataset_name: + encodec_tokens_path = Path(f"dataset/maestro-v3.0.0-encodec_tokens") + encodec_dataset = EncodecDataset(config, encodec_tokens_path, None, None) + vocab_sizes = encodec_dataset.vocab.get_vocab_size() + train_set, valid_set, test_set = encodec_dataset.split_train_valid_test_set() + + lm_model:model_zoo.LanguageModelTransformer= getattr(model_zoo, nn_params.model_name)(config, vocab_sizes) + else: + # print(config) + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + # Initialize symbolic dataset based on dataset name and configuration parameters + symbolic_dataset = getattr(data_utils, dataset_name)( + vocab=vocab, + encoding_scheme=encoding_scheme, + num_features=num_features, + debug=config.general.debug, + aug_type=config.data_params.aug_type, + input_length=config.train_params.input_length, + first_pred_feature=config.data_params.first_pred_feature, + caption_path=config.captions_path if hasattr(config, 'captions_path') else None, + for_evaluation=True, + ) + + vocab_sizes = symbolic_dataset.vocab.get_vocab_size() + print(f"---{nn_params.main_decoder}--- is used") + print(f"---{dataset_name}--- is used") + print(f"---{encoding_scheme}--- is used") + split_ratio = config.data_params.split_ratio + # test_set = [] + train_set, valid_set, test_set = symbolic_dataset.split_train_valid_test_set(dataset_name=config.dataset, ratio=split_ratio, seed=42, save_dir=None) + + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, 
num_features, config.data_params.first_pred_feature, nn_params) + + # Create the Transformer model based on configuration parameters + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=symbolic_dataset.vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + return AmadeusModel, test_set, symbolic_dataset.vocab + +def add_conti_in_valid(tensor, encoding_scheme): + new_target = tensor.clone() + # Assuming tensor shape is [batch, sequence, features] + # Create a shifted version of the tensor + shifted_tensor = torch.roll(new_target, shifts=1, dims=1) + # The first element of each sequence cannot be a duplicate by definition + shifted_tensor[:, 0, :] = new_target[:, 0, :] + 1 + + # Identify where the original and shifted tensors are the same (duplicates) + duplicates = new_target == shifted_tensor + # TODO: convert hard-coded part + # convert values into False except the 1st and 2nd features + if encoding_scheme == 'nb': + if tensor.shape[2] == 5: + # change beat, instrument + duplicates[:, :, 0] = False + duplicates[:, :, 3] = False + duplicates[:, :, 4] = False + elif tensor.shape[2] == 4: + # change beat + duplicates[:, :, 0] = False + duplicates[:, :, 2] = False + duplicates[:, :, 3] = False + elif tensor.shape[2] == 7: + # change beat, chord, tempo + duplicates[:, :, 0] = False + duplicates[:, :, 4] = False + duplicates[:, :, 5] = False + duplicates[:, :, 6] = False + elif encoding_scheme == 'cp': + if tensor.shape[2] == 5: + # change instrument + duplicates[:, :, 0] = False + duplicates[:, :, 1] = False + duplicates[:, :, 3] = False + duplicates[:, :, 4] = False + elif tensor.shape[2] == 7: + # change chord, tempo + duplicates[:, :, 0] = False + duplicates[:, :, 1] = False + duplicates[:, :, 4] = False + duplicates[:, :, 5] = False + duplicates[:, :, 6] = False + + # Replace duplicates with 9999 + new_target[duplicates] = 9999 + return new_target + +# TODO: hard coded +def add_conti(list_of_lists, encoding_scheme): + if encoding_scheme == 'nb': + if len(list_of_lists[0]) == 4: + # type, beat, pitch, duration + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + elif len(list_of_lists[0]) == 5: + # type, beat, instrument, pitch, duration + previous_instrument = None + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + if list_of_lists[i][2] == previous_instrument and previous_instrument != 0: + list_of_lists[i][2] = 'Conti' + else: + previous_instrument = list_of_lists[i][2] + elif len(list_of_lists[0]) == 7: + # type, beat, chord, tempo, pitch, duration, velocity + previous_chord = None + previous_tempo = None + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + if list_of_lists[i][2] == previous_chord and previous_chord != 0: + list_of_lists[i][2] = 'Conti' + elif list_of_lists[i][2] != 
previous_chord and list_of_lists[i][2] != 0: + previous_chord = list_of_lists[i][2] + if list_of_lists[i][3] == previous_tempo and previous_tempo != 0: + list_of_lists[i][3] = 'Conti' + elif list_of_lists[i][3] != previous_tempo and list_of_lists[i][3] != 0: + previous_tempo = list_of_lists[i][3] + elif encoding_scheme == 'cp': + if len(list_of_lists[0]) == 7: + # type, beat, chord, tempo, pitch, duration, velocity + previous_chord = None + previous_tempo = None + for i in range(0, len(list_of_lists)): + current_chord = list_of_lists[i][2] + current_tempo = list_of_lists[i][3] + if current_chord == previous_chord and current_chord != 0: + list_of_lists[i][2] = 'Conti' + elif current_chord != previous_chord and current_chord != 0: + previous_chord = current_chord + if current_tempo == previous_tempo and current_tempo != 0: + list_of_lists[i][3] = 'Conti' + elif current_tempo != previous_tempo and current_tempo != 0: + previous_tempo = current_tempo + if len(list_of_lists[0]) == 5: + # type, beat, instrument, pitch, duration + previous_instrument = None + for i in range(0, len(list_of_lists)): + current_instrument = list_of_lists[i][2] + if current_instrument == previous_instrument and current_instrument != 0: + list_of_lists[i][2] = 'Conti' + elif current_instrument != previous_instrument and current_instrument != 0: + previous_instrument = current_instrument + return list_of_lists + +class Evaluator: + def __init__(self, + config: DictConfig, + model:AmadeusModel, + test_set:TuneCompiler, + vocab: Union[LangTokenVocab, LangTokenVocab], + device:str='cuda', + batch_size:int=16): + self.config = config + self.device = device + self.vocab = vocab + + self.model = model + self.model.eval() + self.model.to(device) + self.test_set = test_set + + self.input_len = config.train_params.input_length + self.loss_by_class = {key:[] for key in self.vocab.feature_list} + self.count_by_class = {key:0 for key in self.vocab.feature_list} + self.batch_size = batch_size + + self.is_multiclass = True if config.nn_params.encoding_scheme == 'nb' or config.nn_params.encoding_scheme == 'cp' else False + self.first_pred_feature = self.config.data_params.first_pred_feature + + self.neglect_keywords = ['SSS', 'SSN', 'Conti', 'Metrical', 'Note'] + self.valid_item_prob = [] + + # we don't use focal loss on evaluation + self.focal_alpha = 1 + self.focal_gamma = 0 + + def save_results(self, save_fn): + # convert loss_by_clas tensor to cpu + for key in self.loss_by_class.keys(): + self.loss_by_class[key] = torch.tensor(self.loss_by_class[key]).cpu() + self.count_by_class[key] = torch.tensor(self.count_by_class[key]).cpu() + torch.save({'loss_by_class':self.loss_by_class, 'count_by_class':self.count_by_class}, save_fn) + + @torch.inference_mode() + def get_perplexity(self,less_than=256): + for data in tqdm(self.test_set.data_list, desc='Cal over dataset', position=0): + data_tensor = torch.LongTensor(data[0]) + if self.config.nn_params.encoding_scheme == 'nb': + data_tensor = shift_and_pad(data_tensor, self.first_pred_feature) + data_tensor = data_tensor[:-1] + + x_seg = data_tensor[:-1].unsqueeze(0) + y_seg = data_tensor[1:].unsqueeze(0) + self._cal_initial_seg(x_seg, y_seg) + + if x_seg.shape[1] > self.input_len: + cat_logits = [] + cat_y = [] + cat_mask_indices = [] + batch_x = x_seg[0, 1:].unfold(dimension=0, size=self.input_len, step=1) + batch_y = y_seg[0, 1:].unfold(dimension=0, size=self.input_len, step=1) + if self.is_multiclass: + batch_x = batch_x.transpose(1,2) + batch_y = batch_y.transpose(1,2) + for 
batch_start_idx in tqdm(range(0, min(batch_x.shape[0], less_than), self.batch_size), desc='In piece iter', position=1, leave=False): + x = batch_x[batch_start_idx:batch_start_idx+self.batch_size] + y = batch_y[batch_start_idx:batch_start_idx+self.batch_size] + logits, y,mask_indices = self._cal_following_seg(x, y) + cat_logits.append(logits) + cat_y.append(y) + cat_mask_indices.append(mask_indices) + if self.is_multiclass: + cat_dict = {} + for key in self.vocab.feature_list: + cat_dict[key] = torch.cat([logits_dict[key] for logits_dict in cat_logits], dim=0) + cat_logits = cat_dict + else: + cat_logits = torch.cat(cat_logits, dim=0) + cat_y = torch.cat(cat_y, dim=0) + mask_indices = torch.cat(cat_mask_indices, dim=0) + if self.is_multiclass: + self._update_loss_for_multi_class(cat_logits, cat_y,mask_indices) + else: + cat_prob = torch.nn.functional.softmax(cat_logits, dim=-1) + pt = cat_prob[torch.arange(cat_prob.shape[0]), cat_y] + # focal_loss = -self.focal_alpha * (1-pt)**self.focal_gamma * torch.log(pt) # [batch_size*seq_len] + loss = -torch.log(pt) + self._update_loss_for_single_class(loss, cat_y) + + @torch.inference_mode() + def _update_loss_for_single_class(self, neg_log_prob:torch.Tensor, y:torch.Tensor): + for key in self.vocab.feature_list: + feature_mask = self.vocab.total_mask[key].to(y.device) # [vocab_size,] + mask_for_target = feature_mask[y] # [b*t] + normal_loss_seq_by_class = neg_log_prob[mask_for_target==1] + if mask_for_target.sum().item() != 0: + self.loss_by_class[key] += normal_loss_seq_by_class.tolist() + self.count_by_class[key] += mask_for_target.sum().item() + + @torch.inference_mode() + def _update_loss_for_multi_class(self, logits_dict:dict, tgt:torch.Tensor, mask_indices:torch.Tensor=None): + correct_token_prob = [] + for index, key in enumerate(self.vocab.feature_list): + feat_tgt = tgt[:,index] + logit_values = logits_dict[key] + logit_values = logit_values + prob_values = torch.nn.functional.softmax(logit_values, dim=-1) + # replce the false + correct_token_prob.append(prob_values[torch.arange(prob_values.shape[0]), feat_tgt]) + correct_token_prob = torch.stack(correct_token_prob, dim=1) + # tgt = reverse_shift_and_pad_for_tensor(tgt, self.first_pred_feature) + y_decoded = self.vocab.decode(tgt) + y_decoded = add_conti(y_decoded, self.config.nn_params.encoding_scheme) + # correct_token_prob = reverse_shift_and_pad_for_tensor(correct_token_prob, self.first_pred_feature) + num_notes = logits_dict['pitch'].shape[0] + cum_prob = 1 + max_num = mask_indices.size(0) + for idx in range(max_num): + if max_num != num_notes: + print("not equal",max_num,num_notes) + token = y_decoded[idx] + vaild_mask = mask_indices[idx,:] + token_prob = correct_token_prob[idx].tolist() + for j, key in enumerate(self.vocab.feature_list): + cur_feature = token[j] + whether_predicted = vaild_mask[j] + # clamp cur_prob to avoid when cur_prob is 0 + cur_prob = max(token_prob[j], 1e-10) + if cur_feature == 0: # ignore token + continue + if whether_predicted is False: # skip provided token + continue + if cur_feature in self.neglect_keywords: + cum_prob *= cur_prob + continue + if self.config.nn_params.encoding_scheme == 'cp' and 'time_signature' in cur_feature: + cum_prob *= cur_prob + continue + if self.config.nn_params.encoding_scheme == 'cp' and 'Bar' in cur_feature: + cum_prob = 1 + continue + self.valid_item_prob.append([cur_feature, cur_prob, cur_prob*cum_prob]) + pt = cur_prob*cum_prob + loss = -log(pt) + self.loss_by_class[key].append(loss) + self.count_by_class[key] += 1 + 
cum_prob = 1 + + @torch.inference_mode() + def _cal_initial_seg(self, x_seg, y_seg): + x, y = x_seg[:, :self.input_len].to(self.device), y_seg[:, :self.input_len].to(self.device) + mask_indices = torch.ones_like(y).bool().to(self.device).flatten(0,1) + if self.config.use_diff is True: + logits,(mask_indices,_) = self.model(x, y) + else: + logits = self.model(x, y) + y = y.flatten(0,1) + if self.is_multiclass: + for key in logits.keys(): + feat_tensor = logits[key].flatten(0,1) + logits[key] = feat_tensor + self._update_loss_for_multi_class(logits, y, mask_indices) + else: + prob = torch.nn.functional.softmax(logits, dim=-1) + prob = prob.flatten(0,1) + pt = prob[torch.arange(len(y)), y] + loss = -torch.log(pt) + self._update_loss_for_single_class(loss, y) + + @torch.inference_mode() + def _cal_following_seg(self, x:torch.Tensor, y:torch.Tensor): + x, y = x.to(self.device), y.to(self.device) + mask_indices = torch.ones_like(y).bool().to(self.device) + if self.config.use_diff is True: + logits,(mask_indices,_) = self.model(x, y) + else: + logits = self.model(x, y) + y = y[:, -1:].flatten(0,1).cpu() + mask_indices = mask_indices.reshape(x.shape)[:,-1:].flatten(0,1).cpu() + if self.is_multiclass: + logits_dict = {} + for key in self.vocab.feature_list: + logits_dict[key] = logits[key][:, -1:].flatten(0,1).cpu() + return logits_dict, y,mask_indices + else: + logits = logits[:, -1:].flatten(0,1).cpu() + return logits, y,mask_indices + + def prepare_prompt_and_ground_truth(self, save_dir, num_target_samples, num_target_measures): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + for i, (tuneidx, tune_name) in enumerate(self.test_set): + ground_truth_sample = tuneidx + try: + decoder(ground_truth_sample, output_path=str(save_dir / f"{i}_{tune_name}_gt.mid")) + except: + print(f"Error in generating {i}_{tune_name}.mid") + + prompt = self.model.decoder._prepare_inference(start_token=self.model.decoder.net.start_token, manual_seed=0, condition=tuneidx, num_target_measures=num_target_measures) + try: + decoder(prompt, output_path=str(save_dir / f"{i}_{tune_name}_prompt.mid")) + except: + print(f"Error in generating {i}_{tune_name}_prompt.mid") + + if i == num_target_samples: + break + + def generate_samples_with_prompt(self, save_dir, num_target_measures, tuneidx, tune_name, first_pred_feature, sampling_method=None, threshold=None, temperature=1.0,generation_length=3072): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + 
tuneidx = tuneidx.cuda() + generated_sample = self.model.generate(0, generation_length, condition=tuneidx, num_target_measures=num_target_measures, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + decoder(generated_sample, output_path=str(save_dir / f"{tune_name}.mid")) + + prompt = self.model.decoder._prepare_inference(self.model.decoder.net.start_token, 0, tuneidx, num_target_measures=8) + decoder(prompt, output_path=str(save_dir / f"{tune_name}_prompt.mid")) + + def generate_samples_unconditioned(self, save_dir, num_samples, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + for i in range(num_samples): + generated_sample = self.model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + decoder(generated_sample, output_path=str(save_dir / f"{uid}_{i}.mid")) + + def generate_samples_with_text_prompt(self, save_dir, prompt, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = self.config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base') + encoder = T5EncoderModel.from_pretrained('google/flan-t5-base').to(self.device) + print(f"Using T5EncoderModel for text prompt: {prompt}") + context = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(self.device) + context = encoder(**context).last_hidden_state + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + generated_sample = self.model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + # Open the jsonl file and count the number of lines to determine the current index + jsonl_path = save_dir / "name2prompt.jsonl" + if jsonl_path.exists(): + with open(jsonl_path, 'r') as f: + current_idx = sum(1 for _ in f) + else: + current_idx = 0 + + name = f"prompt_{current_idx}" + name2prompt_dict = 
defaultdict(list) + name2prompt_dict[name].append(prompt) + with open(jsonl_path, 'a') as f: + f.write(json.dumps(name2prompt_dict) + '\n') + decoder(generated_sample, output_path=str(save_dir / f"{name}_{uid}.mid")) diff --git a/Amadeus/model_zoo.py b/Amadeus/model_zoo.py new file mode 100644 index 0000000..492c8c9 --- /dev/null +++ b/Amadeus/model_zoo.py @@ -0,0 +1,512 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm.auto import tqdm +import time +import json + +from . import transformer_utils +from . import sub_decoder_zoo +from x_transformers.x_transformers import LayerIntermediates, AbsolutePositionalEmbedding +from data_representation.vocab_utils import LangTokenVocab +import os + +class AmadeusModelWrapper(nn.Module): + def __init__( + self, + *, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + ''' + This class wraps the three main components of the AmadeusModel model, + which are the input embedding layer, the main transformer decoder, and the sub-decoder. + ''' + + super().__init__() + self.vocab = vocab + self.vocab_size = vocab.get_vocab_size() + self.start_token = vocab.sos_token if hasattr(vocab, 'sos_token') else None + self.end_token = vocab.eos_token if hasattr(vocab, 'eos_token') else None + self.input_length = input_length + self.prediction_order = prediction_order + self._get_input_embedder(input_embedder_name, vocab, dropout, dim) + self._get_main_decoder(main_decoder_name, input_length, dim, heads, depth, dropout) + self._get_sub_decoder(sub_decoder_name, prediction_order, vocab, sub_decoder_depth, sub_decoder_enricher_use, dim, heads, dropout) + self.bos_token_hidden = None + + def _get_input_embedder(self, input_embedder_name, vocab, dropout, dim): + self.emb_dropout = nn.Dropout(dropout) + self.input_embedder = getattr(transformer_utils, input_embedder_name)( + vocab=vocab, + dim_model=dim + ) + + def _get_main_decoder(self, main_decoder_name, input_length, dim, heads, depth, dropout): + self.pos_enc = AbsolutePositionalEmbedding(dim, input_length) + self.main_norm = nn.LayerNorm(dim) + self.main_decoder = getattr(transformer_utils, main_decoder_name)( + dim=dim, + depth=depth, + heads=heads, + dropout=dropout + ) + + def _get_sub_decoder(self, sub_decoder_name, prediction_order, vocab, sub_decoder_depth, sub_decoder_enricher_use, dim, heads, dropout): + self.sub_decoder = getattr(sub_decoder_zoo, sub_decoder_name)( + prediction_order=prediction_order, + vocab=vocab, + dim=dim, + sub_decoder_depth=sub_decoder_depth, + heads=heads, + dropout=dropout, + sub_decoder_enricher_use=sub_decoder_enricher_use + ) + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor, context=None): + embedding = self.input_embedder(input_seq) + self.pos_enc(input_seq) + embedding = self.emb_dropout(embedding) + hidden_vec,layer_inter = self.main_decoder(embedding,train=True, context=context) # B x T x d_model + hidden_vec = self.main_norm(hidden_vec) + input_dict = {'hidden_vec':hidden_vec, 'input_seq': input_seq, 'target': target, 'bos_token_hidden': self.bos_token_hidden} + logits = self.sub_decoder(input_dict) + # pick the intermediate layer closest to one third of the total depth + num_layers = len(layer_inter.layer_hiddens) + idx = round(num_layers / 3) + idx = min(max(idx, 0), num_layers - 1) + 
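# e.g. with a 12-layer main decoder, round(12 / 3) = 4, so the hidden state roughly one third of the way up the stack is what gets placed back into input_dict below + 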
input_dict['hidden_vec'] = layer_inter.layer_hiddens[idx] + return logits, input_dict + +class AmadeusModelAutoregressiveWrapper(nn.Module): + def __init__(self, net:AmadeusModelWrapper): + ''' + Initializes an autoregressive wrapper around the AmadeusModelWrapper, + which allows sequential token generation. + + Arguments: + - net: The nested music transformer model that performs the token generation. + ''' + super().__init__() + self.net = net + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor,context=None): + return self.net(input_seq, target, context=context) + + def _prepare_inference(self, start_token, manual_seed, condition=None, num_target_measures=4): + ''' + Prepares the initial tokens for autoregressive inference. If a manual seed is provided, + it sets the seed for reproducibility. If a condition is given, it selects a subset of + the tokens based on certain criteria related to the encoding scheme. + + Arguments: + - start_token: The token that represents the start of a sequence. + - manual_seed: A seed value for reproducibility in inference (if greater than 0). + - condition: An optional tensor used for conditional generation, which helps select a + portion of the input tokens based on the encoding scheme. + + Returns: + - total_out: A tensor containing the initial tokens for inference, padded to ensure compatibility + with the model. + ''' + if manual_seed > 0: + torch.manual_seed(manual_seed) + + total_out = [] + if condition is None: + # Use the start token if no condition is given + total_out.extend(start_token) + else: + # Extract the portion of the sequence depending on encoding scheme (remi, cp, or nb) + if self.net.vocab.encoding_scheme == 'remi': + type_boundaries = self.net.vocab.remi_vocab_boundaries_by_key['type'] + # vocab idx -> 0:SOS, 1:EOS, 2:Bar_without_time_signature, ... where_type_ends:Bar_time_signature_end, ... + measure_bool = (2 <= condition) & (condition < type_boundaries[1]) # between Bar_ts_start and Bar_ts_end + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + elif self.net.vocab.encoding_scheme == 'cp': + # find the start and end of the measure + beat_event2idx = self.net.vocab.event2idx['beat'] + for event, idx in beat_event2idx.items(): + if event == 0: + continue + if event == 'Bar': + start_idx = idx + elif event.startswith('Beat'): + end_idx = idx + break + measure_bool = (condition[:,1] >= start_idx) & (condition[:,1] < end_idx) # measure tokens + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + # measure_bool = (condition[:,1] == 1) # measure tokens + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + elif self.net.vocab.encoding_scheme == 'nb': + measure_bool = (condition[:,0] == 2) | (condition[:,0] >= 5) # Empty measure or where new measure starts + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + + if conditional_input_len == 0: + conditional_input_len = 50 + + selected_tokens = condition[:conditional_input_len].tolist() + total_out.extend(selected_tokens) + + total_out = torch.LongTensor(total_out).unsqueeze(0).to(self.net.device) + return total_out + + def _run_one_step(self, input_seq, cache=None, sampling_method=None, threshold=None, temperature=1, bos_hidden_vec=None,context=None): + ''' + Runs one step of autoregressive generation by taking the input sequence, embedding it, + passing it through the main decoder, and generating logits and a sampled token. 
+ + Arguments: + - input_seq: The input sequence tensor to be embedded and processed. + - cache: Optional cache for attention mechanisms to avoid recomputation. + - sampling_method: Sampling strategy used to select the next token. + - threshold: Optional threshold value for sampling methods that require it. + - temperature: Controls the randomness of predictions (higher temperature increases randomness). + + Returns: + - logits: The predicted logits for the next token. + - sampled_token: The token sampled from the logits. + - intermidiates: Intermediate states from the main decoder, useful for caching. + ''' + embedding = self.net.input_embedder(input_seq) + self.net.pos_enc(input_seq) + embedding = self.net.emb_dropout(embedding) + + # Run through the main decoder and normalize + hidden_vec, intermidiates = self.net.main_decoder(embedding, cache,context_embedding=context) # B x T x d_model + hidden_vec = self.net.main_norm(hidden_vec) + hidden_vec = hidden_vec[:, -1:] # Keep only the last time step + + input_dict = {'hidden_vec': hidden_vec, 'input_seq': input_seq, 'target': None, 'bos_token_hidden': bos_hidden_vec} + + # Generate the next token + logits, sampled_token = self.net.sub_decoder(input_dict, sampling_method, threshold, temperature) + return logits, sampled_token, intermidiates, hidden_vec + + def _update_total_out(self, total_out, sampled_token): + ''' + Updates the output sequence with the newly sampled token. Depending on the encoding scheme, + it either appends the token directly or processes feature-based sampling. + + Arguments: + - total_out: The tensor containing the previously generated tokens. + - sampled_token: The newly generated token to be appended. + + Returns: + - total_out: Updated output tensor with the newly generated token. + - sampled_token: The processed sampled token. + ''' + if self.net.vocab.encoding_scheme == 'remi': + # For remi encoding, directly append the sampled token + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=-1) + else: + # Handle other encoding schemes by concatenating features + sampled_token_list = [] + for key in self.net.vocab.feature_list: + sampled_token_list.append(sampled_token[key]) + sampled_token = torch.cat(sampled_token_list, dim=-1) + # print(total_out.shape) + if len(sampled_token.shape) == 2: + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=1) + total_out = torch.cat([total_out, sampled_token.unsqueeze(0).unsqueeze(0)], dim=1) + + return total_out, sampled_token + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1, batch_size=1, context=None): + ''' + Autoregressively generates a sequence of tokens by repeatedly sampling the next token + until the desired maximum sequence length is reached or the end token is encountered. + + Arguments: + - manual_seed: A seed value for reproducibility in inference. + - max_seq_len: The maximum length of the generated sequence. + - condition: An optional conditioning sequence to start generation from. + - sampling_method: The method used to sample the next token (e.g., greedy, top-k). + - threshold: Optional threshold for sampling (used in methods like top-p sampling). + - temperature: Controls the randomness of the token sampling process. + - batch_size: The number of sequences to generate in parallel. + + Returns: + - total_out: The generated sequence of tokens as a tensor. 
+ ''' + # Prepare the starting sequence for inference + total_out = self._prepare_inference(self.net.start_token, manual_seed, condition, num_target_measures) + + # If a condition is provided, run one initial step + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context) + else: + cache = LayerIntermediates() + + # Continue generating tokens until the maximum sequence length is reached + pbar = tqdm(total=max_seq_len, desc="Generating tokens", unit="token") + bos_hidden_vec = None + hidden_vec_list = [] + token_time_list = [] + while total_out.shape[1] < max_seq_len: + pbar.update(1) + input_tensor = total_out[:, -self.net.input_length:] + # Generate the next token and update the cache + time_start = time.time() + _, sampled_token, cache, hidden_vec = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature,bos_hidden_vec=bos_hidden_vec, context=context) + time_end = time.time() + token_time_list.append(time_end - time_start) + if bos_hidden_vec is None: + bos_hidden_vec = hidden_vec + hidden_vec_list.append(hidden_vec) + # Update attention cache to handle autoregressive generation + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.net.input_length - 1):, :] for t in inter.cached_kv] + + # Update the generated output with the new token + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + + # Stop if the end token is reached + if sampled_token.tolist() == self.net.end_token[0]: + break + # append hidden_vec to pkl + + # save_path = 'hidden/diffnoaug_hidden_vec.pt' + # save_time_path = 'hidden/diff_noaug_token_time.json' + # if os.path.exists(save_path): + # # Load existing list and append + # hidden_vec_all = torch.load(save_path, map_location="cpu") + # hidden_vec_all.extend(hidden_vec_list) + # torch.save(hidden_vec_all, save_path) + # else: + # torch.save(hidden_vec_list, save_path) + + # if os.path.exists(save_time_path): + # # Load existing list and append + # token_time_all = json.load(open(save_time_path, 'r')) + # token_time_all = token_time_all['token_time_list'] + # token_time_all.extend(token_time_list) + # average_time = sum(token_time_all) / len(token_time_all) + # data = { + # 'average_time': average_time, + # 'token_time_list': token_time_all + # } + # json.dump(data, open(save_time_path, 'w'), indent=4) + # else: + # average_time = sum(token_time_list) / len(token_time_list) + # data = { + # 'average_time': average_time, + # 'token_time_list': token_time_list + # } + # json.dump(data, open(save_time_path, 'w'), indent=4) + + return total_out + + def generate_batch(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1, batch_size=1): + ''' + Autoregressively generates a sequence of tokens by repeatedly sampling the next token + until the desired maximum sequence length is reached or the end token is encountered. + + Arguments: + - manual_seed: A seed value for reproducibility in inference. + - max_seq_len: The maximum length of the generated sequence. + - condition: An optional conditioning sequence to start generation from. + - sampling_method: The method used to sample the next token (e.g., greedy, top-k). + - threshold: Optional threshold for sampling (used in methods like top-p sampling). 
+ - temperature: Controls the randomness of the token sampling process. + - batch_size: The number of sequences to generate in parallel. + + Returns: + - total_out: The generated sequence of tokens as a tensor. + ''' + # Prepare the starting sequence for inference + total_out = self._prepare_inference(self.net.start_token, manual_seed, condition, num_target_measures) + # total_out (1,1,num) -> (bs,1,num) + total_out = total_out.repeat(batch_size, 1, 1) + # If a condition is provided, run one initial step + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature) + else: + cache = LayerIntermediates() + + # Continue generating tokens until the maximum sequence length is reached + pbar = tqdm(total=max_seq_len, desc="Generating tokens", unit="token") + while total_out.shape[1] < max_seq_len: + pbar.update(1) + input_tensor = total_out[:, -self.net.input_length:] + + # Generate the next token and update the cache + _, sampled_token, cache = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + + # Update attention cache to handle autoregressive generation + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.net.input_length - 1):, :] for t in inter.cached_kv] + + # Update the generated output with the new token + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + + # Stop if the end token is reached + if sampled_token.tolist() == self.net.end_token[0]: + break + + return total_out + +class AmadeusModel(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + ''' + This class combines the wrapper classes and initializes the full AmadeusModel model, + which can perform autoregressive sequence generation for symbolic music. + + Vocabulary used for tokenization of the symbolic music data. + Length of the input seqkeuence in tokens. + Defines the order in which features are predicted in a sequence used for compound shift + Name of the input embedding model to be used (e.g., one-hot embedding or learned embeddings). + Name of the main transformer decoder model used for generating the hidden representations for compound tokens. + Name of the sub-decoder, which processes the hidden states and decodes the sub-tokens inside the compound tokens. + Depth (number of layers) of the sub-decoder. + Whether to use an additional enricher module in the sub-decoder to refine representations. + Dimensionality of the model (hidden size of the transformer layers). + Number of attention heads in the transformer layers. + Number of layers in the main decoder. + Dropout rate for all layers in the model. 
+ ''' + + super().__init__() + decoder = AmadeusModelWrapper( + vocab=vocab, + input_length=input_length, + prediction_order=prediction_order, + input_embedder_name=input_embedder_name, + main_decoder_name=main_decoder_name, + sub_decoder_name=sub_decoder_name, + sub_decoder_depth=sub_decoder_depth, + sub_decoder_enricher_use=sub_decoder_enricher_use, + dim=dim, + heads=heads, + depth=depth, + dropout=dropout + ) + self.decoder = AmadeusModelAutoregressiveWrapper( + net=decoder + ) + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor, context=None): + return self.decoder(input_seq, target, context=context) + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1,batch_size=1,context=None): + if batch_size == 1: + return self.decoder.generate(manual_seed, max_seq_len, condition, num_target_measures, sampling_method, threshold, temperature, context=context) + else: + return self.decoder.generate_batch(manual_seed, max_seq_len, condition, num_target_measures, sampling_method, threshold, temperature, batch_size, context=context) + +class AmadeusModel4Encodec(AmadeusModel): + def __init__( + self, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + super().__init__( + vocab=vocab, + input_length=input_length, + prediction_order=prediction_order, + input_embedder_name=input_embedder_name, + main_decoder_name=main_decoder_name, + sub_decoder_name=sub_decoder_name, + sub_decoder_depth=sub_decoder_depth, + sub_decoder_enricher_use=sub_decoder_enricher_use, + dim=dim, + heads=heads, + depth=depth, + dropout=dropout + ) + + def _prepare_inference(self, start_token, manual_seed, condition=None): + if manual_seed > 0: + torch.manual_seed(manual_seed) + total_out = [] + if condition is None: + total_out.extend(start_token) + else: + if self.decoder.net.vocab.encoding_scheme == 'remi': + selected_tokens = condition[:1500].tolist() + else: + selected_tokens = condition[:500].tolist() + total_out.extend(selected_tokens) + total_out = torch.LongTensor(total_out).unsqueeze(0).to(self.decoder.net.device) + return total_out + + def _update_total_out(self, total_out, sampled_token): + if self.decoder.net.vocab.encoding_scheme == 'remi': + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=-1) + else: + sampled_token_list = [] + for key in self.decoder.net.vocab.feature_list: + sampled_token_list.append(sampled_token[key]) + sampled_token = torch.cat(sampled_token_list, dim=-1) # B(1) x num_features + total_out = torch.cat([total_out, sampled_token.unsqueeze(0).unsqueeze(0)], dim=1) + return total_out, sampled_token + + def _run_one_step(self, input_seq, cache=None, sampling_method=None, threshold=None, temperature=1): + embedding = self.decoder.net.input_embedder(input_seq) + self.decoder.net.pos_enc(input_seq) + embedding = self.decoder.net.emb_dropout(embedding) + hidden_vec, intermidiates = self.decoder.net.main_decoder(embedding, cache) # B x T x d_model + hidden_vec = self.decoder.net.main_norm(hidden_vec) + hidden_vec = hidden_vec[:, -1:] # B x 1 x d_model + input_dict = {'hidden_vec':hidden_vec, 'input_seq': input_seq, 'target': None} + if self.decoder.net.vocab.encoding_scheme == 'remi': + feature_class_idx = (input_seq.shape[1] - 1) % 4 + feature_type = 
self.decoder.net.vocab.feature_list[feature_class_idx] + logits, sampled_token = self.decoder.net.sub_decoder.run_one_step(input_dict, sampling_method, threshold, temperature, feature_type) + else: + logits, sampled_token = self.decoder.net.sub_decoder(input_dict, sampling_method, threshold, temperature) + return logits, sampled_token, intermidiates + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, sampling_method=None, threshold=None, temperature=1): + total_out = self._prepare_inference(self.decoder.net.start_token, manual_seed, condition) + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.decoder.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature) + else: + cache = LayerIntermediates() + while total_out.shape[1] < max_seq_len: + input_tensor = total_out[:, -self.decoder.net.input_length:] + _, sampled_token, cache = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.decoder.net.input_length - 1):, :] for t in inter.cached_kv] # B x num_heads x T x d_head + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + if sampled_token.tolist() == self.decoder.net.end_token[0]: + break + return total_out \ No newline at end of file diff --git a/Amadeus/sampling_utils.py b/Amadeus/sampling_utils.py new file mode 100644 index 0000000..28f652b --- /dev/null +++ b/Amadeus/sampling_utils.py @@ -0,0 +1,168 @@ +import torch +import torch.nn.functional as F + +def top_p_sampling(logits, thres=0.9): + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + sorted_indices_to_remove = cum_probs > thres + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Create an empty tensor to hold the new logits + new_logits = logits.clone() + + # Use the sorted indices to place the '-inf' in the original places + indices_to_remove = sorted_indices[sorted_indices_to_remove] + new_logits[..., indices_to_remove] = float('-inf') + return new_logits + + +# refered: https://github.com/cimeister/typical-sampling +def typical_sampling(logits, thres=0.99): + # calculate entropy + normalized = torch.nn.functional.log_softmax(logits, dim=-1) + p = torch.exp(normalized) + ent = -(normalized * p).nansum(-1, keepdim=True) + + # shift and sort + shifted_scores = torch.abs((-normalized) - ent) + sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) + sorted_logits = logits.gather(-1, sorted_indices) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative mass above the threshold + last_ind = (cumulative_probs < thres).sum(dim=-1) + last_ind[last_ind < 0] = 0 + sorted_indices_to_remove = sorted_scores > sorted_scores.gather(-1, last_ind.view(-1, 1, 1)) + # if self.min_tokens_to_keep > 1: + # # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + # sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + indices_to_remove = sorted_indices_to_remove.scatter(2, sorted_indices, sorted_indices_to_remove) + + scores = logits.masked_fill(indices_to_remove, float("-inf")) + return scores + +def add_gumbel_noise(logits, temperature): + ''' + The Gumbel 
max is a method for sampling categorical distributions. + According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality. + Thus, we use float64. + ''' + if temperature == 0: + return logits + logits = logits.to(torch.float64) + noise = torch.rand_like(logits, dtype=torch.float64) + gumbel_noise = (- torch.log(noise)) ** temperature + return logits.exp() / gumbel_noise + # +# referred: https://github.com/john-hewitt/truncation-sampling +def eta_sampling(logits, epsilon) -> torch.FloatTensor: + probabilities = logits.softmax(dim=-1) + entropy = torch.distributions.Categorical(probs=probabilities).entropy() + new_epsilon = min(epsilon, torch.sqrt(torch.tensor(epsilon))*torch.exp(-entropy)) + indices_to_remove = probabilities < new_epsilon + max_word = torch.argmax(logits, dim=-1) + indices_to_remove[..., max_word.squeeze()] = 0 + new_scores = logits.masked_fill(indices_to_remove, float("-inf")) + return new_scores + +def sample(logits, sampling_method, threshold, temperature): + """Sample from the logits with a specific sampling strategy.""" + if sampling_method == "top_p": + probs = F.softmax(top_p_sampling(logits, thres=threshold) / temperature, dim=-1) + elif sampling_method == "typical": + probs = F.softmax(typical_sampling(logits, thres=threshold) / temperature, dim=-1) + elif sampling_method == "eta": + probs = F.softmax(eta_sampling(logits, epsilon=threshold) / temperature, dim=-1) + else: + probs = F.softmax(logits / temperature, dim=-1) + return torch.multinomial(probs[-1,-1,:], 1) + +def sample_with_prob(logits, sampling_method, threshold, temperature): + """Sample from the logits with a specific sampling strategy and return the token and its probability.""" + # temporarily apply the sampling method to logits + logits = logits / temperature + # logits = add_gumbel_noise(logits, temperature) + + if sampling_method == "top_p": + modified_logits = top_p_sampling(logits, thres=threshold) + elif sampling_method == "typical": + modified_logits = typical_sampling(logits, thres=threshold) + elif sampling_method == "eta": + modified_logits = eta_sampling(logits, epsilon=threshold) + else: + modified_logits = logits # otherwise use the raw logits + + # print(modified_logits.shape) + # apply the temperature adjustment and compute probabilities + # probs = F.softmax(modified_logits / temperature, dim=-1) + probs = F.softmax(modified_logits, dim=-1) + + # take the probability distribution at the last position + # probs_last = probs[-1, -1, :] + # print(probs.shape) + probs_last = probs[-1, -1, :] + + # sample a token + sampled_token = torch.multinomial(probs_last, num_samples=1) + # look up the probability of the sampled token + prob_value = probs_last[sampled_token] + + return sampled_token, prob_value.squeeze()
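For reference, a minimal usage sketch of the samplers above (a hypothetical snippet: the logits shape and values are illustrative, and it assumes the package is importable as Amadeus):

import torch
from Amadeus.sampling_utils import sample_with_prob

logits = torch.randn(1, 1, 128)  # B x T x vocab_size, illustrative
token, prob = sample_with_prob(logits, sampling_method="top_p", threshold=0.9, temperature=1.0)
print(token.item(), prob.item())  # sampled vocab index and its probability

sample_with_prob_fast below applies the same idea to a whole [B*T, num_sub_tokens, vocab_size] batch by flattening before torch.multinomial.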
+ +def top_p_sampling_fast(logits, thres=0.9): + """ + logits: Tensor of shape [B, L, V] + Returns: logits with low-prob tokens masked as -inf, shape [B, L, V] + """ + # Step 1: sort logits and get indices + sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True) # [B, L, V] + + # Step 2: compute cumulative probs + probs = F.softmax(sorted_logits, dim=-1) # [B, L, V] + cum_probs = torch.cumsum(probs, dim=-1) # [B, L, V] + + # Step 3: mask tokens beyond cumulative threshold + sorted_mask = cum_probs > thres + sorted_mask[..., 1:] = sorted_mask[..., :-1].clone() + sorted_mask[..., 0] = False # always keep at least one token + + # Step 4: scatter back to original order + # Create mask of same shape as logits, default False + mask = torch.zeros_like(logits, dtype=torch.bool) # [B, L, V] + mask = mask.scatter(-1, sorted_indices, sorted_mask) + + # Step 5: mask logits + logits = logits.masked_fill(mask, float('-inf')) # final masked logits + + return logits + +def sample_with_prob_fast(logits, sampling_method="top_p", threshold=0.9, temperature=1.0, mask_indices=None): + """ + logits: [B*T, num_sub_tokens, vocab_size] + mask_indices: mask indicating which tokens to sample, shape = [B*T, num_sub_tokens] + """ + if temperature != 1.0: + logits = logits / temperature + + if sampling_method == "top_p": + logits = top_p_sampling_fast(logits, thres=threshold) # should support batch + elif sampling_method == "typical": + logits = typical_sampling(logits, thres=threshold) + elif sampling_method == "eta": + logits = eta_sampling(logits, epsilon=threshold) + # else: keep logits as-is + + probs = torch.softmax(logits, dim=-1) # [B*T, num_sub_tokens, vocab_size] + + B, L, V = probs.shape + probs_flat = probs.view(-1, V) # [(B*T * num_sub_tokens), V] + + # sampling: torch.multinomial cannot take 3-D input at once, so flatten, sample, then reshape + sampled = torch.multinomial(probs_flat, num_samples=1) # [(B*T * num_sub_tokens), 1] + sampled = sampled.view(B, L) # [B*T, num_sub_tokens] + + sampled_probs = torch.gather(probs, 2, sampled.unsqueeze(-1)).squeeze(-1) # [B*T, num_sub_tokens] + + return sampled, sampled_probs diff --git a/Amadeus/sub_decoder_utils.py b/Amadeus/sub_decoder_utils.py new file mode 100644 index 0000000..3109ef4 --- /dev/null +++ b/Amadeus/sub_decoder_utils.py @@ -0,0 +1,228 @@ +from math import ceil + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MLP(nn.Module): + def __init__(self, in_size, out_size, hidden_size, dropout): + super().__init__() + self.out_size = out_size + self.layer = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.Dropout(dropout), + nn.ReLU(), + nn.Linear(hidden_size, out_size) + ) + + def forward(self, x): + return self.layer(x) + +class extendedMLP(nn.Module): + def __init__(self, in_size, out_size, num_layers, hidden_size, dropout): + super().__init__() + self.input_size = in_size + + self.layers = nn.ModuleList() + if num_layers == 1: + # Only one layer + self.layers.append(nn.Linear(in_size, out_size)) + return + elif num_layers > 1: + # First layer + self.layers.append(nn.Linear(in_size, hidden_size)) + self.layers.append(nn.Dropout(dropout)) + self.layers.append(nn.ReLU()) + # Intermediate layers + if num_layers > 2: + for _ in range(num_layers - 2): # -2 because we're manually adding the first and last layers + self.layers.append(nn.Linear(hidden_size, hidden_size)) + self.layers.append(nn.Dropout(dropout)) + self.layers.append(nn.ReLU()) + # Last layer + self.layers.append(nn.Linear(hidden_size, out_size)) + else: + raise ValueError("num_layers should be a positive integer") + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + +class multiMLP(nn.Module): + def __init__(self, in_size, out_size, hidden_size, dropout, pred_order): + super().__init__() + self.out_size = out_size + self.pred_order = pred_order # keep the order so forward() can look up each feature's head + self.layer = nn.ModuleList([MLP(in_size, out_size, hidden_size, dropout) for _ in pred_order]) + + def forward(self, x, choice): + ''' + x: B x T x d_model + choice: token type from self.pred_order (str or list of str) + ''' + if isinstance(choice, str): + idx = self.pred_order.index(choice) + return self.layer[idx](x) + elif len(choice) > 1 and not isinstance(choice, str): + raise ValueError("multiMLP doesn't support parallel prediction") + +class ResidualLayerNormModule(nn.Module): + def __init__(self, submodule: nn.Module): + super().__init__() + self.submodule = submodule + if submodule.__class__.__name__ == 
'MultiheadAttention': + self.layer_norm = nn.LayerNorm(self.submodule.embed_dim) + else: + self.layer_norm = nn.LayerNorm(self.submodule.input_size) + + def forward_attention(self, q, k, v, attn_mask, type): + attn_output, _ = self.submodule(q, k, v, attn_mask=attn_mask, need_weights=False, average_attn_weights=False) + return self.layer_norm(attn_output + q) + + def forward_mlp(self, x): + return self.layer_norm(self.submodule(x) + x) + +class MultiProj_hidden2logit(nn.Module): + def __init__(self, dim, vocab_sizes): + super().__init__() + self.layers = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + + def forward(self, hidden_vec, feature): + logit = self.layers[f"layer_{feature}"](hidden_vec) + return logit + +class MultiProj_catvec2hidden(nn.Module): + def __init__(self, config, par_pred_keys, seq_pred_keys): + super().__init__() + ''' + This class is used in SQstyleEachEmbStrategy + par_pred_keys: list of independent features(These tokens are predicted in parallel) + seq_pred_keys: list of sequential features(These tokens are predicted sequentially) + ''' + net_param = config.nn_params + self.d_model = net_param.model.d_model + independent_emb_size = 0 + for key in par_pred_keys: + independent_emb_size += net_param.emb[key] + self.layers = nn.ModuleDict({ + 'layer_independent': nn.Linear(self.d_model + independent_emb_size, self.d_model), + **{f"layer_{key}": nn.Linear(self.d_model + net_param.emb[key], self.d_model) for key in seq_pred_keys} + }) + self.par_pred_keys = par_pred_keys + self.seq_pred_keys = seq_pred_keys + self.dropout = nn.Dropout(0.1) + self.relu = nn.ReLU() + + def forward(self, x, choice): + ''' + x: B x T x (d_model + emb_size) + choice: key type (str or list of str) + ''' + if isinstance(choice, str): # single key + assert choice in self.seq_pred_keys + output = self.layers[f"layer_{choice}"](x) + return self.relu(self.dropout(output)) + elif len(choice) > 1 and not isinstance(choice, str): # multiple keys, parallel + assert choice == self.par_pred_keys # the order of choice should be the same as the order of self.par_pred_keys + output = self.layers['layer_independent'](x) + return self.relu(self.dropout(output)) + +def mask_tensor(tensor, mask_rate=0.15): + # Get the size of the tensor + batch_size, seq_len, dim = tensor.size() + # Calculate the total number of elements and the number to mask + total_elements = batch_size * seq_len + num_to_mask = int(total_elements * mask_rate) + # Create a 1D binary mask where 1 indicates that element will be masked. + # Start by creating a tensor of zeros with length equal to the total number of elements. 
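+ # Worked example (illustrative numbers, not from the original source): with batch_size=2, seq_len=8
+ # and mask_rate=0.15, total_elements = 16 and num_to_mask = int(16 * 0.15) = 2, so two random
+ # (batch, time) positions have their entire d_model vector zeroed out below.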
+ mask = torch.zeros(total_elements).to(tensor.device) + # Set `num_to_mask` random indices to 1 (masking) + indices_to_mask = torch.randperm(total_elements)[:num_to_mask] + mask[indices_to_mask] = 1 + # Reshape the mask to match the original tensor's shape + mask = mask.reshape(batch_size, seq_len) + mask = mask.unsqueeze(2) # B x T x 1 + masked_tensor = tensor * (mask == 0).float() # B x T x d_model + return masked_tensor + +def generate_causality_mask_on_window(size, window_size): + mask = torch.zeros((size, size)) + for i in range(size): + mask[i, i+window_size:] = 1 + return mask.bool() + +# generate boolean mask, if the value is 1 or true, it means the value is masked +# considers BOS token and mask margin +def generate_CA_mask(tgt_len, memory_len, mask_margin=0): + mask = torch.triu(torch.ones((tgt_len, memory_len)), diagonal=mask_margin+1) + return mask.bool() + +# generate boolean mask, if the value is 1 or true, it means the value is masked +def generate_SA_mask(tgt_len): + mask = torch.triu(torch.ones((tgt_len, tgt_len)), diagonal=1) + return mask.bool() + +def generate_none_causality_mask(tgt_len, memory_len): + mask = torch.zeros((tgt_len, memory_len)) + return mask.bool() + +class DecoderLayer(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory, 'memory_mask': CA_attn_mask} + ''' + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], input_dict['memory_mask'], type='cross') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory'], 'memory_mask': input_dict['memory_mask']} + return output_dict + +class TransformerLayer(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.self_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory, 'memory_mask': CA_attn_mask} + ''' + # self attention + attn_output = self.self_attn_block.forward_attention(input_dict['input_seq'], input_dict['input_seq'], input_dict['input_seq'], input_dict['memory_mask'], type='self') + + input_dict['input_seq'] = attn_output + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], input_dict['memory_mask'], type='cross') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory'], 'memory_mask': input_dict['memory_mask']} + return output_dict + +class 
FeatureEnricher(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory} + ''' + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], None, type='feature_enrichment') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory']} + return output_dict \ No newline at end of file diff --git a/Amadeus/sub_decoder_zoo.py b/Amadeus/sub_decoder_zoo.py new file mode 100644 index 0000000..e0994d6 --- /dev/null +++ b/Amadeus/sub_decoder_zoo.py @@ -0,0 +1,1280 @@ +from selectors import EpollSelector +from turtle import st +from numpy import indices +from sympy import Trace, false, true +import torch +import torch.profiler +import torch.nn as nn + +from x_transformers import Decoder + +from .transformer_utils import MultiEmbedding, RVQMultiEmbedding +from .sub_decoder_utils import * +from .sampling_utils import sample, sample_with_prob, sample_with_prob_fast, top_p_sampling, typical_sampling, eta_sampling + +from data_representation.vocab_utils import LangTokenVocab + +class SingleProjection(nn.Module): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + This sub-decoder is used for REMI based models + ''' + super().__init__() + vocab_size = vocab.get_vocab_size() + self.proj = nn.Linear(dim, vocab_size) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=1): + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + # ---- Generate(Inference) ---- # + if target is None: + logits = self.proj(hidden_vec[:, -1:]) + sampled_token = sample(logits, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + return logits, sampled_token + # ---- Training ---- # + logits = self.proj(hidden_vec) + return logits + +class SubDecoderClass(nn.Module): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__() + ''' + This is the base class for all sub-decoders + ''' + self.prediction_order = prediction_order + self.vocab = vocab + self.vocab_size = vocab.get_vocab_size() + # make layers + self._make_emb_layer(vocab, dim) + self._make_projection_layer(vocab, dim) + self._make_nonlinear_layer() + + @property + def device(self): + return next(self.parameters()).device + + def _make_emb_layer(self, vocab, dim): + self.emb_layer = MultiEmbedding( + vocab=vocab, + dim_model=dim + ) + + # def _make_projection_layer(self, vocab, dim): + # vocab_sizes = vocab.get_vocab_size() + # self.hidden2logit = nn.ModuleDict({ + # f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + # }) + + def _make_nonlinear_layer(self): + pass + def _make_projection_layer(self, vocab, dim): + vocab_sizes = vocab.get_vocab_size() + self.vocab_sizes = vocab_sizes + 
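# NOTE: max_vocab_size / projection_keys are kept for the optional padded, block-parallel projection path
+ # (the commented-out proj_weight / proj_bias buffers below, used by sample_from_logits_fast), while the
+ # per-feature Linear layers in hidden2logit remain the canonical parameters so the state_dict keeps matching. +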
self.max_vocab_size = max(vocab_sizes.values()) + self.projection_keys = list(vocab_sizes.keys()) # For index order + + # ✅ 保留原来的 Linear 层(这样 state_dict 可以匹配) + self.hidden2logit = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + + # # ✅ 构建用于 block 并行的权重 + # weight_list = [] + # bias_list = [] + + # for key in self.projection_keys: + # layer = self.hidden2logit[f"layer_{key}"] + # w = layer.weight + # b = layer.bias + + # # pad to max_vocab_size + # w_padded = F.pad(w, (0, 0, 0, self.max_vocab_size - w.shape[0])) + # b_padded = F.pad(b, (0, self.max_vocab_size - b.shape[0])) + + # weight_list.append(w_padded.unsqueeze(0)) # (1, Vmax, D) + # bias_list.append(b_padded.unsqueeze(0)) # (1, Vmax) + + # self.register_buffer("proj_weight", torch.cat(weight_list, dim=0)) # (F, Vmax, D) + # self.register_buffer("proj_bias", torch.cat(bias_list, dim=0)) # (F, Vmax) +class FeedForward(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + FeedForward sub-decoder is used for compound token like CP or NB. + We followed the original sub-decoder proposed in the paper "Compound Word Transformer", + however the embedding size for each sub-token or musical feature is the same in our implementation. + The reason for that is we didn't find any significant difference in the performance of the model + + There are two types of decoding style for the FeedForward sub-decoder: + 1. Partial-sequential prediction: Predict type token first and then predict all the sub-tokens in parallel (origianl CP) + 2. Fully-sequential prediction: Predict all the sub-tokens sequentially + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def _make_projection_layer(self, vocab, dim): + vocab_sizes = vocab.get_vocab_size() + self.hidden2logit = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + self.catvec2hidden = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim+dim, dim) for key, _ in vocab_sizes.items() + }) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + for feature in self.prediction_order: + if isinstance(feature, str): + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) # B x T x emb_size + catvec = torch.cat([hidden_vec, feature_emb.unsqueeze(0)], dim=-1) + hidden_vec = self.catvec2hidden[f"layer_{feature}"](catvec) + else: + assert feature == self.prediction_order[-1], "Parallel prediction should be the last feature" + for par_feature in feature: + logit = self.hidden2logit[f"layer_{par_feature}"](hidden_vec) + logits_dict[par_feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[par_feature] = sampled_token + return logits_dict, sampled_token_dict + + # ---- Training ---- # + for feature in self.prediction_order: + if isinstance(feature, str): + 
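# teacher forcing: condition the next sub-token on the ground-truth embedding of the current one
+ # (concatenate it to the hidden state and project back to d_model via catvec2hidden) +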
logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., self.vocab.feature_list.index(feature)]) # B x T x emb_size + catvec = torch.cat([hidden_vec, feature_emb], dim=-1) + hidden_vec = self.catvec2hidden[f"layer_{feature}"](catvec) + else: + assert feature == self.prediction_order[-1], "Parallel prediction should be the last feature" + for par_feature in feature: + logit = self.hidden2logit[f"layer_{par_feature}"](hidden_vec) + logits_dict[par_feature] = logit + return logits_dict + +class Parallel(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + Parallel sub-decoder is used for parallel prediction of multiple sub-tokens or musical features + This method is proposed in the paper "Multitrack Music Transformer" + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + for feature in self.prediction_order: + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + return logits_dict, sampled_token_dict + + # ---- Training ---- # + for feature in self.prediction_order: + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + return logits_dict + +class RNN(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + RNN sub-decoder is used for sequential prediction of multiple sub-tokens or musical features + This method is similar to the method proposed in "PianoTree VAE" + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.decoding_rnn = nn.GRU( + input_size=dim, + hidden_size=dim, + num_layers=sub_decoder_depth, + dropout=dropout, + batch_first=True) + + def _apply_pos_enc(self, tgt, apply_type='last'): + if apply_type == 'all': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + tgt_pos = tgt + self.pos_enc(pos.long()) + elif apply_type == 'last': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + pos_emb = self.pos_enc(pos.long()) + # zero out the pos_emb except for the last token + pos_emb[:, :-1, :] = 0 + tgt_pos = tgt + pos_emb + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, input_seq, target): + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = 
feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq = torch.cat([input_seq, feature_emb_reshape], dim=1) + return input_seq + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub_tokens-1 + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], -1)).unsqueeze(1) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape # (B*T) x 1 x d_model + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + h_0 = input_seq[:, 0, :].unsqueeze(0) # 1 x (B*T) x d_model + input_seq = self._apply_pos_enc(input_seq, apply_type='all') # (B*T) x 1 x d_model + for idx, feature in enumerate(self.prediction_order): + input_seq, _ = self.decoding_rnn(input_seq, h_0) # input_seq: (B*T) x (idx+1) x hidden_size, h_n: num_layers x (B*T) x hidden_size + logit = self.hidden2logit[f"layer_{feature}"](input_seq[:, -1, :]) # (B*T) x vocab_size + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + input_seq = torch.cat([input_seq, feature_emb_reshape], dim=1) # (B*T) x (idx+2) x d_model + input_seq = self._apply_pos_enc(input_seq, apply_type='last') # (B*T) x (idx+2) x d_model + return logits_dict, sampled_token_dict + + # ---- Training ---- # + input_seq = self._prepare_token_embedding_for_teacher_forcing(input_seq, target) # (B*T) x len(prediction_order) x d_model + # initial hidden state has no positional encoding + h0 = input_seq[:, 0, :].unsqueeze(0) # 1 x (B*T) x d_model + h0 = h0.contiguous() + # apply positional encoding + input_seq = self._apply_pos_enc(input_seq, apply_type='all') # (B*T) x len(prediction_order) x d_model + # get output using rnn + output, _ = self.decoding_rnn(input_seq, h0) # (B*T) x len(prediction_order) x d_model + output = output.reshape((hidden_vec.shape[0], hidden_vec.shape[1], len(self.prediction_order), -1)) # B x T x len(prediction_order) x d_model + for idx, feature in enumerate(self.prediction_order): + logit = self.hidden2logit[f"layer_{feature}"](output[:, :, idx, :]) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class SelfAttention(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + This sub-decoder is used for sequential prediction of multiple sub-tokens or musical features + This method is similar to the method proposed in "UniAudio", but different in making the sequence of sub-tokens. 
+ The UniAudio adds the output of the main decoder or hidden vec directly to embedding of the sub-token, + while our method puts the hidden vec in the input sequence so that the attention mechanism can learn the relationship between the hidden vec and the sub-token + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(1 + len(prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + + window_size = 1 # number of previous output of the main decoder to be used in the sub-decoder + causal_mask = generate_causality_mask_on_window(size=window_size + len(prediction_order), window_size=window_size) + self.register_buffer('causal_mask', causal_mask) + + self.transformer_decoder = Decoder( + dim = dim, + depth = sub_decoder_depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def _apply_pos_enc(self, tgt, apply_type='last'): + if apply_type == 'all': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + tgt_pos = tgt + self.pos_enc(pos.long()) + elif apply_type == 'last': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + pos_emb = self.pos_enc(pos.long()) # (B*T) x (window_size + BOS + num_sub_tokens-1) x dim + # zero out the pos_emb except for the last token + pos_emb[:, :-1, :] = 0 + tgt_pos = tgt + pos_emb + return tgt_pos + + def _prepare_input_seq_list(self, hidden_vec_reshape, target=None): + input_seq_list = [] + input_seq_list.append(hidden_vec_reshape) + BOS_emb = self.sub_decoder_BOS_emb.unsqueeze(0).repeat(hidden_vec_reshape.shape[0], 1, 1) # (B*T) x 1 x d_model + if target is None: + input_seq_list.append(BOS_emb[-1:, :, :]) + else: # training + input_seq_list.append(BOS_emb) + return input_seq_list + + def _prepare_token_embedding_for_teacher_forcing(self, input_seq_list, target): + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + 
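# flatten batch and time: each main-decoder position becomes its own sub-token sequence of
+ # length window_size + BOS + (num_sub_tokens - 1) for the sub-decoder +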
feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + memory_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + return memory_tensor + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub_tokens + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq_list = self._prepare_input_seq_list(hidden_vec_reshape, target) + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + input_seq_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x (window_size + BOS) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS) x d_model + for idx, feature in enumerate(self.prediction_order): + output = self.transformer_decoder(pos_target_tensor) + logit = self.hidden2logit[f"layer_{feature}"](output[:, -1:]) + logits_dict[feature] = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + input_seq_tensor = torch.cat(input_seq_list, dim=1) + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='last') + return logits_dict, sampled_token_dict + + # ---- Training ---- # + # preparing for training + input_seq_tensor = self._prepare_token_embedding_for_teacher_forcing(input_seq_list, target) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + # get output using self-attention + output = self.transformer_decoder(pos_target_tensor) + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class SelfAttentionUniAudio(SelfAttention): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + ''' + Uniaudio version of self-attention sub-decoder + Through the experiments, we found that the performance of the model is better than our proposed self-attention sub-decoder + It shows comparable performance with the cross-attention sub-decoder + However, NMT shows better performance than UniAudio in terms of the performance of the model + ''' + + def _prepare_token_embedding_for_teacher_forcing(self, hidden_vec_reshape, target): + input_seq_list = [] + # append zero vector + input_seq_list.append(torch.zeros(hidden_vec_reshape.shape[0], 1, 
hidden_vec_reshape.shape[2]).to(self.device)) + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + + feature_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x num_sub-tokens x d_model + # Ensure hidden_vec_reshape and feature_tensor have the same shape + assert hidden_vec_reshape.shape == feature_tensor.shape, f"Shapes of hidden_vec_reshape and feature_tensor do not match: {hidden_vec_reshape.shape} vs {feature_tensor.shape}" + # Sum hidden_vec_reshape and feature_tensor in the last dimension + memory_tensor = hidden_vec_reshape + feature_tensor + return memory_tensor + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub-tokens + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + hidden_vec_reshape = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub-tokens x d_model + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + pos_target_tensor = self._apply_pos_enc(hidden_vec_reshape, apply_type='all') # (B*T) x (window_size + BOS) x d_model + for idx, feature in enumerate(self.prediction_order): + output = self.transformer_decoder(pos_target_tensor) + logit = self.hidden2logit[f"layer_{feature}"](output[:, -1:]) + logits_dict[feature] = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + pos_target_tensor = torch.cat([pos_target_tensor[:, :idx+1, :], feature_emb_reshape + pos_target_tensor[:, idx+1:idx+2, :], pos_target_tensor[:, idx+2:, :]], dim=1) + + return logits_dict, sampled_token_dict + + # ---- Training ---- # + # preparing for training + input_seq_tensor = self._prepare_token_embedding_for_teacher_forcing(hidden_vec_reshape, target) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + # get output using self-attention + output = self.transformer_decoder(pos_target_tensor) + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class CrossAttention(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + The power of Cross-attention and UniAudio style Self-attention lies in that using the output of the main decoder or hidden vec directly in the sub-decoder + As 
the output of the main decoder is the representation of the whole sequence, + it contains richer information which can even decode out sub-tokens in a parallel manner + So both architectures using the output of the main decoder in a direct way show better performance than the original self-attention sub-decoder + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.sub_decoder_enricher_use = sub_decoder_enricher_use + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(self.prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + if sub_decoder_enricher_use: + self.enricher_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + causal_mask = generate_SA_mask(len(prediction_order)) + causl_ca_mask = generate_CA_mask(len(prediction_order), len(prediction_order)).to(self.device) + self.register_buffer('causal_mask', causal_mask) + self.register_buffer('causal_ca_mask', causl_ca_mask) + + if sub_decoder_depth > 1: + self.sub_decoder_layers = nn.Sequential( + *[DecoderLayer(dim=dim, num_heads=heads, dropout=dropout) for _ in range(sub_decoder_depth)] + ) + else: + self.sub_decoder_layers = nn.Sequential(DecoderLayer(dim=dim, num_heads=heads, dropout=dropout)) + if sub_decoder_enricher_use: + self.feature_enricher_layers = nn.Sequential(FeatureEnricher(dim=dim, num_heads=heads, dropout=dropout)) + + def _apply_window_on_hidden_vec(self, hidden_vec): + BOS_emb = self.enricher_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + # through our experiments, we found that the size of the window doesn't affect the performance of the model much + window_size = 1 + zero_vec = torch.zeros((hidden_vec.shape[0], window_size-1, hidden_vec.shape[2])).to(self.device) # B x (window_size-1) x d_model + cat_hidden_vec = torch.cat([zero_vec, hidden_vec], dim=1) # B x (window_size-1+T) x d_model + new_hidden_vec = cat_hidden_vec.unfold(1, window_size, 1).transpose(2, 3) # B x T x window_size x d_model + new_hidden_vec = new_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], window_size, -1)) # (B*T) x window_size x d_model + new_hidden_vec = torch.cat([BOS_emb, new_hidden_vec], dim=1) # (B*T) x (window_size+1) x d_model + return new_hidden_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # num_sub_tokens + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x num_sub_tokens + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x num_sub_tokens x d_model + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, memory_list, target): + for _, feature in enumerate(self.prediction_order[:-1]): + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens-1) x d_model + return memory_tensor + + def _prepare_memory_list(self, hidden_vec, target=None): + memory_list = [] # used for key and value in cross attention + BOS_emb = self.sub_decoder_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # 
(B*T) x 1 x d_model + if target is not None: # training + memory_list.append(BOS_emb) + else: # inference + memory_list.append(BOS_emb[-1:, :, :]) + return memory_list + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x 1 x d_model + old_memory_tensor = memory_tensor + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec[-1:]} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] + CA_attn_mask = generate_CA_mask(input_seq_pos.shape[1], memory_tensor.shape[1]).to(self.device) + input_dict = {'input_seq': input_seq_pos[-1:], 'memory': memory_tensor, 'memory_mask': CA_attn_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + logits_dict[feature] = logit + sampled_token,prob = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + idx+1) x d_model + return logits_dict, sampled_token_dict + + # ---- Training ---- # + memory_tensor = self._prepare_token_embedding_for_teacher_forcing(memory_list, target) # (B*T) x (BOS + num_sub_tokens-1) x d_model + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class 
Flatten4Encodec(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + hidden_vec = input_dict['hidden_vec'] + + # ---- Training ---- # + logits_tensor = torch.zeros(hidden_vec.shape[0], hidden_vec.shape[1], 2049).to(self.device) + for idx, feature_type in enumerate(self.prediction_order): + # ::4 means that we only use the first token in each 4 tokens + # so the chosen tokens will be: 0, 4, 8, 12, ... + # 1::4 means that we only use the second token in each 4 tokens + # so the chosen tokens will be: 1, 5, 9, 13, ... + separated_hidden_vec = hidden_vec[:, idx::4, :] + logit = self.hidden2logit[f"layer_{feature_type}"](separated_hidden_vec) + logits_tensor[:, idx::4, :] = logit + # prob_dict[feature_type] = prob + return logits_tensor + + def run_one_step(self, input_dict, sampling_method=None, threshold=None, temperature=None, feature_type=None): + # ---- Generate(Inference) ---- # + hidden_vec = input_dict['hidden_vec'] + logit = self.hidden2logit[f"layer_{feature_type}"](hidden_vec[:, -1:]) + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + return logit, sampled_token + + +class DiffusionDecoder(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool, + MASK_IDX:int = 126336, + denoising_steps:int = 8, + eps:float = 1e-3, + method:str = 'low-confidence', # or random or auto-regressive + ): + ''' + The power of Cross-attention and UniAudio style Self-attention lies in that using the output of the main decoder or hidden vec directly in the sub-decoder + As the output of the main decoder is the representation of the whole sequence, + it contains richer information which can even decode out sub-tokens in a parallel manner + So both architectures using the output of the main decoder in a direct way show better performance than the original self-attention sub-decoder + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.sub_decoder_enricher_use = sub_decoder_enricher_use + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(self.prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + self.diffusion_mask_emb = nn.Parameter(torch.empty(dim), requires_grad=True) # embedding of mask token,idx is 126336,which is not in vocab + nn.init.normal_(self.diffusion_mask_emb, mean=0.0, std=0.02) + self.MASK_idx = MASK_IDX + self.denoising_steps = denoising_steps + self.eps = eps + self.method = method + + self.input_norm = nn.LayerNorm(dim) + + self.feature_boost_layers = nn.Sequential(TransformerLayer(dim=dim, num_heads=heads, dropout=dropout)) + + if sub_decoder_enricher_use: + self.enricher_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + causal_mask = generate_SA_mask(len(prediction_order)) + causal_ca_mask = generate_none_causality_mask(len(prediction_order), len(prediction_order)).to(self.device) + 
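# unlike CrossAttention above, the cross-attention mask here hides nothing (all zeros):
+ # the diffusion sub-decoder denoises all sub-tokens in parallel rather than left-to-right +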
self.register_buffer('causal_mask', causal_mask) + self.register_buffer('causal_ca_mask', causal_ca_mask) + + # get depth of the sub-decoder + if sub_decoder_depth > 1: + self.sub_decoder_layers = nn.Sequential(*[TransformerLayer(dim=dim, num_heads=heads, dropout=dropout) for _ in range(sub_decoder_depth)]) + else: + self.sub_decoder_layers = nn.Sequential(TransformerLayer(dim=dim, num_heads=heads, dropout=dropout)) + if sub_decoder_enricher_use: + self.feature_enricher_layers = nn.Sequential(FeatureEnricher(dim=dim, num_heads=heads, dropout=dropout)) + + + # simplified version of the forward process in diffusion model + def _forward_process(self, input_ids, eps=1e-3, mask_idx=None): + reshaped_input_ids = torch.reshape(input_ids, (-1, input_ids.shape[-1])) # B*T x num_sub_tokens + b, l = reshaped_input_ids.shape + t = torch.rand(b, device=input_ids.device) + p_mask = (1 - eps) * t + eps + p_mask = p_mask[:, None].repeat(1, l) + + masked_indices = torch.rand((b, l), device=input_ids.device) < p_mask + # 126336 is used for [MASK] token,attention that this token is not in the vocab + if mask_idx is not None: + noisy_batch = torch.where(masked_indices, mask_idx, reshaped_input_ids) + else: + noisy_batch = torch.where(masked_indices, 126336, reshaped_input_ids)# 126336 is used for [MASK] token in + return noisy_batch, masked_indices, p_mask + + + def _apply_window_on_hidden_vec(self, hidden_vec): + BOS_emb = self.enricher_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + # through our experiments, we found that the size of the window doesn't affect the performance of the model much + window_size = 1 + zero_vec = torch.zeros((hidden_vec.shape[0], window_size-1, hidden_vec.shape[2])).to(self.device) # B x (window_size-1) x d_model + cat_hidden_vec = torch.cat([zero_vec, hidden_vec], dim=1) # B x (window_size-1+T) x d_model + new_hidden_vec = cat_hidden_vec.unfold(1, window_size, 1).transpose(2, 3) # B x T x window_size x d_model + new_hidden_vec = new_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], window_size, -1)) # (B*T) x window_size x d_model + new_hidden_vec = torch.cat([BOS_emb, new_hidden_vec], dim=1) # (B*T) x (window_size+1) x d_model + return new_hidden_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # num_sub_tokens + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x num_sub_tokens + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x num_sub_tokens x d_model + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, memory_list, target): + for _, feature in enumerate(self.prediction_order[:-1]): + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens-1) x d_model + return memory_tensor + + # return a tensor + def _get_noisy_tensor(self, target_shape): + new_target = torch.zeros(target_shape).to(self.device) + # fill all the elements in the tensor with the embedding of the mask token + new_target[:, :, :] = self.diffusion_mask_emb + return new_target + + # prepare the embedding of the target, + def _prepare_embedding(self, memory_list, target): + for _, feature in enumerate(self.prediction_order): + feature_idx = 
self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens) x d_model + return memory_tensor + + + def _prepare_memory_list(self, hidden_vec, target=None, add_BOS=True): + memory_list = [] # used for key and value in cross attention + BOS_emb = self.sub_decoder_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + if add_BOS is true: + if target is not None: # training + memory_list.append(BOS_emb) + else: # inference + memory_list.append(BOS_emb[-1:, :, :]) + else: + pass + return memory_list + + def _get_num_transfer_tokens(self, mask_index, steps): + ''' + In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals. + Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)), + the expected number of tokens transitioned at each step should be consistent. + + This function is designed to precompute the number of tokens that need to be transitioned at each step. + ''' + mask_num = mask_index.sum(dim=1, keepdim=True) + base = mask_num // steps + remainder = mask_num % steps + + num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base + + for i in range(mask_num.size(0)): + num_transfer_tokens[i, :remainder[i]] += 1 + + return num_transfer_tokens + + def sample_from_logits(self, attn_output, hidden_vec, sampling_method=None, threshold=None, temperature=None, force_decode=False,step=None): + sampled_token_dict = {} + logits_dict = {} + candidate_token_embeddings = {} + candidate_token_probs = {} + b,t,d = hidden_vec.shape # B x T x d_model + # print("*"*8) + logits_list = [] + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_list.append(logit) + for idx, feature in enumerate(self.prediction_order): + logit = logits_list[idx] # B x T x vocab_siz + sampled_token, prob = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if step==0 and force_decode: + if feature == 'velocity': + sampled_token = torch.tensor([2]).to(logit.device) + prob = torch.tensor([1.0]).to(logit.device) + else: + prob = torch.tensor([0.0]).to(logit.device) + # print(feature, sampled_token, prob) + sampled_token_dict[feature] = sampled_token + logits_dict[feature] = logit + candidate_token_probs[feature] = prob + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + candidate_token_embeddings[feature] = feature_emb_reshape + stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, -1)) # (B*T) x num_sub_tokens x vocab_size + stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, -1, d)) # (B*T) x num_sub_tokens x d_model + # print("sampled_token_dict", sampled_token_dict) + return sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings + + def 
sample_from_logits_fast(self, attn_output, hidden_vec, sampling_method=None, threshold=None, temperature=None):
+ sampled_token_dict = {}
+ logits_dict = {}
+ candidate_token_embeddings = {}
+ candidate_token_probs = {}
+
+ b, t, d = hidden_vec.shape # (B, T, D)
+ num_features = len(self.projection_keys) # number of sub-token feature types
+ Vmax = self.max_vocab_size
+
+ # === 1. Collect the output position of every feature === #
+ feature_pos_list = [self.feature_order_in_output[f] for f in self.projection_keys]
+
+ # === 2. Gather those positions from attn_output → (B, F, D) === #
+ attn_features = torch.stack(
+ [attn_output[:, pos, :] for pos in feature_pos_list], dim=1
+ ) # (B, F, D)
+
+ # === 3. Batched matmul: run all feature projections in parallel with einsum === #
+ # NOTE: this path assumes the padded proj_weight / proj_bias buffers (the commented-out block
+ # in _make_projection_layer) have been registered.
+ # attn_features: (B, F, D)
+ # proj_weight: (F, Vmax, D)
+ # proj_bias: (F, Vmax)
+ # output: (B, F, Vmax)
+ logits = torch.einsum("bfd,fvd->bfv", attn_features, self.proj_weight) + self.proj_bias
+
+ # === 4. Truncate each feature's logits back to its original vocab size === #
+ logits_list = []
+ logits_dict_by_feature = {
+ feature: logits[:, i, :self.vocab_sizes[feature]]
+ for i, feature in enumerate(self.projection_keys)
+}
+ for i, feature in enumerate(self.projection_keys):
+ vocab_size = self.vocab_sizes[feature]
+ logits_list.append(logits[:, i, :vocab_size]) # (B, vocab_size)
+ for idx, feature in enumerate(self.prediction_order):
+ logit = logits_dict_by_feature[feature].unsqueeze(0) # B x T x vocab_size
+ sampled_token, prob = sample_with_prob_fast(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature)
+ # print(feature, sampled_token, prob)
+ sampled_token_dict[feature] = sampled_token.squeeze(0) # B x T
+ logits_dict[feature] = logit
+ candidate_token_probs[feature] = prob
+ feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token)
+ feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size
+ candidate_token_embeddings[feature] = feature_emb_reshape
+ stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, -1)) # (B*T) x num_sub_tokens x vocab_size
+ stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, -1, d)) # (B*T) x num_sub_tokens x d_model
+
+ return sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings
+
+ def choose_tokens(self, hidden_vec, step, method, stacked_logits_probs, num_transfer_tokens):
+ if method == 'low-confidence':
+ _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1)
+ elif method == 'random':
+ indices = torch.randint(0, stacked_logits_probs.shape[-1], (int(num_transfer_tokens[:, step]),), device=stacked_logits_probs.device)
+ elif method == 'auto-regressive':
+ indices = torch.tensor([[step]], device=hidden_vec.device)
+ return indices
+
+
+ def forward_(self, input_dict, sampling_method=None, threshold=None, temperature=None, worst_case=False, validation=False):
+ logits_dict = {}
+ hidden_vec = input_dict['hidden_vec'] # B x T x d_model
+ target = input_dict['target'] # B x T x num_sub_tokens
+
+
+ # apply window on hidden_vec for enricher
+ if self.sub_decoder_enricher_use:
+ window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model
+ hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model
+ input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model
+ input_seq_pos = input_seq
+ # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model
+ #
prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + for step in range(self.denoising_steps): + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + # input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings = self.sample_from_logits(attn_output, hidden_vec, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + # indices = self.choose_tokens(hidden_vec,step, "auto-regressive", stacked_logits_probs, num_transfer_tokens) + indices = self.choose_tokens(hidden_vec, step, self.method, stacked_logits_probs, num_transfer_tokens) + # breakpoint() + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + stored_probs_dict[self.prediction_order[j]] = candidate_token_probs[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + # breakpoint() + # print("stored_probs_dict", stored_probs_dict) + # print("sampled_token_dict", sampled_token_dict) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) + if self.sub_decoder_enricher_use: + input_dict = 
{'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) + + def forward_old(self, input_dict, sampling_method=None, threshold=None, temperature=None, worst_case=False, validation=False): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] #B x T x d_model + bos_hidden_vec = input_dict['bos_token_hidden'] # B x 1 x d_model, used for the first token in the sub-decoder + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + + if bos_hidden_vec is None: # start of generation + if target is None: + bos_hidden_vec = input_seq_pos + else: + bos_hidden_vec =hidden_vec[:, 0, :].unsqueeze(1).repeat(1, hidden_vec.shape[1], 1) # B x T x d_model + bos_hidden_vec = bos_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) + + else: + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + + # input_seq_pos = input_seq + input_dict = {'input_seq': input_seq_pos, 'memory': bos_hidden_vec, 'memory_mask': self.causal_ca_mask} + boosted_input_dict = self.feature_boost_layers(input_dict) # (B*T) x num_sub_tokens x d_model + input_seq_pos = boosted_input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self.input_norm(input_seq_pos) # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + for step in range(self.denoising_steps): + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x 
num_sub_tokens x d_model + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + candidate_token_embeddings = {} + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token,probs = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + # print(idx,feature,sampled_token,probs) + sampled_token_dict[feature] = sampled_token + candidate_token_probs[feature] = probs + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + candidate_token_embeddings[feature] = feature_emb_reshape + + stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, l)) # (B*T) x num_sub_tokens x vocab_size + stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, l, d)) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + + if self.method == 'low-confidence': + _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1) + elif self.method == 'random': + indices = torch.randint(0, stacked_logits_probs.shape[-1], (num_transfer_tokens[:, step],)).to(logit.device) + elif self.method == 'auto-regressive': + indices = torch.tensor([[step]], device=logit.device) + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + stored_probs_dict[self.prediction_order[j]] = candidate_token_probs[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) 
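For orientation, here is a minimal standalone sketch of the two pieces the denoising loop above relies on: an even per-step schedule for how many masked sub-tokens to commit, and a low-confidence rule that keeps uncertain positions masked while committing the most confident ones. This is illustrative only (hypothetical helper names), not the repository's actual _get_num_transfer_tokens / choose_tokens implementation and not part of the diff.

import torch

def get_num_transfer_tokens(masked: torch.Tensor, steps: int) -> torch.Tensor:
    # masked: (N, L) bool, True where a sub-token is still masked.
    # Returns an (N, steps) schedule whose rows sum to masked.sum(-1),
    # i.e. how many sub-tokens to commit at each denoising step.
    total = masked.sum(dim=-1, keepdim=True)           # (N, 1)
    base = total // steps
    remainder = total % steps
    schedule = base.repeat(1, steps)
    for i in range(masked.shape[0]):
        schedule[i, : int(remainder[i])] += 1          # early steps absorb the remainder
    return schedule

def commit_most_confident(confidence: torch.Tensor, masked: torch.Tensor, k: int):
    # Keep low-confidence positions masked and commit the k most confident ones.
    confidence = confidence.masked_fill(~masked, float("-inf"))
    _, indices = torch.topk(confidence, k=k, dim=-1)   # (N, k)
    newly_committed = torch.zeros_like(masked).scatter(-1, indices, True)
    return masked & ~newly_committed, indices

masked = torch.ones(2, 8, dtype=torch.bool)            # 2 flattened (B*T) positions, 8 sub-tokens
schedule = get_num_transfer_tokens(masked, steps=4)    # [[2, 2, 2, 2], [2, 2, 2, 2]]
confidence = torch.rand(2, 8)
masked, chosen = commit_most_confident(confidence, masked, k=int(schedule[0, 0]))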
+ memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # all is embedding + # memory_tensor = self.layer_norm(memory_tensor) + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + # inter_input = torch.cat([input_seq_pos, memory_tensor], dim=1) + # inter_input = input_seq_pos + memory_tensor # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None, Force_decode=False, worst_case=False, validation=False): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] #B x T x d_model + bos_hidden_vec = input_dict['bos_token_hidden'] # B x 1 x d_model, used for the first token in the sub-decoder + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + + if bos_hidden_vec is None: # start of generation + if target is None: + bos_hidden_vec = input_seq_pos + else: + bos_hidden_vec =hidden_vec[:, 0, :].unsqueeze(1).repeat(1, hidden_vec.shape[1], 1) # B x T x d_model + bos_hidden_vec = bos_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) + + else: + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + + # input_seq_pos = input_seq + input_dict = {'input_seq': input_seq_pos, 'memory': bos_hidden_vec, 'memory_mask': self.causal_ca_mask} + boosted_input_dict = self.feature_boost_layers(input_dict) # (B*T) x num_sub_tokens x d_model + input_seq_pos = boosted_input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self.input_norm(input_seq_pos) # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = 
hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + # with torch.profiler.profile( + # activities=[ + # torch.profiler.ProfilerActivity.CPU, + # torch.profiler.ProfilerActivity.CUDA], + # record_shapes=True, + # profile_memory=True, + # with_stack=True + # ) as prof: + for step in range(self.denoising_steps): + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + + sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings = self.sample_from_logits(attn_output, hidden_vec, sampling_method=sampling_method, threshold=threshold, temperature=temperature, + force_decode=Force_decode, + step=step) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + + if self.method == 'low-confidence': + _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1) + elif self.method == 'random': + indices = torch.randint(0, stacked_logits_probs.shape[-1], (num_transfer_tokens[:, step],)).to(logit.device) + elif self.method == 'auto-regressive': + indices = torch.tensor([[step]], device=logit.device) + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + # print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10)) + # print(sampled_token_dict) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = 
torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # all is embedding + # memory_tensor = self.layer_norm(memory_tensor) + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/__init__.py b/Amadeus/symbolic_encoding/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..b115a9d Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..8963d99 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..abbcb9c Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc new file mode 100644 index 0000000..0a0c766 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc new file mode 100644 index 0000000..00368c3 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc new file mode 100644 index 0000000..e966c57 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc new file mode 100644 index 0000000..ed17777 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc 
b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc new file mode 100644 index 0000000..e3258ec Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc new file mode 100644 index 0000000..385928a Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc new file mode 100644 index 0000000..5258d8d Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc new file mode 100644 index 0000000..0de9881 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc new file mode 100644 index 0000000..02944f3 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc new file mode 100644 index 0000000..d196e76 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc new file mode 100644 index 0000000..4b4a5d5 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc differ diff --git a/Amadeus/symbolic_encoding/anylazesf.py b/Amadeus/symbolic_encoding/anylazesf.py new file mode 100644 index 0000000..803d5e6 --- /dev/null +++ b/Amadeus/symbolic_encoding/anylazesf.py @@ -0,0 +1,46 @@ +from sf2utils.sf2parse import Sf2File + +def print_sorted_presets(sf2_path): + presets_info = [] + + with open(sf2_path, 'rb') as f: + sf2 = Sf2File(f) + + for preset in sf2.presets: + try: + # Try reading the attributes directly + name = getattr(preset, 'name', '???').strip('\x00') + bank = getattr(preset, 'bank', None) + program = getattr(preset, 'preset', None) + + # If they are not available, try to get them from sub-attributes + if bank is None or program is None: + for attr in dir(preset): + attr_value = getattr(preset, attr) + if hasattr(attr_value, 'bank') and hasattr(attr_value, 'preset'): + bank = attr_value.bank + program = attr_value.preset + name = getattr(attr_value, 'name', name).strip('\x00') + break + + # Collect valid results + if bank is not None and program is not None: + presets_info.append((program, bank, name)) + except Exception as e: + print(f"Error reading preset: {e}") + + # Sort by program in ascending order (to sort by bank first and then program, use sorted(..., key=lambda x: (x[1], x[0]))) + presets_info.sort(key=lambda x: x[0]) + + # Print the results + print(f"{'Program':<8} {'Bank':<6} {'Preset Name'}") + print("-" * 40) + for program, bank, name in presets_info: + print(f"{program:<8} {bank:<6} {name}") + +# DEFAULT_SOUND_FONT = '/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2' +# DEFAULT_SOUND_FONT = '~/.fluidsynth/default_sound_font.sf2' + +# Replace with the path to your sf2 file +sf2_path =
"/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2" +print_sorted_presets(sf2_path) \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/augmentor.py b/Amadeus/symbolic_encoding/augmentor.py new file mode 100644 index 0000000..40c1839 --- /dev/null +++ b/Amadeus/symbolic_encoding/augmentor.py @@ -0,0 +1,94 @@ +import random +from typing import Union + +import torch + +class Augmentor: + def __init__( + self, + vocab, + aug_type:Union[str, None], + input_length:int + ): + self.vocab = vocab + self.aug_type = aug_type + self.input_length = input_length + self.feature_list = vocab.feature_list + self.num_features = len(self.feature_list) + self.encoding_scheme = vocab.encoding_scheme + + self.pitch_idx = self.feature_list.index('pitch') + if 'chord' in self.feature_list: + self.chord_idx = self.feature_list.index('chord') + + def _get_shift(self, segment): + # the pitch vocab has ignore token in 0 index + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + pitch_mask = segment != 0 + pitch_segment = segment[pitch_mask[:,self.pitch_idx], self.pitch_idx] + # check if tensor is empty + if pitch_segment.numel() == 0: + shift = 0 + else: + lowest_pitch = max(12, torch.min(pitch_segment)) + highest_pitch = min(119, torch.max(pitch_segment)) + lower_shift_bound = torch.where(lowest_pitch - torch.arange(6) > 11)[0][-1].item() + upper_shift_bound = torch.where(highest_pitch + torch.arange(7) < 120)[0][-1].item() + shift = random.randint(-lower_shift_bound, upper_shift_bound) + else: # remi + mask_for_pitch = self.vocab.total_mask['pitch'].to(segment.device) + segemnt_pitch_mask = mask_for_pitch[segment] + segment_pitch = segment * segemnt_pitch_mask + segment_pitch = segment_pitch[segment_pitch != 0] + # check if tensor is empty + if segment_pitch.numel() == 0: + shift = 0 + else: + lower_bound = torch.argwhere(mask_for_pitch == 1)[0].item() + upper_bound = torch.argwhere(mask_for_pitch == 1)[-1].item() + lowest_pitch = max(lower_bound, torch.min(segment_pitch)) + highest_pitch = min(upper_bound, torch.max(segment_pitch)) + lower_shift_bound = torch.where(lowest_pitch - torch.arange(6) >= lower_bound)[0][-1].item() + upper_shift_bound = torch.where(highest_pitch + torch.arange(7) <= upper_bound)[0][-1].item() + shift = random.randint(-lower_shift_bound, upper_shift_bound) + return shift + + # TODO: arrange hard coded part + def __call__(self, segment): + ''' + input_tensor is segments of x, y + for transformer_xl, the shape of x, y is [max_num_segments, input_length, num_features] + so we need to change the shape of x, y to [max_num_segments*input_length, num_features] + ''' + if self.aug_type == 'random': + shift = self._get_shift(segment) + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + # pitch augmentation + segment_pitch_mask = segment != 0 + new_segment = segment.clone() + new_segment[segment_pitch_mask[:,self.pitch_idx], self.pitch_idx] += shift + if 'chord' in self.feature_list: + # chord augmentation + segment_chord_mask = (segment[:,self.chord_idx] != 0) & (segment[:,self.chord_idx] != 1) + new_segment[segment_chord_mask, self.chord_idx] = (((new_segment[segment_chord_mask, self.chord_idx]-2) % 12) + shift ) % 12 + ((new_segment[segment_chord_mask, self.chord_idx]-2) // 12) * 12 + 2 + segment = new_segment + else: # remi + # choose random interger between -5 and 6 + # the augmented results from shift -6 and 6 are same, so we choose -5 and 6 + # pitch augmentation + mask_for_pitch = 
self.vocab.total_mask['pitch'].to(segment.device) + segment_pitch_mask = mask_for_pitch[segment] + new_segment = segment.clone() + new_segment_valid = (new_segment + shift) * segment_pitch_mask + new_segment = new_segment * (1 - segment_pitch_mask) + new_segment_valid + if 'chord' in self.feature_list: + # chord augmentation + mask_for_chord = self.vocab.total_mask['chord'].clone().to(segment.device) + chord_n_n_idx = torch.argwhere(mask_for_chord == 1)[-1].item() + mask_for_chord[chord_n_n_idx] = 0 + start_idx_chord = self.vocab.remi_vocab_boundaries_by_key['chord'][0] + segment_chord_mask = mask_for_chord[segment] + new_segment_valid = ((((new_segment - start_idx_chord) % 12 + shift) % 12) + ((new_segment - start_idx_chord) // 12) * 12 + start_idx_chord) * segment_chord_mask + new_segment = new_segment * (1 - segment_chord_mask) + new_segment_valid + segment = new_segment + return segment diff --git a/Amadeus/symbolic_encoding/compile_utils.py b/Amadeus/symbolic_encoding/compile_utils.py new file mode 100644 index 0000000..40eae76 --- /dev/null +++ b/Amadeus/symbolic_encoding/compile_utils.py @@ -0,0 +1,207 @@ +import random +from collections import defaultdict + +import torch +import numpy as np +import random + +def reverse_shift_and_pad(tune_in_idx, slice_boundary=4): + new_lst = [curr_elems[:slice_boundary] + next_elems[slice_boundary:] for curr_elems, next_elems in zip(tune_in_idx, tune_in_idx[1:])] + return new_lst + +def reverse_shift_and_pad_for_tensor(tensor, first_pred_feature): + ''' + tensor: [batch_size x seq_len x feature_size] + ''' + if first_pred_feature == 'type': + return tensor + if tensor.shape[-1] == 8: + slice_boundary_dict = {'type':0, 'beat':1, 'chord':2, 'tempo':3, 'instrument':4, 'pitch':5, 'duration':6, 'velocity':7} + elif tensor.shape[-1] == 7: + slice_boundary_dict = {'type':0, 'beat':1, 'chord':2, 'tempo':3, 'pitch':4, 'duration':5, 'velocity':6} + elif tensor.shape[-1] == 5: + slice_boundary_dict = {'type':0, 'beat':1, 'instrument':2, 'pitch':3, 'duration':4} + elif tensor.shape[-1] == 4: + slice_boundary_dict = {'type':0, 'beat':1, 'pitch':2, 'duration':3} + slice_boundary = slice_boundary_dict[first_pred_feature] + new_tensor = torch.zeros_like(tensor) + new_tensor[..., :, :slice_boundary] = tensor[..., :, :slice_boundary] + new_tensor[..., :-1, slice_boundary:] = tensor[..., 1:, slice_boundary:] + return new_tensor + +def shift_and_pad(tune_in_idx, first_pred_feature): + if first_pred_feature == 'type': + return tune_in_idx + if len(tune_in_idx[0]) == 8: + slice_boundary_dict = {'type':0, 'beat':-7, 'chord':-6, 'tempo':-5, 'instrument':-4, 'pitch':-3, 'duration':-2, 'velocity':-1} + elif len(tune_in_idx[0]) == 7: + slice_boundary_dict = {'type':0, 'beat':-6, 'chord':-5, 'tempo':-4, 'pitch':-3, 'duration':-2, 'velocity':-1} + elif len(tune_in_idx[0]) == 5: + slice_boundary_dict = {'type':0, 'beat':-4, 'instrument':-3, 'pitch':-2, 'duration':-1} + elif len(tune_in_idx[0]) == 4: + slice_boundary_dict = {'type':0, 'beat':-3, 'pitch':-2, 'duration':-1} + slice_boundary = slice_boundary_dict[first_pred_feature] + # Add an empty list padded with zeros at the beginning, and sos and eos tokens are not shifted + padded_tune_in_idx = torch.cat([torch.zeros(1, len(tune_in_idx[0]), dtype=torch.long), tune_in_idx], dim=0) + new_tensor = torch.zeros_like(padded_tune_in_idx) + new_tensor[:, slice_boundary:] = padded_tune_in_idx[:, slice_boundary:] + new_tensor[:-1, :slice_boundary] = padded_tune_in_idx[1:, :slice_boundary] + return new_tensor + +class 
VanillaTransformer_compiler(): + def __init__( + self, + data_list, + augmentor, + eos_token, + input_length, + first_pred_feature, + encoding_scheme + ): + self.data_list = data_list + self.augmentor = augmentor + self.eos_token = eos_token + self.input_length = input_length + self.first_pred_feature = first_pred_feature + self.encoding_scheme = encoding_scheme + + def make_segments(self, data_type): + segments = [] + tune_name2segment = defaultdict(list) + segment2tune_name = [] + num_segments = 0 + for i in range(len(self.data_list)): + tune_in_idx, tune_name = self.data_list[i] + tune_in_idx = torch.LongTensor(tune_in_idx) + if self.encoding_scheme == 'remi' or self.encoding_scheme == 'cp': + eos_token = torch.LongTensor(self.eos_token) + else: + eos_token = torch.LongTensor(self.eos_token) + # shift and pad + tune_in_idx = shift_and_pad(tune_in_idx, self.first_pred_feature) + if data_type == 'train': + if len(tune_in_idx) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx), 1) + mask = torch.cat([torch.ones(len(tune_in_idx), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx, padding_seq], dim=0) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + else: + start_point = 0 + while start_point + self.input_length+1 < len(tune_in_idx): + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment = tune_in_idx[start_point:start_point + self.input_length+1] + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + assert len(segment) == self.input_length+1 + # Randomly choose the start point for the next segment, which is in the range of half of the current segment to the end of the current segment + start_point += random.randint((self.input_length+1)//2, self.input_length+1) + # if text controled,we only use the first segment + # add the last segment + if len(tune_in_idx[start_point:]) < self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx[start_point:])) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx[start_point:]), 1) + mask = torch.cat([torch.ones(len(tune_in_idx[start_point:]), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx[start_point:], padding_seq], dim=0) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + + + else: # for validset + for i in range(0, len(tune_in_idx), self.input_length+1): + segment = tune_in_idx[i:i+self.input_length+1] + if len(segment) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(segment)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(segment), 1) + mask = torch.cat([torch.ones(len(segment), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([segment, padding_seq], dim=0) + segment2tune_name.append(tune_name) + segments.append([segment, mask]) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + else: + mask = torch.ones(self.input_length+1, dtype=torch.long) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + segments.append([segment, mask]) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + assert 
len(segment) == self.input_length+1 + + return segments, tune_name2segment, segment2tune_name + + def make_segments_iters(self, data_type): + tune_name2segment = defaultdict(list) + segment2tune_name = [] + num_segments = 0 + # shuffle the data_list + if data_type == 'train': + random.shuffle(self.data_list) + print("length of data_list:", len(self.data_list)) + for i in range(len(self.data_list)): + tune_in_idx, tune_name = self.data_list[i] + tune_in_idx = torch.LongTensor(tune_in_idx) + if self.encoding_scheme == 'remi' or self.encoding_scheme == 'cp': + eos_token = torch.LongTensor(self.eos_token) + else: + eos_token = torch.LongTensor(self.eos_token) + # shift and pad + tune_in_idx = shift_and_pad(tune_in_idx, self.first_pred_feature) + if data_type == 'train': + if len(tune_in_idx) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx), 1) + mask = torch.cat([torch.ones(len(tune_in_idx), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx, padding_seq], dim=0) + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + else: + start_point = 0 + while start_point + self.input_length+1 < len(tune_in_idx): + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment = tune_in_idx[start_point:start_point + self.input_length+1] + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + assert len(segment) == self.input_length+1 + start_point += random.randint((self.input_length+1)//2, self.input_length+1) + # break + if len(tune_in_idx[start_point:]) < self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx[start_point:])) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx[start_point:]), 1) + mask = torch.cat([torch.ones(len(tune_in_idx[start_point:]), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx[start_point:], padding_seq], dim=0) + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + else: # for validset + for i in range(0, len(tune_in_idx), self.input_length+1): + segment = tune_in_idx[i:i+self.input_length+1] + if len(segment) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(segment)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(segment), 1) + mask = torch.cat([torch.ones(len(segment), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([segment, padding_seq], dim=0) + segment2tune_name.append(tune_name) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + yield [segment, mask], tune_name2segment, segment2tune_name + else: + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment2tune_name.append(tune_name) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + yield [segment, mask], tune_name2segment, segment2tune_name + assert len(segment) == self.input_length+1 + diff --git a/Amadeus/symbolic_encoding/data_utils.py b/Amadeus/symbolic_encoding/data_utils.py new file mode 100644 index 0000000..2400d36 --- /dev/null +++ b/Amadeus/symbolic_encoding/data_utils.py @@ -0,0 +1,1610 @@ 
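Before the data_utils.py listing, a small worked example of the shift_and_pad helper from compile_utils.py above may help. The event values are made up purely for illustration; the 4-feature order (type, beat, pitch, duration) is taken from the slice_boundary_dict in that function, and this sketch is not part of the diff.

import torch
from Amadeus.symbolic_encoding.compile_utils import shift_and_pad

# Three NB events with 4 sub-token features in the order (type, beat, pitch, duration).
events = torch.tensor([[1, 10, 60, 4],
                       [1, 11, 62, 4],
                       [2, 12, 64, 8]], dtype=torch.long)

shifted = shift_and_pad(events, first_pred_feature='beat')
# slice_boundary = -3: (beat, pitch, duration) stay in place with a zero row prepended,
# while the 'type' column is pulled from the following event, so 'beat' becomes the
# first feature the sub-decoder predicts at each timestep:
# tensor([[ 1,  0,  0,  0],
#         [ 1, 10, 60,  4],
#         [ 2, 11, 62,  4],
#         [ 0, 12, 64,  8]])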
+import re +import random +from pathlib import Path +from collections import OrderedDict +from typing import Union, List, Tuple, Dict + +import numpy as np +import matplotlib.pyplot as plt +# lock of thread +from threading import Lock + +import json +from tqdm import tqdm +from torch.utils.data import Dataset,IterableDataset +from transformers import T5Tokenizer + +from .augmentor import Augmentor +from .compile_utils import VanillaTransformer_compiler +from data_representation import vocab_utils + +def get_emb_total_size(config, vocab): + emb_param = config.nn_params.emb + total_size = 0 + for feature in vocab.feature_list: + size = int(emb_param[feature] * emb_param.emb_size) + total_size += size + emb_param[feature] = size + emb_param.total_size = total_size + config.nn_params.emb = emb_param + return config + +class TuneCompiler(Dataset): + def __init__( + self, + data:List[Tuple[np.ndarray, str]], + data_type:str, + augmentor:Augmentor, + vocab:vocab_utils.LangTokenVocab, + input_length:int, + first_pred_feature:str, + caption_path:Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The data is distributed on-the-fly by the TuneCompiler + Pitch, Chord augementation is applied to the training data every iteration + Segmentation is applied every epoch for the training data + ''' + super().__init__() + self.data_list = data + self.data_type = data_type + self.augmentor = augmentor + self.eos_token = vocab.eos_token + self.compile_function = VanillaTransformer_compiler( + data_list=self.data_list, + augmentor=self.augmentor, + eos_token=self.eos_token, + input_length=input_length, + first_pred_feature=first_pred_feature, + encoding_scheme=vocab.encoding_scheme + ) + self.segment2tune_name = None + self.tune_name2segment = None + self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large",legacy=False) # Initialize T5 tokenizer for caption processing + + + if self.data_type == 'valid' or self.data_type == 'test': + self._update_segments_for_validset() + else: + self._update_segments_for_trainset() + + def _update_segments_for_trainset(self, random_seed=0): + random.seed(random_seed) + if self.segment2tune_name is not None: + # If segments are already compiled, we can skip the compilation + print("Segments are already compiled, skipping compilation") + return + print("Compiling segments for training data") + with Lock(): + self.segments, _, self.segment2tune_name = self.compile_function.make_segments(self.data_type) + print(f"number of trainset segments: {len(self.segments)}") + + def _update_segments_for_validset(self, random_seed=0): + random.seed(random_seed) + with Lock(): + self.segments, self.tune_name2segment, self.segment2tune_name = self.compile_function.make_segments(self.data_type) + print(f"number of testset segments: {len(self.segments)}") + + def __getitem__(self, idx): + segment, tensor_mask = self.segments[idx] + tune_name = self.segment2tune_name[idx] + try: + encoded_caption = self.t5_tokenizer(tune_name, return_tensors='pt', padding='max_length', truncation=True, max_length=128) + except Exception as e: + print(f"Error encoding caption for tune {tune_name}: {e}") + encoded_caption = self.t5_tokenizer("No caption available", return_tensors='pt', padding='max_length', truncation=True, max_length=128) + return segment, tensor_mask, tune_name, encoded_caption + if self.data_type == 'train': + augmented_segment = self.augmentor(segment) + return augmented_segment, tensor_mask, tune_name, encoded_caption + else: + return segment, tensor_mask, 
tune_name, encoded_caption + + def get_segments_with_tune_idx(self, tune_name, seg_order): + ''' + This function is used to retrieve the segment with the tune name and segment order during the validation + ''' + segments_list = self.tune_name2segment[tune_name] + segment_idx = segments_list[seg_order] + segment, mask = self.segments[segment_idx][0], self.segments[segment_idx][1] + return segment, mask + + def __len__(self): + return len(self.segments) + +class IterTuneCompiler(IterableDataset): + def __init__( + self, + data: List[Tuple[np.ndarray, str]], + data_type: str, + augmentor: Augmentor, + vocab: vocab_utils.LangTokenVocab, + input_length: int, + first_pred_feature: str, + caption_path: Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The data is distributed on-the-fly by the IterTuneCompiler. + Pitch, Chord augmentation is applied to the training data every iteration. + Segmentation is applied every epoch for the training data. + ''' + super().__init__() + self.data_list = data + self.data_type = data_type + self.augmentor = augmentor + self.eos_token = vocab.eos_token + self.compile_function = VanillaTransformer_compiler( + data_list=self.data_list, + augmentor=self.augmentor, + eos_token=self.eos_token, + input_length=input_length, + first_pred_feature=first_pred_feature, + encoding_scheme=vocab.encoding_scheme + ) + self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False) + self.random_seed = 0 + + def __iter__(self): + # This will yield ([segment, mask], tune_name2segment, segment2tune_name) + generator = self.compile_function.make_segments_iters(self.data_type) + for ([segment, mask], tune_name2segment, segment2tune_name) in generator: + # print(len(segment2tune_name), len(tune_name2segment)) + tune_name = segment2tune_name[-1] # Get the last tune name from the segment2tune_name list + # print(f"Processing tune: {tune_name}") + try: + encoded_caption = self.t5_tokenizer(tune_name, return_tensors='pt', padding='max_length', truncation=True, max_length=128) + except Exception as e: + encoded_caption = self.t5_tokenizer("No caption available", return_tensors='pt', padding='max_length', truncation=True, max_length=128) + if self.data_type == 'train': + segment = self.augmentor(segment) + # use input_ids replace tune_name + tune_name = encoded_caption['input_ids'][0] # Use the input_ids from the encoded caption + yield segment, mask, tune_name, encoded_caption + + def __len__(self): + # If you want to use __len__, you need to know the number of segments in advance. + # Otherwise, you can raise an exception or return a default value. + raise NotImplementedError("IterTuneCompiler is an iterable dataset and does not support __len__.") + +class SymbolicMusicDataset(Dataset): + def __init__( + self, + vocab: vocab_utils.LangTokenVocab, + encoding_scheme: str, + num_features: int, + debug: bool, + aug_type: Union[str, None], + input_length: int, + first_pred_feature: str, + caption_path: Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The vocabulary containing token representations for the dataset + The encoding scheme used for representing symbolic music (e.g., REMI, NB, etc.) 
+ The number of features used for the dataset + Debug mode; limits dataset size for faster testing if enabled + Type of data augmentation to apply, if 'random' the compiler will apply pitch and chord augmentation + Length of the input sequence for each sample + Feature to predict first which is used for compound shift for NB, if not shift, 'type' is used + ''' + super().__init__() + # Initializing instance variables + self.encoding_scheme = encoding_scheme + self.num_features = num_features + self.debug = debug + self.input_length = input_length + self.first_pred_feature = first_pred_feature + self.caption_path = caption_path + self.for_evaluation = for_evaluation + + # Load the vocabulary passed into the constructor + self.vocab = vocab + + # Initialize augmentor for data augmentation + self.augmentor = Augmentor(vocab=self.vocab, aug_type=aug_type, input_length=input_length) + + # Load preprocessed tune indices + if self.for_evaluation: + # For evaluation, we load the tune indices without any augmentation + self.tune_in_idx, self.len_tunes, self.file_name_list = [], [], [] + else: + self.tune_in_idx, self.len_tunes, self.file_name_list = self._load_tune_in_idx() + # Plot the histogram of tune lengths for analysis + dataset_name = self.__class__.__name__ # Get the class name (dataset name) + len_dir_path = Path(f"len_tunes/{dataset_name}") # Directory to store tune length histograms + len_dir_path.mkdir(parents=True, exist_ok=True) # Create directory if it doesn't exist + if self. for_evaluation is False: + self._plot_hist(self.len_tunes, len_dir_path / f"len_{encoding_scheme}{num_features}.png") + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + # Load preprocessed tune indices from .npz files + print("preprocessed tune_in_idx data is being loaded") + + # List of files containing tune index data + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + + # If debug mode is enabled, limit the number of loaded files + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + + # Initialize dictionaries and lists for storing tune index data, tune lengths, and file names + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # Load tune index data from each .npz file + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] # Load the numpy array from the file + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx # Store the tune indices in the dictionary + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) # Record the length of the tune + file_name_list.append(tune_in_idx_file.stem) # Append the file name (without extension) + + return tune_in_idx_dict, len_tunes, file_name_list # Return the data structures + + def _plot_hist(self, len_tunes, path_outfile): + # Plot histogram of tune lengths and save the plot + Path(path_outfile).parent.mkdir(parents=True, exist_ok=True) # Ensure the directory for the plot exists + + # Convert tune lengths to a NumPy array + data = np.array(list(len_tunes.values())) + + # Compute mean and standard deviation of tune lengths + self.mean_len_tunes = np.mean(data) + data_mean = np.mean(data) + data_std = np.std(data) + + # cumpute the total length of all tunes + self.total_len_tunes = np.sum(data) + + # Plot the histogram + plt.figure(dpi=100) + plt.hist(data, bins=50) + plt.title(f"mean: 
{data_mean:.2f}, std: {data_std:.2f}, total: {self.total_len_tunes}, num_tunes: {len(data)}") + plt.savefig(path_outfile) # Save the plot to file + plt.close() # Close the plot to free memory + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + # Split the dataset into train, validation, and test sets based on the given ratio + shuffled_tune_names = list(self.tune_in_idx.keys()) # Get the list of all tune names + random.seed(seed) # Set the seed for reproducibility + random.shuffle(shuffled_tune_names) # Shuffle the tune names + + # Compute the number of training, validation, and test samples + num_train = int(len(shuffled_tune_names) * ratio) + num_valid = int(len(shuffled_tune_names) * (1 - ratio) / 2) + + # Split the tune names into training, validation, and test sets + train_names = shuffled_tune_names[:num_train] + valid_names = shuffled_tune_names[num_train:num_train + num_valid] + test_names = shuffled_tune_names[num_train + num_valid:] + + return train_names, valid_names, test_names, shuffled_tune_names # Return the split lists + + def split_train_valid_test_set(self, dataset_name=None, ratio=None, seed=42, save_dir=None, for_evaluation: bool = False): + # Split the dataset into train, validation, and test sets or load an existing split + if not Path(f"metadata/{dataset_name}_caption_metadata.json").exists(): + # If no metadata exists, perform a random split and save metadata + assert ratio is not None, "ratio should be given when you make metadata for split" + + # Perform the split + train_names, valid_names, test_names, shuffled_tune_names = self._get_split_list_from_tune_in_idx(ratio, seed) + + # Log the split information + print(f"Randomly split train and test set using seed {seed}") + out_dict = {'shuffle_seed': seed, # Seed used for shuffling + 'shuffled_names': shuffled_tune_names, # Shuffled list of tune names + 'train': train_names, # Training set names + 'valid': valid_names, # Validation set names + 'test': test_names} # Test set names + + # Save the split metadata to a JSON file + with open(f"metadata/{dataset_name}_caption_metadata.json", "w") as f: + json.dump(out_dict, f, indent=2) + else: + # If metadata already exists, load it + with open(f"metadata/{dataset_name}_caption_metadata.json", "r") as f: + out_dict = json.load(f) + + # Ensure that the loaded data matches the current dataset + train_names, valid_names, test_names = out_dict['train'], out_dict['valid'], out_dict['test'] + if self.for_evaluation is False: + assert set(out_dict['shuffled_names']) == set(self.tune_in_idx.keys()), "Loaded data is not matched with the recorded metadata" + + # Prepare training, validation, and test datasets using the TuneCompiler + if self.for_evaluation: + # For evaluation, we do not need to create train and valid datasets + train_data = [] + valid_data = [] + self.test_data = [] + else: + train_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in train_names] + valid_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in valid_names] + self.test_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in test_names] + + # Initialize TuneCompiler objects for each split + # if self.for_evaluation: + # train_dataset = None # No training dataset for evaluation + # valid_dataset = None + # test_dataset = TuneCompiler(data=self.test_data, data_type='test', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + # else: + train_dataset = IterTuneCompiler(data=train_data, 
data_type='train', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + valid_dataset = TuneCompiler(data=valid_data, data_type='valid', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + test_dataset = TuneCompiler(data=self.test_data, data_type='test', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + + # Save metadata to a directory if specified + if save_dir is not None: + Path(save_dir).mkdir(parents=True, exist_ok=True) + with open(Path(save_dir) / f"{dataset_name}_metadata.json", "w") as f: + json.dump(out_dict, f, indent=2) + + # Return the datasets for training, validation, and testing + return train_dataset, valid_dataset, test_dataset + +class Pop1k7(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + +class SymphonyMIDI(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + +class LakhClean(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, 
shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class LakhClean(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class ariamidi(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, 
debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class gigamidi(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = 
OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class PretrainingDataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx_aria(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the aria dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_ariamidi/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_giga(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the gigamidi dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_gigamidi/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + 
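The _get_split_list_from_tune_in_idx variants above all follow the same pattern. The following is a compressed, illustrative sketch (hypothetical helper name; the real methods operate on self.tune_in_idx keys and also return the shuffled name list) of how versioned Lakh entries such as 'songA.1' and 'songA.2' are kept in the same split.

import re
import random
from collections import defaultdict

def split_by_song(tune_names, ratio=0.9, seed=42):
    # Strip a trailing ".<version>" so every version of a song lands in the same split.
    groups = defaultdict(list)
    for name in tune_names:
        groups[re.sub(r"\.\d+$", "", name)].append(name)
    songs = list(groups.keys())
    random.Random(seed).shuffle(songs)
    n_train = int(len(songs) * ratio)
    n_valid = int(len(songs) * (1 - ratio) / 2)
    expand = lambda subset: [tune for song in subset for tune in groups[song]]
    return (expand(songs[:n_train]),
            expand(songs[n_train:n_train + n_valid]),
            expand(songs[n_train + n_valid:]))

train, valid, test = split_by_song(["songA.1", "songA.2", "songB", "songC.1"])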
file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if "drums-only" in tune_in_idx_file.stem: + print(f"skipping {tune_in_idx_file.stem} as it is a drums-only file") + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_pop1k7(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the Pop1k7 dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_pop1k7/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_sod(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the SOD dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_SOD/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_lakh(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_LakhALLFined/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.lakh_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = 
item["caption"] + except Exception: + continue + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_xmidi(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_XMIDI_Dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.xmidi_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_new(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_new_dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + 
# load caption + self.caption_list = [] + with open(self.new_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + self.lakh_caption_path = "dataset/represented_data/tuneidx/train_set.json" + self.xmidi_caption_path = "dataset/represented_data/tuneidx/all_captions.json" + self.new_caption_path = "dataset/represented_data/tuneidx/new_dataset_captions_final.jsonl" + + # load all tune_in_idx data from aria, giga datasets + tune_in_idx_giga, len_tunes_giga, file_name_list_giga = self._load_tune_in_idx_giga() + tune_in_idx_aria, len_tunes_aria, file_name_list_aria = self._load_tune_in_idx_aria() + tune_in_idx_lakh, len_tunes_lakh, file_name_list_lakh = self._load_tune_in_idx_lakh() + tune_in_idx_xmidi, len_tunes_xmidi, file_name_list_xmidi = self._load_tune_in_idx_xmidi() + tune_in_idx_new, len_tunes_new, file_name_list_new = self._load_tune_in_idx_new() + + # merge the two datasets + tune_in_idx = {**tune_in_idx_aria, **tune_in_idx_giga, **tune_in_idx_lakh, **tune_in_idx_xmidi, **tune_in_idx_new} + len_tunes = {**len_tunes_aria, **len_tunes_giga, **len_tunes_lakh, **len_tunes_xmidi, **len_tunes_new} + file_name_list = file_name_list_aria + file_name_list_giga + file_name_list_lakh + file_name_list_xmidi + file_name_list_new + print(f"number of loaded tunes: {len(tune_in_idx)}") + return tune_in_idx, len_tunes, file_name_list + + +class SOD(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/SOD_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + +class BachChorale(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + +class Pop909(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Pop909 dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"-v\d+$", "", tune) for tune in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + + +class LakhALL(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # # remove file in tune_in_idx_list + # location2caption[item["location"]] = "test_set" + # continue + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key, None) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + print(f"Caption for {location_key} is None, skipping this tune") + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class LakhALLFined(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation 
quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + try: + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + except: + print("Error filtering None values in tune_in_idx, skipping filtering") + return [], [], [], [] + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class XMIDI_Dataset(SymbolicMusicDataset): + def __init__(self, vocab, 
encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + print(f"Processing file: {tune_in_idx_file.stem}, location_key: {location_key}") + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + 
train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class new_dataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.split("/")[-1] + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + 
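# (Illustrative note, not part of the original commit: the version suffix is stripped at
#  this step so that every rendition of a song lands in the same split, e.g. both
#  "somesong" and "somesong.2" are grouped under the key "somesong"; with e.g. ratio=0.9,
#  the remaining 10% of unique songs is halved between the validation and test splits.)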
song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class SymphonyNet_Dataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"/data2/suhongju/research/music-generation/BandZero/SymphonyNet_Dataset/{location_key}" + try: + caption = location2caption.get(location_key, None) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + print(f"Caption for {location_key} is None, skipping this tune") + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = shuffled_tune_names + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = 
list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +# use lakhAllFined, XMIDI_dataset, new_dataset, as finetune dataset +class FinetuneDataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + + def _load_tune_in_idx_lakh(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_LakhALLFined/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.lakh_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_xmidi(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_XMIDI_Dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.xmidi_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_new(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_new_dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.new_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = 
len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load tune_in_idx from all three datasets + ''' + self.lakh_caption_path = "dataset/represented_data/tuneidx/train_set.json" + self.xmidi_caption_path = "dataset/represented_data/tuneidx/all_captions.json" + self.new_caption_path = "dataset/represented_data/tuneidx/new_dataset_captions_final.jsonl" + + tune_in_idx_lakh, len_tunes_lakh, file_name_list_lakh = self._load_tune_in_idx_lakh() + tune_in_idx_xmidi, len_tunes_xmidi, file_name_list_xmidi = self._load_tune_in_idx_xmidi() + tune_in_idx_new, len_tunes_new, file_name_list_new = self._load_tune_in_idx_new() + # 合并三个数据集 + tune_in_idx = {**tune_in_idx_lakh, **tune_in_idx_xmidi, **tune_in_idx_new} + len_tunes = {**len_tunes_lakh, **len_tunes_xmidi, **len_tunes_new} + file_name_list = file_name_list_lakh + file_name_list_xmidi + file_name_list_new + print(f"number of loaded tunes: {len(tune_in_idx)}") + return tune_in_idx, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + try: + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + except: + print("Error filtering None values in tune_in_idx, skipping filtering") + return [], [], [], [] + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/decoding_utils.py b/Amadeus/symbolic_encoding/decoding_utils.py new file mode 100644 index 0000000..99312a3 --- /dev/null +++ b/Amadeus/symbolic_encoding/decoding_utils.py @@ -0,0 +1,404 @@ +import os, sys +from pathlib import Path + +import matplotlib.pyplot as plt +from collections import defaultdict + +from music21 import converter +import muspy +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument, TempoChange, Note, TimeSignature + +from .midi2audio import FluidSynth +from data_representation.constants import PROGRAM_INSTRUMENT_MAP + +class MuteWarn: + def __enter__(self): + self._init_stdout = sys.stdout + sys.stdout = open(os.devnull, "w") + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stdout.close() + sys.stdout = self._init_stdout + 
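# Illustrative sketch (not part of the original commit): MuteWarn temporarily redirects
# stdout to os.devnull, and the helper functions below use it to silence music21/muspy
# chatter while rendering a decoded MIDI file to score images, piano rolls, or audio.
with MuteWarn():
    print("swallowed by os.devnull")      # nothing appears while the context is active
print("stdout is restored after the block")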
+def save_score_image_from_midi(midi_fn, file_name): + assert isinstance(midi_fn, str) + with MuteWarn(): + convert = converter.parse(midi_fn) + convert.write('musicxml.png', fp=file_name) + +def save_pianoroll_image_from_midi(midi_fn, file_name): + assert isinstance(midi_fn, str) + midi_obj_muspy = muspy.read_midi(midi_fn) + midi_obj_muspy.show_pianoroll(track_label='program', preset='frame') + plt.gcf().set_size_inches(20, 10) + plt.savefig(file_name) + plt.close() + +def save_wav_from_midi(midi_fn, file_name, qpm=80): + assert isinstance(midi_fn, str) + with MuteWarn(): + music = muspy.read_midi(midi_fn) + music.tempos[0].qpm = qpm + music.write_audio(file_name, rate=44100, gain=3) + +def save_wav_from_midi_fluidsynth(midi_fn, file_name, gain=3): + assert isinstance(midi_fn, str) + fs = FluidSynth(gain=gain) + fs.midi_to_audio(midi_fn, file_name) + +class MidiDecoder4REMI: + def __init__( + self, + vocab, + in_beat_resolution, + dataset_name + ): + self.vocab = vocab + self.in_beat_resolution = in_beat_resolution + self.dataset_name = dataset_name + if dataset_name == 'SymphonyMIDI': + self.gain = 0.7 + elif dataset_name == 'SOD' or dataset_name == 'LakhClean': + self.gain = 1.1 + elif dataset_name == 'Pop1k7' or dataset_name == 'Pop909': + self.gain = 2.5 + else: + self.gain = 1.5 + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: list of tensor, the tensor + ''' + idx2event = self.vocab.idx2event + if generated_output.dim() == 2: + generated_output = generated_output.squeeze(0) + events = [idx2event[token.item()] for token in generated_output] + + midi_obj = miditoolkit.midi.parser.MidiFile() + if 'tempo' not in idx2event.keys(): + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + instr_notes_dict = defaultdict(list) + for i in range(len(events)-2): + cur_event = events[i] + # print(cur_event) + name = cur_event.split('_')[0] + attr = cur_event.split('_') + if name == 'Bar': + bar_pos += cur_bar_resol + if 'time' in cur_event: + cur_num, cur_denom = attr[-1].split('/') + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif name == 'Beat': + beat_pos = int(attr[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks + elif name == 'Chord': + chord_text = attr[1] + '_' + attr[2] + midi_obj.markers.append(Marker(text=chord_text, time=cur_pos)) + elif name == 'Tempo': + midi_obj.tempo_changes.append( + TempoChange(tempo=int(attr[1]), time=cur_pos)) + elif name == 'Instrument': + cur_instr = int(attr[1]) + else: + if len(self.vocab.feature_list) == 7 or len(self.vocab.feature_list) == 8: + if 'Note_Pitch' in events[i] and \ + 'Note_Duration' in events[i+1] and \ + 'Note_Velocity' in events[i+2]: + pitch = int(events[i].split('_')[-1]) + duration = int(events[i+1].split('_')[-1]) + duration = duration * default_in_beat_ticks + end = cur_pos + duration + velocity = int(events[i+2].split('_')[-1]) + instr_notes_dict[cur_instr].append( + Note( + pitch=pitch, + start=cur_pos, + end=end, + velocity=velocity)) + elif len(self.vocab.feature_list) == 4 or len(self.vocab.feature_list) == 5: + if 'Note_Pitch' in 
events[i] and \ + 'Note_Duration' in events[i+1]: + pitch = int(events[i].split('_')[-1]) + duration = int(events[i+1].split('_')[-1]) + duration = duration * default_in_beat_ticks + end = cur_pos + duration + velocity = 90 + instr_notes_dict[cur_instr].append( + Note( + pitch=pitch, + start=cur_pos, + end=end, + velocity=velocity)) + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + # make subdir + music_path = os.path.join(os.path.dirname(output_path), 'music') + prompt_music_path = os.path.join(os.path.dirname(output_path), 'prompt_music') + if not os.path.exists(music_path): + os.makedirs(music_path) + if not os.path.exists(prompt_music_path): + os.makedirs(prompt_music_path) + # if not contain 'prompt' in output_path, save prompt music + if 'prompt' in output_path: + music_path = os.path.join(prompt_music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + else: + music_path = os.path.join(music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + + midi_obj.dump(output_path) + # save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, music_path, gain=self.gain) + return midi_obj + +class MidiDecoder4CP(MidiDecoder4REMI): + def __init__(self, vocab, in_beat_resolution, dataset_name): + super().__init__(vocab, in_beat_resolution, dataset_name) + + def _update_chord_tempo(self, midi_obj, cur_pos, token_with_7infos, feature2idx): + if len(feature2idx) == 7 or len(feature2idx) == 8: + # chord + if token_with_7infos[feature2idx['chord']] != 'CONTI' and token_with_7infos[feature2idx['chord']] != 0: + midi_obj.markers.append( + Marker(text=str(token_with_7infos[feature2idx['chord']]), time=cur_pos)) + # tempo + if token_with_7infos[feature2idx['tempo']] != 'CONTI' and token_with_7infos[feature2idx['tempo']] != 0 and token_with_7infos[feature2idx['tempo']] != "Tempo_N_N": + tempo = int(token_with_7infos[feature2idx['tempo']].split('_')[-1]) + midi_obj.tempo_changes.append( + TempoChange(tempo=tempo, time=cur_pos)) + return midi_obj + elif len(feature2idx) == 4 or len(feature2idx) == 5: + return midi_obj + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: tensor, batch x seq_len x num_types + num_types includes: type, tempo, chord,'beat, pitch, duration, velocity + ''' + idx2event = self.vocab.idx2event + feature_keys = self.vocab.feature_list + feature2idx = {key: idx for idx, key in enumerate(feature_keys)} + + midi_obj = miditoolkit.midi.parser.MidiFile() + if len(feature2idx) == 4 or len(feature2idx) == 5: + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + instr_notes_dict = defaultdict(list) + generated_output = generated_output.squeeze(0) + for i in range(len(generated_output)): + token_with_7infos = [] + for kidx, key in enumerate(feature_keys): + token_with_7infos.append(idx2event[key][generated_output[i][kidx].item()]) + # type token + if 'time_signature' in token_with_7infos[feature2idx['type']]: + cur_num, 
cur_denom = token_with_7infos[feature2idx['type']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif token_with_7infos[feature2idx['type']] == 'Metrical': + if 'time_signature' in token_with_7infos[feature2idx['beat']]: + cur_num, cur_denom = token_with_7infos[feature2idx['beat']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif token_with_7infos[feature2idx['beat']] == 'Bar': + bar_pos += cur_bar_resol + elif 'Beat' in str(token_with_7infos[feature2idx['beat']]): + beat_pos = int(token_with_7infos[feature2idx['beat']].split('_')[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks # ticks + # chord and tempo + midi_obj = self._update_chord_tempo(midi_obj, cur_pos, token_with_7infos, feature2idx) + elif token_with_7infos[feature2idx['type']] == 'Note': + # instrument token + if len(feature2idx) == 8 or len(feature2idx) == 5: + if token_with_7infos[feature2idx['instrument']] != 0 and token_with_7infos[feature2idx['instrument']] != 'CONTI': + cur_instr = int(token_with_7infos[feature2idx['instrument']].split('_')[-1]) + else: + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + try: + pitch = token_with_7infos[feature2idx['pitch']].split('_')[-1] + duration = token_with_7infos[feature2idx['duration']].split('_')[-1] + duration = int(duration) * default_in_beat_ticks + if len(feature2idx) == 7 or len(feature2idx) == 8: + velocity = token_with_7infos[feature2idx['velocity']].split('_')[-1] + else: + velocity = 80 + end = cur_pos + duration + instr_notes_dict[cur_instr].append( + Note( + pitch=int(pitch), + start=cur_pos, + end=end, + velocity=int(velocity)) + ) + except: + continue + else: # when new bar started without beat + continue + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + output_music_dir = os.path.join(os.path.dirname(output_path), 'music') + if not os.path.exists(output_music_dir): + os.makedirs(output_music_dir) + midi_obj.dump(output_path) + save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, output_music_dir.replace('.mid', '.wav'), gain=self.gain) + return midi_obj + +class MidiDecoder4NB(MidiDecoder4REMI): + def __init__(self, vocab, in_beat_resolution, dataset_name): + super().__init__(vocab, in_beat_resolution, dataset_name) + + def _update_additional_info(self, midi_obj, cur_pos, token_with_7infos, feature2idx): + if len(feature2idx) == 7 or len(feature2idx) == 8: + # chord + if token_with_7infos[feature2idx['chord']] != 'CONTI' and token_with_7infos[feature2idx['chord']] != 0 and token_with_7infos[feature2idx['chord']] != 'Chord_N_N': + midi_obj.markers.append( + Marker(text=str(token_with_7infos[feature2idx['chord']]), 
time=cur_pos)) + # tempo + if token_with_7infos[feature2idx['tempo']] != 'CONTI' and token_with_7infos[feature2idx['tempo']] != 0 and token_with_7infos[feature2idx['tempo']] != "Tempo_N_N": + tempo = int(token_with_7infos[feature2idx['tempo']].split('_')[-1]) + midi_obj.tempo_changes.append( + TempoChange(tempo=tempo, time=cur_pos)) + return midi_obj + elif len(feature2idx) == 4 or len(feature2idx) == 5: + return midi_obj + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: tensor, batch x seq_len x num_types + num_types includes: type, beat, chord, tempo, intrument, pitch, duration, velocity + ''' + idx2event = self.vocab.idx2event + feature_keys = self.vocab.feature_list + feature2idx = {key: idx for idx, key in enumerate(feature_keys)} + + midi_obj = miditoolkit.midi.parser.MidiFile() + if len(feature2idx) == 4 or len(feature2idx) == 5: + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + instr_notes_dict = defaultdict(list) + generated_output = generated_output.squeeze(0) + for i in range(len(generated_output)): + token_with_7infos = [] + for kidx, key in enumerate(feature_keys): + token_with_7infos.append(idx2event[key][generated_output[i][kidx].item()]) + # type token + if token_with_7infos[feature2idx['type']] == 'Empty_Bar' or token_with_7infos[feature2idx['type']] == 'SNN': + bar_pos += cur_bar_resol + elif 'NNN' in token_with_7infos[feature2idx['type']]: + cur_num, cur_denom = token_with_7infos[feature2idx['type']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + # instrument token + if len(feature2idx) == 8 or len(feature2idx) == 5: + if token_with_7infos[feature2idx['instrument']] != 0 and token_with_7infos[feature2idx['instrument']] != 'CONTI': + cur_instr = int(token_with_7infos[feature2idx['instrument']].split('_')[-1]) + else: + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + if 'Beat' in str(token_with_7infos[feature2idx['beat']]) or 'CONTI' in str(token_with_7infos[feature2idx['beat']]): + if 'Beat' in str(token_with_7infos[feature2idx['beat']]): # when beat is not CONTI beat is updated + beat_pos = int(token_with_7infos[feature2idx['beat']].split('_')[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks # ticks + # update chord and tempo + midi_obj = self._update_additional_info(midi_obj, cur_pos, token_with_7infos, feature2idx) + # note + try: + pitch = token_with_7infos[feature2idx['pitch']].split('_')[-1] + duration = token_with_7infos[feature2idx['duration']].split('_')[-1] # duration between 1~192 + duration = int(duration) * default_in_beat_ticks + if len(feature2idx) == 7 or len(feature2idx) == 8: + velocity = token_with_7infos[feature2idx['velocity']].split('_')[-1] + else: + velocity = 90 + end = cur_pos + duration + instr_notes_dict[cur_instr].append( + Note( + pitch=int(pitch), + start=cur_pos, + end=end, + velocity=int(velocity)) + ) + except: + continue + else: # when new bar started without beat + continue + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: 
is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + music_path = os.path.join(os.path.dirname(output_path), 'music') + prompt_music_path = os.path.join(os.path.dirname(output_path), 'prompt_music') + if not os.path.exists(music_path): + os.makedirs(music_path) + if not os.path.exists(prompt_music_path): + os.makedirs(prompt_music_path) + # if not contain 'prompt' in output_path, save prompt music + if 'prompt' in output_path: + music_path = os.path.join(prompt_music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + else: + music_path = os.path.join(music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + midi_obj.dump(output_path) + # save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, music_path, gain=self.gain) + return midi_obj diff --git a/Amadeus/symbolic_encoding/metric_utils.py b/Amadeus/symbolic_encoding/metric_utils.py new file mode 100644 index 0000000..138c9e0 --- /dev/null +++ b/Amadeus/symbolic_encoding/metric_utils.py @@ -0,0 +1,208 @@ +import torch +import numpy as np + +from collections import Counter + +# TODO: refactor hard coded values +def check_syntax_errors_in_inference_for_nb(generated_output, feature_list): + generated_output = generated_output.squeeze(0) + type_idx = feature_list.index('type') + beat_idx = feature_list.index('beat') + type_beat_list = [] + for token in generated_output: + type_beat_list.append((token[type_idx].item(), token[beat_idx].item())) # type, beat + + last_note = 1 + beat_type_unmatched_error_list = [] + num_unmatched_errors = 0 + beat_backwards_error_list = [] + num_backwards_errors = 0 + for type_beat in type_beat_list: + if type_beat[0] == 4: # same bar, new beat + if type_beat[1] == 0 or type_beat[1] == 1: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(type_beat) + if type_beat[1] <= last_note: + num_backwards_errors += 1 + beat_backwards_error_list.append([last_note, type_beat]) + else: + last_note = type_beat[1] # update last note + elif type_beat[0] >= 5: # new bar, new beat + if type_beat[1] == 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(type_beat) + last_note = 1 + unmatched_error_rate = num_unmatched_errors / len(type_beat_list) + backwards_error_rate = num_backwards_errors / len(type_beat_list) + type_beat_errors_dict = {'beat_type_unmatched_error': unmatched_error_rate, 'beat_backwards_error': backwards_error_rate} + return type_beat_errors_dict + +def check_syntax_errors_in_inference_for_cp(generated_output, feature_list): + generated_output = generated_output.squeeze(0) + type_idx = feature_list.index('type') + beat_idx = feature_list.index('beat') + pitch_idx = feature_list.index('pitch') + duration_idx = feature_list.index('duration') + last_note = 1 + beat_type_unmatched_error_list = [] + num_unmatched_errors = 0 + beat_backwards_error_list = [] + num_backwards_errors = 0 + for token in generated_output: + if token[type_idx].item() == 2: # Metrical + if token[pitch_idx].item() != 0 or token[duration_idx].item() != 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(token) + if token[beat_idx].item() == 1: # new bar + last_note = 1 # last note will be updated in the next token + elif token[beat_idx].item() != 0 and token[beat_idx].item() <= last_note: + 
num_backwards_errors += 1 + last_note = token[beat_idx].item() # update last note + beat_backwards_error_list.append([last_note, token]) + else: + last_note = token[beat_idx].item() # update last note + if token[type_idx].item() == 3: # Note + if token[beat_idx].item() != 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(token) + unmatched_error_rate = num_unmatched_errors / len(generated_output) + backwards_error_rate = num_backwards_errors / len(generated_output) + type_beat_errors_dict = {'beat_type_unmatched_error': unmatched_error_rate, 'beat_backwards_error': backwards_error_rate} + return type_beat_errors_dict + +def check_syntax_errors_in_inference_for_remi(generated_output, vocab): + generated_output = generated_output.squeeze(0) + # to check duration errors + beat_mask = vocab.total_mask['beat'].to(generated_output.device) + beat_mask_for_target = beat_mask[generated_output] + beat_target = generated_output * beat_mask_for_target + bar_mask = vocab.total_mask['type'].to(generated_output.device) + bar_mask_for_target = bar_mask[generated_output] + bar_target = (generated_output+1) * bar_mask_for_target # as bar token in 0 in remi vocab, we add 1 to bar token + target = beat_target + bar_target + target = target[target!=0] + # collect beats in between bars(idx=1) + num_backwards_errors = 0 + collected_beats = [] + total_beats = 0 + for token in target: + if token == 1 or 3 <= token <= 26: # Bar_None, or Bar_time_signature + collected_beats_tensor = torch.tensor(collected_beats) + diff = torch.diff(collected_beats_tensor) + num_error_beats = torch.where(diff<=0)[0].shape[0] + num_backwards_errors += num_error_beats + collected_beats = [] + else: + collected_beats.append(token.item()) + total_beats += 1 + if total_beats != 0: + backwards_error_rate = num_backwards_errors / total_beats + else: + backwards_error_rate = 0 + # print(f"error rate in beat backwards: {backwards_error_rate}") + return {'beat_backwards_error': backwards_error_rate} + +def type_beat_errors_in_validation_nb(beat_prob, answer_type, input_beat, mask): + bool_mask = mask.bool().flatten() # (b*t) + pred_beat_idx = torch.argmax(beat_prob, dim=-1).flatten() # (b*t) + valid_pred_beat_idx = pred_beat_idx[bool_mask] # valid beat_idx + answer_type = answer_type.flatten() # (b*t) + valid_type_input = answer_type[bool_mask] # valid answer_type + type_beat_list = [] + for i in range(len(valid_pred_beat_idx)): + type_beat_list.append((valid_type_input[i].item(), valid_pred_beat_idx[i].item())) # type, beat + input_beat = input_beat.flatten() + valid_input_beat = input_beat[bool_mask] + + last_note = 1 + num_unmatched_errors = 0 + num_backwards_errors = 0 + for type_beat, input_beat_idx in zip(type_beat_list, valid_input_beat): + # update last note + if input_beat_idx.item() >= 1: # beat + last_note = input_beat_idx.item() + if type_beat[0] == 4: # same bar, new beat + if type_beat[1] == 0 or type_beat[1] == 1: + num_unmatched_errors += 1 + if type_beat[1] <= last_note: + num_backwards_errors += 1 + elif type_beat[0] >= 5: # new bar, new beat + if type_beat[1] == 0: + num_unmatched_errors += 1 + return len(type_beat_list), num_unmatched_errors, num_backwards_errors + +def type_beat_errors_in_validation_cp(beat_prob, answer_type, input_beat, mask): + bool_mask = mask.bool().flatten() # (b*t) + beat_idx = torch.argmax(beat_prob, dim=-1).flatten() # (b*t) + valid_beat_idx = beat_idx[bool_mask] # valid beat_idx + answer_type = answer_type.flatten() # (b*t) + valid_type_input = answer_type[bool_mask] # valid 
answer_type + type_beat_list = [] + for i in range(len(valid_beat_idx)): + type_beat_list.append((valid_type_input[i].item(), valid_beat_idx[i].item())) # type, beat + input_beat = input_beat.flatten() + valid_input_beat = input_beat[bool_mask] + + last_note = 1 + num_unmatched_errors = 0 + num_backwards_errors = 0 + for type_beat, input_beat_idx in zip(type_beat_list, valid_input_beat): + # update last note + if input_beat_idx.item() == 1: # bar + last_note = 1 + elif input_beat_idx.item() >= 2: # new beat + last_note = input_beat_idx.item() + # check errors + if type_beat[0] == 2: # Metrical + if type_beat[1] == 0: # ignore + num_unmatched_errors += 1 + elif type_beat[1] >= 2: # new beat + if type_beat[1] <= last_note: + num_backwards_errors += 1 + elif type_beat[0] == 3: # Note + if type_beat[1] != 0: + num_unmatched_errors += 1 + return len(type_beat_list), num_unmatched_errors, num_backwards_errors + +def get_beat_difference_metric(prob_dict, arranged_prob_dict, mask): + orign_beat_prob = prob_dict['beat'] # b x t x vocab_size + arranged_beat_prob = arranged_prob_dict['beat'] # b x t x vocab_size + + # calculate similarity between original beat prob and arranged beat prob + origin_beat_token = torch.argmax(orign_beat_prob, dim=-1) * mask # b x t + arranged_beat_token = torch.argmax(arranged_beat_prob, dim=-1) * mask # b x t + num_same_beat = torch.sum(origin_beat_token == arranged_beat_token) - torch.sum(mask==0) + num_beat = torch.sum(mask==1) + beat_sim = (num_same_beat / num_beat).item() # scalar + + # apply mask, shape of mask: b x t + orign_beat_prob = orign_beat_prob * mask.unsqueeze(-1) # b x t x vocab_size + arranged_beat_prob = arranged_beat_prob * mask.unsqueeze(-1) + + # calculate cosine similarity between original beat prob and arranged beat prob + orign_beat_prob = orign_beat_prob.flatten(0,1) # (b*t) x vocab_size + arranged_beat_prob = arranged_beat_prob.flatten(0,1) # (b*t) x vocab_size + cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6) + beat_cos_sim = cos(orign_beat_prob, arranged_beat_prob) # (b*t) + # exclude invalid tokens, zero padding tokens + beat_cos_sim = beat_cos_sim[mask.flatten().bool()] # num_valid_tokens + beat_cos_sim = torch.mean(beat_cos_sim).item() # scalar + return {'beat_cos_sim': beat_cos_sim, 'beat_sim': beat_sim} + +def get_gini_coefficient(generated_output): + if len(generated_output.shape) == 3: + generated_output = generated_output.squeeze(0).tolist() + gen_list = [tuple(x) for x in generated_output] + else: + gen_list = generated_output.squeeze(0).tolist() + counts = Counter(gen_list).values() + sorted_counts = sorted(counts) + n = len(sorted_counts) + cumulative_counts = np.cumsum(sorted_counts) + cumulative_proportion = cumulative_counts / cumulative_counts[-1] + + lorenz_area = sum(cumulative_proportion[:-1]) / n # Exclude the last element + equality_area = 0.5 # The area under line of perfect equality + + gini = (equality_area - lorenz_area) / equality_area + return gini \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/midi2audio.py b/Amadeus/symbolic_encoding/midi2audio.py new file mode 100644 index 0000000..ddbae0f --- /dev/null +++ b/Amadeus/symbolic_encoding/midi2audio.py @@ -0,0 +1,78 @@ +import argparse +import os +import subprocess +from pydub import AudioSegment + +''' +This file is a modified version of midi2audio.py from https://github.com/bzamecnik/midi2audio +Author: Bohumír Zámečník (@bzamecnik) +License: MIT, see the LICENSE file +''' + +__all__ = ['FluidSynth'] + +DEFAULT_SOUND_FONT = 
'/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2' +DEFAULT_SAMPLE_RATE = 48000 +DEFAULT_GAIN = 0.05 +# DEFAULT_SOUND_FONT = "/data2/suhongju/research/music-generation/sound_file/Advent GM 7.sf2" +# DEFAULT_SOUND_FONT = '~/.fluidsynth/default_sound_font.sf2' +# DEFAULT_SAMPLE_RATE = 16000 +# DEFAULT_GAIN = 0.20 + +class FluidSynth(): + def __init__(self, sound_font=DEFAULT_SOUND_FONT, sample_rate=DEFAULT_SAMPLE_RATE, gain=DEFAULT_GAIN): + self.sample_rate = sample_rate + self.sound_font = os.path.expanduser(sound_font) + self.gain = gain + + def midi_to_audio(self, midi_file: str, audio_file: str, verbose=True): + if verbose: + stdout = None + else: + stdout = subprocess.DEVNULL + + # Convert MIDI to WAV + subprocess.call( + ['fluidsynth', '-ni', '-g', str(self.gain), self.sound_font, midi_file, '-F', audio_file, '-r', str(self.sample_rate)], + stdout=stdout + ) + + # Convert WAV to MP3 + # mp3_path = audio_file.replace('.wav', '.mp3') + # AudioSegment.from_wav(audio_file).export(mp3_path, format="mp3") + + # # Delete the temporary WAV file + # os.remove(audio_file) + + def play_midi(self, midi_file): + subprocess.call(['fluidsynth', '-i', '-g', str(self.gain), self.sound_font, midi_file, '-r', str(self.sample_rate)]) + +def parse_args(allow_synth=True): + parser = argparse.ArgumentParser(description='Convert MIDI to audio via FluidSynth') + parser.add_argument('midi_file', metavar='MIDI', type=str) + if allow_synth: + parser.add_argument('audio_file', metavar='AUDIO', type=str, nargs='?') + parser.add_argument('-s', '--sound-font', type=str, + default=DEFAULT_SOUND_FONT, + help='path to a SF2 sound font (default: %s)' % DEFAULT_SOUND_FONT) + parser.add_argument('-r', '--sample-rate', type=int, nargs='?', + default=DEFAULT_SAMPLE_RATE, + help='sample rate in Hz (default: %s)' % DEFAULT_SAMPLE_RATE) + return parser.parse_args() + +def main(allow_synth=True): + args = parse_args(allow_synth) + fs = FluidSynth(args.sound_font, args.sample_rate) + if allow_synth and args.audio_file: + fs.midi_to_audio(args.midi_file, args.audio_file) + else: + fs.play_midi(args.midi_file) + +def main_play(): + """ + A method for the `midiplay` entry point. It omits the audio file from args. 
+ """ + main(allow_synth=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/config-accelerate.yaml b/Amadeus/symbolic_yamls/config-accelerate.yaml new file mode 100644 index 0000000..0769a14 --- /dev/null +++ b/Amadeus/symbolic_yamls/config-accelerate.yaml @@ -0,0 +1,65 @@ +defaults: + # - nn_params: nb8_embSum_NMT + # - nn_params: remi8 + - nn_params: nb8_embSum_diff_t2m_150M_finetunning + # - nn_params: nb8_embSum_diff_t2m_150M_pretraining + # - nn_params: nb8_embSum_subPararell + # - nn_params: nb8_embSum_diff_t2m_150M + + # - nn_params: nb8_embSum_subFeedForward + # - nn_params: nb8_embSum_diff + # nn_params: nb8_SA_diff + # - nn_params: nb8_embSum_diff_main12head16dim512_ave + # - nn_params: nb8_embSum_NMT_main12_head_16_dim512 + # - nn_params: remi8_main12_head_16_dim512 + # - nn_params: nb5_embSum_diff_main12head16dim768_sub3 + +dataset: FinetuneDataset # Pop1k7, Pop909, SOD, LakhClean,PretrainingDataset FinetuneDataset +captions_path: dataset/midicaps/train_set.json + +# dataset: SymphonyNet_Dataset # Pop1k7, Pop909, SOD, LakhClean +# captions_path: dataset/symphonyNet/syd-caption.json + +use_ddp: True # True, False | distributed data parallel +use_fp16: True # True, False | mixed precision training +use_diff: True # True,use diffusion in subdecoder +diff_steps: 8 # number of diffusion steps +use_dispLoss: True +lambda_weight: 0.5 +tau: 0.5 + +train_params: + device: cuda + batch_size: 3 + grad_clip: 1.0 + num_iter: 300000 # total number of iterations + num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference + num_cycles_for_model_checkpoint: 1 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint + iterations_per_training_cycle: 10 # number of iterations for logging training loss + iterations_per_validation_cycle: 5000 # number of iterations for validation process + input_length: 3072 # input sequence length3072 + # you can use focal loss, it it's not used, set focal_gamma to 0 + focal_alpha: 1 + focal_gamma: 0 + # learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details + scheduler : cosinelr + initial_lr: 0.00005 + decay_step_rate: 0.8 # means it will reach its lowest point at decay_step_rate * total_num_iter + num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts' + warmup_steps: 2000 #number of warmup steps + max_lr: 0.00015 + gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts' + # Distributed Data Parallel + world_size: 5 # 0 means no distributed training + gradient_accumulation_steps: 4 # 1 means no gradient accumulation +inference_params: + num_uncond_generation: 1 # number of unconditional generation + num_cond_generation: 3 # number of conditional generation +data_params: + first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument) + split_ratio: 0.998 # train-validation-test split ratio + aug_type: pitch # random, null | pitch and chord augmentation type +general: + debug: False + make_log: True # True, False | update the log file in wandb online to your designated project and entity + infer_and_log: True # True, False | inference and log the results \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/config.yaml b/Amadeus/symbolic_yamls/config.yaml new file mode 100644 index 
0000000..9081dbc --- /dev/null +++ b/Amadeus/symbolic_yamls/config.yaml @@ -0,0 +1,54 @@ +defaults: + # - nn_params: nb8_embSum_NMT + # - nn_params: remi8 + # - nn_params: nb8_embSum_diff + - nn_params: nb8_embSum_subFeedForward + # - nn_params: nb8_SA_diff + # - nn_params: nb8_embSum_diff_main12head16dim512_ave + # - nn_params: nb8_embSum_NMT_main12_head_16_dim512 + # - nn_params: remi8_main12_head_16_dim512 + # - nn_params: nb5_embSum_diff_main12head16dim768_sub3 + +dataset: LakhClean # Pop1k7, Pop909, SOD, LakhClean +use_ddp: True # True, False | distributed data parallel +use_fp16: True # True, False | mixed precision training +use_diff: True # True, False | use diffusion in the sub-decoder +use_dispLoss: True +lambda_weight: 0.5 +tau: 0.5 +diff_steps: 8 # number of diffusion steps +train_params: + device: cuda + batch_size: 8 + grad_clip: 1.0 + num_iter: 25000 # total number of iterations + num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference + num_cycles_for_model_checkpoint: 10 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint + iterations_per_training_cycle: 10 # number of iterations for logging training loss + iterations_per_validation_cycle: 500 # number of iterations for validation process + input_length: 3072 # input sequence length + # you can use focal loss; if it's not used, set focal_gamma to 0 + focal_alpha: 1 + focal_gamma: 0 + # learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details + scheduler: cosinelr + initial_lr: 0.0001 + decay_step_rate: 0.4 # the lr reaches its lowest point at decay_step_rate * total_num_iter + num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts' + warmup_steps: 2000 # number of warmup steps + max_lr: 0.00015 + gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts' + # Distributed Data Parallel + world_size: 5 # 0 means no distributed training + gradient_accumulation_steps: 1 # 1 means no gradient accumulation +inference_params: + num_uncond_generation: 1 # number of unconditional generations + num_cond_generation: 3 # number of conditional generations +data_params: + first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument) + split_ratio: 0.99 # train-validation-test split ratio + aug_type: null # random, null | pitch and chord augmentation type +general: + debug: False + make_log: True # True, False | update the log file in wandb online to your designated project and entity + infer_and_log: True # True, False | inference and log the results \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml new file mode 100644 index 0000000..88b6ef7 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file
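The two configs above are Hydra-style: the defaults: list selects one YAML from the nn_params/ group and composes it with the top-level keys (dataset, train_params, inference_params, data_params, general). Below is a minimal loading sketch; it assumes Hydra/OmegaConf composition (suggested by the defaults: list and the DictConfig usage elsewhere in the repo), and the config_path, the override, and the printed values are illustrative assumptions rather than code from this commit.

from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose config.yaml together with the nn_params group file named in its
# defaults list (here nn_params/nb8_embSum_subFeedForward.yaml).
with initialize(version_base=None, config_path="symbolic_yamls"):
    cfg = compose(config_name="config")
    # A different group file can be swapped in with an override, e.g.:
    # cfg = compose(config_name="config", overrides=["nn_params=remi8"])

print(cfg.dataset)                           # LakhClean
print(cfg.nn_params.main_decoder.dim_model)  # 512
print(cfg.train_params.initial_lr)           # 0.0001
print(OmegaConf.to_yaml(cfg.nn_params.sub_decoder))

Keys under nn_params come from the selected group file, while train_params and the other top-level sections come from config.yaml itself.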
diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..2f185d7 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml new file mode 100644 index 0000000..dae9889 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml new file mode 100644 index 0000000..97f2a75 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +partial_sequential_prediction: True +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml new file mode 100644 index 0000000..93ee60f --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..fbcda81 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer 
+input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml new file mode 100644 index 0000000..e991ad3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml new file mode 100644 index 0000000..848b0a2 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +partial_sequential_prediction: True +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml new file mode 100644 index 0000000..1d58b6d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml new file mode 100644 index 0000000..8602ebc --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git 
a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml new file mode 100644 index 0000000..f1f71e0 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml new file mode 100644 index 0000000..26c281b --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..26a65e6 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml new file mode 100644 index 0000000..44f38fa --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml new file mode 100644 index 0000000..4022e9f --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: 
MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml new file mode 100644 index 0000000..db7611a --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: RNN +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml new file mode 100644 index 0000000..fb276a4 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: SelfAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml new file mode 100644 index 0000000..f25a42c --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..5359592 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml 
b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml new file mode 100644 index 0000000..0103bea --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml new file mode 100644 index 0000000..161144e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml new file mode 100644 index 0000000..4db5550 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: RNN +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml new file mode 100644 index 0000000..71571a3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: SelfAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml new file mode 100644 index 0000000..2a3864e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SelfAttentionEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 
means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml new file mode 100644 index 0000000..123ba16 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml new file mode 100644 index 0000000..ace6af3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml new file mode 100644 index 0000000..412409d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml new file mode 100644 index 0000000..3510d68 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 6 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml new file mode 100644 index 0000000..4145a00 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml @@ -0,0 +1,19 @@ 
+encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml new file mode 100644 index 0000000..510a725 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml new file mode 100644 index 0000000..cd72a8c --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml @@ -0,0 +1,20 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True + \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml new file mode 100644 index 0000000..4d193a5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml @@ -0,0 +1,20 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: AverageEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True + \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml new file mode 100644 index 0000000..37bf321 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 
+main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: True diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml new file mode 100644 index 0000000..6ec2c3d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 2 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml new file mode 100644 index 0000000..9b7411e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 6 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml new file mode 100644 index 0000000..8c59625 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerCrossAttendDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml new file mode 100644 index 0000000..71e42f5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerFinetuningDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 20 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml new file mode 100644 
index 0000000..9aa36a5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerPrefixDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml new file mode 100644 index 0000000..8ffffdd --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerPretrainingDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 20 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml new file mode 100644 index 0000000..6da49a3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerCrossAttendDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml new file mode 100644 index 0000000..473839d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml new file mode 100644 index 0000000..7d98ce6 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 
+input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi5.yaml b/Amadeus/symbolic_yamls/nn_params/remi5.yaml new file mode 100644 index 0000000..dba0f34 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi5.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 5 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi7.yaml b/Amadeus/symbolic_yamls/nn_params/remi7.yaml new file mode 100644 index 0000000..b9b7768 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi7.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 7 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi8.yaml b/Amadeus/symbolic_yamls/nn_params/remi8.yaml new file mode 100644 index 0000000..20e2948 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi8.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 8 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml b/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml new file mode 100644 index 0000000..da50eb2 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 8 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/symbolic_sweep.yaml b/Amadeus/symbolic_yamls/symbolic_sweep.yaml new file mode 100644 index 0000000..166bd7d --- /dev/null +++ b/Amadeus/symbolic_yamls/symbolic_sweep.yaml @@ -0,0 +1,17 @@ +program: train.py +method: grid +metric: + name: valid.total + goal: minimize +parameters: + train_params.batch_size: + values: [8] + train_params.focal_gamma: + values: [0, 1] + nn_params.main_decoder.input_length: + values: [8192] + +command: + - python3 + - ${program} + - ${args_no_hyphens} \ No newline at end of file diff --git a/Amadeus/train_utils.py b/Amadeus/train_utils.py new file mode 100644 index 0000000..a98ce59 --- /dev/null +++ b/Amadeus/train_utils.py @@ -0,0 +1,428 @@ +import math + +from numpy import mask_indices +import torch +import torch.nn as nn +from torch.optim.lr_scheduler import _LRScheduler +from torch.optim import Optimizer +from collections import defaultdict +import torch.nn.functional as F + +def add_conti_for_single_feature(tensor): + new_target = tensor.clone() + # 
Assuming tensor shape is [batch, sequence, features] + # Create a shifted version of the tensor + shifted_tensor = torch.roll(new_target, shifts=1, dims=1) + # The first element of each sequence cannot be a duplicate by definition + shifted_tensor[:, 0] = new_target[:, 0] + 1 + + # Identify where the original and shifted tensors are the same (duplicates) + duplicates = new_target == shifted_tensor + # Replace duplicates with 9999 + new_target[duplicates] = 9999 + return new_target + +def adjust_prediction_order(encoding_scheme, num_features, target_feature, nn_params): + feature_prediction_order_dict = { + 4: ["type", "beat", "pitch", "duration"], + 5: ["type", "beat", "instrument", "pitch", "duration"], + 7: ["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"], + 8: ["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"] + } + + if encoding_scheme == 'remi': + prediction_order = feature_prediction_order_dict[num_features] + elif encoding_scheme == 'cp': + if nn_params.get("partial_sequential_prediction", False): + default_prediction_order = feature_prediction_order_dict[num_features] + prediction_order = [default_prediction_order[0], default_prediction_order[1:]] + else: + prediction_order = feature_prediction_order_dict[num_features] + elif encoding_scheme == 'nb': + assert target_feature in feature_prediction_order_dict[num_features], f"Target feature {target_feature} not in the selected sub-token set. Please check target feature in the config and num_features in nn_params." + default_prediction_order = feature_prediction_order_dict[num_features] + + # Reorganize the prediction order based on the target_feature + target_index = default_prediction_order.index(target_feature) + prediction_order = default_prediction_order[target_index:] + default_prediction_order[:target_index] + + return prediction_order + +########################### Loss function ################################ + +class NLLLoss4REMI(): + def __init__( + self, + focal_alpha:float, + focal_gamma:float, + ): + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + # clamp min value to 1e-7 to avoid log(0) + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss_seq = loss * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss_seq.sum() / mask.sum() # calculating mean loss considering mask + return loss, loss_seq + + def __call__(self, logits, shifted_tgt, mask, vocab): + if vocab is not None: + loss, loss_seq = self.get_nll_loss(logits, shifted_tgt, mask) + loss_by_class_normal = defaultdict(float) + shifted_tgt_with_mask = shifted_tgt * mask # [b, t] + answers_idx = shifted_tgt_with_mask.flatten(0,1) # [b*t] + for feature in vocab.feature_list: + feature_mask = vocab.total_mask[feature].to(answers_idx.device) # [327,] + mask_for_target = feature_mask[answers_idx] # [b*t] + normal_loss_seq_by_class = loss_seq * mask_for_target + if mask_for_target.sum().item() != 0: + loss_by_class_normal[feature+'_normal'] += (normal_loss_seq_by_class.sum().item() / mask_for_target.sum().item()) + return loss, loss_by_class_normal + else: + loss, loss_seq = self.get_nll_loss(logits, shifted_tgt, mask) + return loss, None + +class 
NLLLoss4CompoundToken(): + def __init__(self, feature_list, focal_alpha:float, focal_gamma:float): + self.feature_list = feature_list + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + # clamp min value to 1e-7 to avoid log(0) + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss = loss * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss.sum() / mask.sum() # calculating mean loss considering mask + return loss + + def get_nll_loss_for_logging(self, logits, target, mask, ignore_token, conti_token): + probs = logits.softmax(dim=-1) + + if ignore_token is not None and conti_token is not None: + target_conti = add_conti_for_single_feature(target) # [batch_size*seq_len] + valid_mask = (target_conti != ignore_token) & (target_conti != conti_token) # [batch_size*seq_len] + elif ignore_token is not None and conti_token is None: + valid_mask = (target != ignore_token) + elif ignore_token is None and conti_token is None: + valid_mask = torch.ones_like(target).bool() + valid_mask = valid_mask.flatten(0, 1) + + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + pt = probs[torch.arange(len(target)), target] # [batch_size*seq_len] + total_mask = mask.flatten(0, 1) & valid_mask # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss = loss * total_mask # [batch_size*seq_len] + loss = loss.sum() / total_mask.sum() # calculating mean loss considering mask + return loss + + def __call__(self, logits_dict, shifted_tgt, mask, valid): + train_loss_list = [] + log_loss_dict_normal = {} + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask) + train_loss_list.append(training_loss) + if valid: + if key == 'type': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=None, conti_token=None) + elif key == 'beat': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999) + else: + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=None) + k_normal = key + '_normal' + log_loss_dict_normal[k_normal] = log_normal_loss + total_loss = sum(train_loss_list) / len(train_loss_list) + if valid: + return total_loss, log_loss_dict_normal + else: + return total_loss, None + +def dispersive_loss(z, tau=0.5, eps=1e-8): + """Dispersive Loss implemented with cosine distance.""" + B = z.size(0) + + # cosine-similarity matrix [B, B] + z_norm = torch.nn.functional.normalize(z, p=2, dim=1) # L2-normalize each vector + sim_matrix = torch.matmul(z_norm, z_norm.transpose(0, 1)) # cosine similarity + + # convert to cosine distance (1 - similarity) and exclude the diagonal + mask = 1 - torch.eye(B, device=z.device) + cos_dist = (1 - sim_matrix) * mask + + # dispersion loss (same form as the L2 version) + exp_term = torch.exp(-cos_dist / tau) + mean_exp = exp_term.sum() / (B * 
(B - 1) + eps) + loss = -torch.log(mean_exp + eps) + return loss +class DiffusionLoss4CompoundToken(): + def __init__(self, feature_list, focal_alpha:float, focal_gamma:float): + self.feature_list = feature_list + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask,mask_indices, p_mask): + if logits.ndim == 3: + logits = logits.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + if mask_indices.ndim == 2: + mask_indices = mask_indices.flatten(0, 1) + if p_mask.ndim == 2: + p_mask = p_mask.flatten(0, 1) + if mask.ndim == 2: + mask = mask.flatten(0, 1) + # datatype of logits, target, mask_indices, p_mask should be the same + token_loss = F.cross_entropy( + logits[mask_indices], # 直接索引 logits + target[mask_indices], + reduction='none' + ) / p_mask[mask_indices] + loss = (token_loss * mask[mask_indices]).sum() / mask[mask_indices].sum() + return loss + + def get_nll_loss_for_logging(self, logits, target, mask, ignore_token, conti_token, mask_indices, p_mask): + if ignore_token is not None and conti_token is not None: + target_conti = add_conti_for_single_feature(target) # [batch_size*seq_len] + valid_mask = (target_conti != ignore_token) & (target_conti != conti_token) # [batch_size*seq_len] + elif ignore_token is not None and conti_token is None: + valid_mask = (target != ignore_token) + elif ignore_token is None and conti_token is None: + valid_mask = torch.ones_like(target).bool() + valid_mask = valid_mask.flatten(0, 1) + + if logits.ndim == 3: + logits = logits.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + if mask_indices.ndim == 2: + mask_indices = mask_indices.flatten(0, 1) + if p_mask.ndim == 2: + p_mask = p_mask.flatten(0, 1) + token_loss = F.cross_entropy( + logits[mask_indices], # 直接索引 logits + target[mask_indices], + reduction='none' + ) / p_mask[mask_indices] + total_mask = mask.flatten(0, 1) & valid_mask # [batch_size*seq_len] + loss = (token_loss * total_mask[mask_indices]).sum() / total_mask[mask_indices].sum() + + return loss + + def __call__(self, logits_dict, shifted_tgt, mask, mask_indices, p_mask, valid, input_dict=None,lambda_weight=0.5, tau=0.5): + train_loss_list = [] + log_loss_dict_normal = {} + mask_indices = mask_indices.reshape(shifted_tgt.shape[0], shifted_tgt.shape[1], -1) + p_mask = p_mask.reshape(shifted_tgt.shape[0], shifted_tgt.shape[1], -1) + disp_loss = None + if input_dict is not None: + hidden_vec =input_dict['hidden_vec'] #bs,seq_len,dim + feat = hidden_vec.mean(dim=1) #bs,dim + disp_loss = dispersive_loss(feat, tau=tau) # scalar + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask, mask_indices[..., idx], p_mask[..., idx]) + train_loss_list.append(training_loss) + if valid: + if key == 'type': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=None, conti_token=None, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + elif key == 'beat': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, 
conti_token=9999, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + else: + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=None, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + k_normal = key + '_normal' + log_loss_dict_normal[k_normal] = log_normal_loss + total_loss = sum(train_loss_list) / len(train_loss_list) + if disp_loss is not None: + total_loss = total_loss + lambda_weight * disp_loss + log_loss_dict_normal['dispersion'] = disp_loss.item() + if valid: + return total_loss, log_loss_dict_normal + else: + return total_loss, None + +class EncodecFlattenLoss(): + def __init__(self, feature_list): + self.feature_list = feature_list + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss_seq = -torch.log(pt) # [batch_size*seq_len] + loss_seq = loss_seq * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss_seq.sum() / mask.sum() # calculating mean loss considering mask + return loss + + def __call__(self, logits, shifted_tgt, mask): + loss = self.get_nll_loss(logits, shifted_tgt, mask) + return loss + +class EncodecMultiClassLoss(EncodecFlattenLoss): + def __init__(self, feature_list): + super().__init__(feature_list) + + def __call__(self, logits_dict, shifted_tgt, mask): + train_loss_list = [] + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask) + train_loss_list.append(training_loss) + total_loss = sum(train_loss_list) / len(train_loss_list) + return total_loss + +########################### Learning rate Scheduler ################################ +''' +This scheduler is from https://gaussian37.github.io/dl-pytorch-lr_scheduler/#custom-cosineannealingwarmrestarts-1 +It's basically a cosine annealing scheduler with warm restarts including two methods, warm up start and reducing maximum lr. 
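A rough usage sketch (the hyperparameter values below are illustrative only, not the project defaults, and train_one_step is a hypothetical helper standing in for the forward/backward pass):

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)  # base_lr is both the warm-up start and the value the cosine decays back to
    scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0=10000, T_mult=1, eta_max=3e-4, T_up=1000, gamma=0.5)
    for step in range(num_steps):
        train_one_step(batch)   # forward/backward + optimizer.step()
        scheduler.step()        # stepped once per iteration, as the trainers below do

With these settings the learning rate rises linearly from base_lr to eta_max over the first T_up steps of a cycle, follows a cosine curve back down to base_lr over the remaining T_0 - T_up steps, and at every restart the peak eta_max is scaled by gamma while the cycle length is stretched by roughly T_mult.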
+''' + +class CosineAnnealingWarmUpRestarts(_LRScheduler): + def __init__(self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1., last_epoch=-1, eta_min=0): + if T_0 <= 0 or not isinstance(T_0, int): + raise ValueError("Expected positive integer T_0, but got {}".format(T_0)) + if T_mult < 1 or not isinstance(T_mult, int): + raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult)) + if T_up < 0 or not isinstance(T_up, int): + raise ValueError("Expected positive integer T_up, but got {}".format(T_up)) + self.T_0 = T_0 + self.T_mult = T_mult + self.base_eta_max = eta_max + self.eta_max = eta_max + self.T_up = T_up + self.T_i = T_0 + self.gamma = gamma + self.cycle = 0 + self.T_cur = last_epoch + super(CosineAnnealingWarmUpRestarts, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.T_cur == -1: + return self.base_lrs + elif self.T_cur < self.T_up: + return [(self.eta_max - base_lr)*self.T_cur / self.T_up + base_lr for base_lr in self.base_lrs] + else: + return [base_lr + (self.eta_max - base_lr) * (1 + math.cos(math.pi * (self.T_cur-self.T_up) / (self.T_i - self.T_up))) / 2 + for base_lr in self.base_lrs] + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + self.T_cur = self.T_cur + 1 + if self.T_cur >= self.T_i: + self.cycle += 1 + self.T_cur = self.T_cur - self.T_i + self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up + else: + if epoch >= self.T_0: + if self.T_mult == 1: + self.T_cur = epoch % self.T_0 + self.cycle = epoch // self.T_0 + else: + n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult)) + self.cycle = n + self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1) + self.T_i = self.T_0 * self.T_mult ** (n) + else: + self.T_i = self.T_0 + self.T_cur = epoch + + self.eta_max = self.base_eta_max * (self.gamma**self.cycle) + self.last_epoch = math.floor(epoch) + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr + +class CosineLRScheduler(_LRScheduler): + """Cosine LR scheduler. + Args: + optimizer (Optimizer): Torch optimizer. + warmup_steps (int): Number of warmup steps. + total_steps (int): Total number of steps. + lr_min_ratio (float): Minimum learning rate. + cycle_length (float): Cycle length. + """ + def __init__(self, optimizer: Optimizer, total_steps: int, warmup_steps: int, + lr_min_ratio: float = 0.0, cycle_length: float = 1.0): + self.warmup_steps = warmup_steps + assert self.warmup_steps >= 0 + self.total_steps = total_steps + assert self.total_steps >= 0 + self.lr_min_ratio = lr_min_ratio + self.cycle_length = cycle_length + super().__init__(optimizer) + + def _get_sched_lr(self, lr: float, step: int): + if step < self.warmup_steps: + lr_ratio = step / self.warmup_steps + lr = lr_ratio * lr + elif step <= self.total_steps: + s = (step - self.warmup_steps) / (self.total_steps - self.warmup_steps) + lr_ratio = self.lr_min_ratio + 0.5 * (1 - self.lr_min_ratio) * \ + (1. 
+ math.cos(math.pi * s / self.cycle_length)) + lr = lr_ratio * lr + else: + lr_ratio = self.lr_min_ratio + lr = lr_ratio * lr + return lr + + def get_lr(self): + return [self._get_sched_lr(lr, self.last_epoch) for lr in self.base_lrs] + + +class DispersiveLoss(nn.Module): + def __init__(self, loss_type='infonce_l2', tau=0.5, lambda_weight=0.5): + super().__init__() + self.loss_type = loss_type + self.tau = tau + self.lambda_weight = lambda_weight + + def forward(self, features, diffusion_loss): + """ + features: 批次特征矩阵,形状为 [batch_size, feature_dim] + diffusion_loss: 原扩散损失 + """ + batch_size = features.size(0) + + # 计算距离矩阵 + if self.loss_type == 'infonce_l2': + # 计算平方L2距离 + dist_matrix = torch.cdist(features, features, p=2) ** 2 + # 计算分散损失 + exp_dist = torch.exp(-dist_matrix / self.tau) + disp_loss = torch.log(exp_dist.mean()) + elif self.loss_type == 'hinge': + # Hinge损失,假设阈值epsilon=1.0 + dist_matrix = torch.cdist(features, features, p=2) + disp_loss = torch.max(torch.zeros_like(dist_matrix), 1.0 - dist_matrix).mean() + elif self.loss_type == 'covariance': + # 协方差损失 + normalized_features = (features - features.mean(dim=0)) / features.std(dim=0) + cov_matrix = torch.matmul(normalized_features.T, normalized_features) / batch_size + # 非对角线元素平方和 + mask = ~torch.eye(cov_matrix.size(0), dtype=torch.bool) + disp_loss = (cov_matrix[mask] ** 2).mean() + else: + raise ValueError("Unsupported loss type") + + # 总损失 = 扩散损失 + lambda * 分散损失 + total_loss = diffusion_loss + self.lambda_weight * disp_loss + return total_loss, disp_loss \ No newline at end of file diff --git a/Amadeus/trainer_accelerate.py b/Amadeus/trainer_accelerate.py new file mode 100644 index 0000000..3b7ccc2 --- /dev/null +++ b/Amadeus/trainer_accelerate.py @@ -0,0 +1,1012 @@ +from calendar import EPOCH, c +from multiprocessing import context +import time +import pickle +import os +from pathlib import Path +from typing import Union +from datetime import datetime +from omegaconf import OmegaConf +import random +import itertools + + +import torch +import torchaudio +from torch.utils.data import DataLoader +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler, Sampler + +# import accelerate +from accelerate import Accelerator, DistributedDataParallelKwargs +from accelerate.utils import set_seed +#====================================================================== + + +import wandb +from collections import defaultdict +from tqdm.auto import tqdm + +from .model_zoo import AmadeusModel +from .symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from .symbolic_encoding.data_utils import TuneCompiler +from .symbolic_encoding.decoding_utils import MidiDecoder4REMI +from .evaluation_utils import add_conti_in_valid +from .train_utils import NLLLoss4REMI + +from data_representation.vocab_utils import LangTokenVocab +class InfiniteSampler(Sampler): + def __init__(self, data_source, shuffle=True): + self.data_source = data_source + self.shuffle = shuffle + self.indices = list(range(len(data_source))) + if self.shuffle: + random.shuffle(self.indices) + self.infinite_iterator = itertools.cycle(self.indices) + + def __iter__(self): + return self.infinite_iterator + + def __len__(self): + return None # 表示无限长度 + +class LanguageModelTrainer: + def __init__( + self, + model: AmadeusModel, # The language model for music generation + optimizer: torch.optim.Optimizer, # Optimizer for updating model weights + scheduler: torch.optim.lr_scheduler._LRScheduler, # Learning 
rate scheduler + loss_fn: NLLLoss4REMI, # Loss function to compute the error + midi_decoder: MidiDecoder4REMI, # Decoder to convert model output into MIDI format + train_set: TuneCompiler, # Training dataset + valid_set: TuneCompiler, # Validation dataset + save_dir: str, # Directory to save models and logs + vocab: LangTokenVocab, # Vocabulary for tokenizing sequences + use_ddp: bool, # Whether to use Distributed Data Parallel (DDP) + use_fp16: bool, # Whether to use mixed-precision training (FP16) + world_size: int, # Total number of devices for distributed training + batch_size: int, # Batch size for training + infer_target_len: int, # Target length for inference generation + gpu_id: int, # GPU device ID for computation + sampling_method: str, # Sampling method for sequence generation + sampling_threshold: float, # Threshold for sampling decisions + sampling_temperature: float, # Temperature for controlling sampling randomness + config, # Configuration parameters (contains general, training, and inference settings) + model_checkpoint: Union[str, None] = None, # Path to a pre-trainmodl checkpoint (optional) + ): + # Save model, optimizer, and other configurations + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.loss_fn = loss_fn + + self.valid_set = valid_set + self.vocab = vocab + self.use_ddp = use_ddp + self.world_size = world_size + self.batch_size = batch_size + self.gpu_id = gpu_id + self.sampling_method = sampling_method + self.sampling_threshold = sampling_threshold + self.sampling_temperature = sampling_temperature + self.config = config + self.last_iter = 0 + + # Load pre-trained model if provided + if model_checkpoint: + # parse the model checkpoint iter + if isinstance(model_checkpoint, str): + if model_checkpoint.endswith('.pt'): + self.last_iter = int(model_checkpoint.split('/')[-1].split('_')[0][4:]) + checkpoint = torch.load(model_checkpoint, map_location='cpu') + # print state dict keys + print("Loading model checkpoint from", model_checkpoint) + print("Checkpoint keys:", checkpoint['model'].keys()) + if isinstance(self.model, DDP): + self.model.module.load_state_dict(checkpoint['model'], strict=False) + else: + + self.model.load_state_dict(checkpoint['model'], strict=False) + # Training hyperparameters from config + self.grad_clip = config.train_params.grad_clip + self.num_cycles_for_inference = config.train_params.num_cycles_for_inference + self.num_cycles_for_model_checkpoint = config.train_params.num_cycles_for_model_checkpoint + self.iterations_per_training_cycle = config.train_params.iterations_per_training_cycle + self.iterations_per_validation_cycle = config.train_params.iterations_per_validation_cycle + self.make_log = config.general.make_log + self.num_uncond_generation = config.inference_params.num_uncond_generation + self.num_cond_generation = config.inference_params.num_cond_generation + self.num_max_seq_len = infer_target_len + self.infer_and_log = config.general.infer_and_log + self.valid_loader = self.generate_data_loader(self.valid_set, shuffle=False, drop_last=True) + + # gradient accumulation + self.gradient_accumulation_steps = config.train_params.gradient_accumulation_steps + # Set up mixed-precision training (FP16) if enabled + if use_fp16: + self.use_fp16 = True + else: + self.use_fp16 = False + # Set up Distributed Data Parallel (DDP) if required + if use_ddp: + # prepare using accelerator + if self.use_fp16: + self.accelerator = Accelerator(mixed_precision='bf16', + step_scheduler_with_optimizer=False, + 
gradient_accumulation_steps=self.gradient_accumulation_steps, + kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)]) + else: + self.accelerator = Accelerator( + gradient_accumulation_steps=self.gradient_accumulation_steps, + step_scheduler_with_optimizer=False, + kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)]) + + with self.accelerator.main_process_first(): + self.train_set = train_set + self.train_loader = self.generate_data_loader(self.train_set, shuffle=False, drop_last=False) + self.accelerator.wait_for_everyone() + self.accelerator.print(f"Using {self.world_size} GPUs for training") + + self.model, self.optimizer, self.scheduler, self.train_loader = self.accelerator.prepare( + self.model, self.optimizer, self.scheduler, self.train_loader + ) + self.accelerator.wait_for_everyone() + # self.accelerator.init_trackers("nested_music_transformer", config) + set_seed(42) + self.device = self.accelerator.device + self.model.to(self.device) + # set up for logging + if self.accelerator.is_main_process: + save_dir = self.setup_log(config) + print("savwe",save_dir) + # Create directory for saving models and logs + self.save_dir = Path(save_dir) + self.save_dir.mkdir(exist_ok=True, parents=True) + self.set_save_out() + else: + self.train_set = train_set + # Create data loaders for training and validation sets + self.train_loader = self.generate_data_loader(train_set, shuffle=False, drop_last=True) + self.valid_loader = self.generate_data_loader(valid_set, shuffle=True, drop_last=True) + save_dir = self.setup_log(config) + # Create directory for saving models and logs + self.save_dir = Path(save_dir) + self.save_dir.mkdir(exist_ok=True, parents=True) + self.set_save_out() + + self.device = config.train_params.device + self.model.to(self.device) + + + # Initialize tracking metrics + self.best_valid_accuracy = 0 + self.best_valid_loss = 100 + self.training_loss = [] + self.validation_loss = [] + self.validation_acc = [] + + self.midi_decoder = midi_decoder + + + def generate_experiment_name(self, config): + # add base hyperparameters to the experiment name + dataset_name = config.dataset + encoding_name = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + input_embedder_name = config.nn_params.input_embedder_name + sub_decoder_name = config.nn_params.sub_decoder_name + batch_size = config.train_params.batch_size + num_layers = config.nn_params.main_decoder.num_layer + input_length = config.train_params.input_length + first_pred_feature = config.data_params.first_pred_feature + + # Add target hyperparameters to the experiment name + # dropout + main_dropout = config.nn_params.model_dropout + # learning rate + lr_decay_rate = config.train_params.decay_step_rate + + time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # Combine the information into a single string for the experiment name + # experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{input_embedder_name}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}\ + # _dropout{main_dropout}_lrdecay{lr_decay_rate}" + experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}" + return experiment_name + + def collate_fn(self, batch): + """ + Custom collate function to handle variable-length sequences in a batch. 
+ It pads sequences to the maximum length in the batch and returns a tuple of padded sequences and their lengths. + """ + # Unzip the batch into segments, masks, captions, and encoded captions + segments, masks, captions, encoded_captions = zip(*batch) + # print("collate_fn",len(segments),len(masks),len(captions),len(encoded_captions)) + # # Pad the segments and masks to the maximum length in the batch + # padded_segments = torch.nn.utils.rnn.pad_sequence(segments, batch_first=True) + # padded_masks = torch.nn.utils.rnn.pad_sequence(masks, batch_first=True) + # # Return padded segments and masks along with captions and encoded captions + segments = torch.stack(segments, dim=0) + masks = torch.stack(masks, dim=0) + print(captions) + print(encoded_captions) + # captions = torch.stack(captions, dim=0) + # encoded_captions = torch.stack(encoded_captions, dim=0) + return segments, masks, captions, encoded_captions + # return padded_segments, padded_masks, captions, encoded_captions + def setup_log(self, config): + if self.accelerator.is_main_process: + if config.general.make_log: + experiment_name =self.generate_experiment_name(config) + wandb.init( + project="Acce_Music_Transformer", + name=experiment_name, + config=OmegaConf.to_container(config) + ) + # 保存配置到 WANDB 根目录 + config_path = Path(wandb.run.dir) / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + save_dir = Path(wandb.run.dir) / "checkpoints" + save_dir.mkdir(exist_ok=True, parents=True) + else: + now = datetime.now() + save_dir = Path('wandb/debug/checkpoints') / now.strftime('%y-%m-%d') + save_dir.mkdir(exist_ok=True, parents=True) + # 保存配置到调试目录 + config_path = save_dir / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + return str(save_dir) + + # Set up the output directories for saving MIDI results during inference + def set_save_out(self): + if self.accelerator.is_main_process: + # copy from latest folder in wandb/debug/checkpoints + target_folder = 'wandb/debug/checkpoints' + latest_folder = sorted(Path(target_folder).iterdir(), key=os.path.getmtime)[-1] + # get files in the latest folder + files = [f for f in latest_folder.iterdir() if f.is_file()] + # copy files to the save_dir + for file in files: + # copy the file to the save_dir + target_file = self.save_dir / file.name + if not target_file.exists(): + os.system(f'cp {file} {target_file}') + if self.infer_and_log: + self.valid_out_dir = self.save_dir / 'valid_out' + os.makedirs(self.valid_out_dir, exist_ok=True) + + # Save the current model and optimizer state + def save_model(self, path): + if isinstance(self.model, DDP): + torch.save({'model': self.model.module.state_dict(), 'optim': self.optimizer.state_dict()}, path) + else: + torch.save({'model': self.model.state_dict(), 'optim': self.optimizer.state_dict()}, path) + + # Generate the data loader for either training or validation datasets + def generate_data_loader(self, dataset, shuffle=False, drop_last=False) -> DataLoader: + return DataLoader(dataset, shuffle=shuffle, batch_size=self.batch_size, drop_last=drop_last,collate_fn=None, pin_memory=True,num_workers=4, persistent_workers=True, prefetch_factor=2, worker_init_fn=None) + + # Training function based on a given number of iterations + def accelerate_train_by_num_iter(self, num_iters): + # generator = iter(self.train_loader) + pbar = tqdm(total=num_iters, desc='Training', unit='iteration', leave=False) + completed_steps = self.last_iter + # save init model + while completed_steps < num_iters: + total_loss = 0 + current_loss = 0 + for i, 
batch in enumerate(self.train_loader): + # gradient accumulation + + with self.accelerator.accumulate(self.model): + + # Start time for the training step,only for main process + start_time = time.time() + + # Tra\in the model on a single batch + # loss_value, loss_dict = self._accelerate_train_by_single_batch(batch) + loss, _, loss_dict = self._get_loss_pred_from_single_batch(batch) + total_loss += loss.detach().float() + current_loss = loss.detach().float() + # loss.backward() + self.accelerator.backward(loss) + if self.accelerator.sync_gradients: + self.accelerator.unscale_gradients(self.optimizer) + if self.accelerator.sync_gradients: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + # self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_clip) + if not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and self.scheduler is not None: + self.scheduler.step() + + self.optimizer.step() + self.optimizer.zero_grad() + + + if self.accelerator.sync_gradients: + # update progress bar + loss_value = loss.item() + # log in main process + completed_steps += 1 + + # if self.accelerator.is_main_process: + loss_dict['time'] = time.time() - start_time + loss_dict['lr'] = self.optimizer.param_groups[0]['lr'] + loss_dict = self._rename_dict(loss_dict, 'train') + self.training_loss.append(loss_value) + if self.accelerator.is_main_process: + pbar.update(1) + pbar.set_postfix(loss=loss_value, lr=self.optimizer.param_groups[0]['lr']) + # save iter1 checkpoint + if completed_steps == 1 and self.accelerator.is_main_process: + self.save_model(self.save_dir / f'iter{completed_steps}_loss{current_loss:.4f}.pt') + + # Log training loss at the specified training cycle + if (completed_steps + 1) % self.iterations_per_training_cycle == 0 and self.make_log and self.accelerator.is_main_process: + wandb.log(loss_dict, step=completed_steps) + + # Log training accuracy periodically + if (completed_steps + 1) % (self.iterations_per_training_cycle * 3) == 0 and self.make_log: + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch, train=True) + train_metric_dict = self._get_train_accuracy(num_nonmask_tokens, num_tokens_by_feature, correct_guess_by_feature) + train_metric_dict.update(loss_dict) + train_metric_dict = self._rename_dict(train_metric_dict, 'train') + if self.accelerator.is_main_process: + wandb.log(train_metric_dict, step=completed_steps) + # delete variables to avoid memory leakages + del validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature, train_metric_dict + + # Perform validation at the specified interval + if (completed_steps + 1) % self.iterations_per_validation_cycle == 0: + self.model.eval() + validation_loss, validation_acc, validation_metric_dict = self.validate() + validation_metric_dict['acc'] = validation_acc + validation_metric_dict = self._rename_dict(validation_metric_dict, 'valid') + if self.make_log and self.accelerator.is_main_process: + wandb.log(validation_metric_dict, step=completed_steps) + self.validation_loss.append(validation_loss) + self.validation_acc.append(validation_acc) + self.best_valid_loss = min(validation_loss, self.best_valid_loss) + + # Perform inference and logging after a certain number of cycles + if (completed_steps + 1) % (self.num_cycles_for_inference * self.iterations_per_validation_cycle) == 0 and self.infer_and_log and self.accelerator.is_main_process: + 
self.inference_and_log(i, self.num_uncond_generation, self.num_cond_generation, self.num_max_seq_len) + + # Save a model checkpoint periodically + if (completed_steps + 1) % (self.iterations_per_validation_cycle * self.num_cycles_for_model_checkpoint) == 0 and self.accelerator.is_main_process: + self.accelerator.print(f"Saving model checkpoint at iter {completed_steps}") + self.save_model(self.save_dir / f'iter{completed_steps}_loss{validation_loss:.4f}.pt') + self.model.train() + + # delete variables to avoid memory leakages + del validation_acc, validation_metric_dict + # else: + # self.accelerator.wait_for_everyone() + # Save the model checkpoint at the end of each epoch + if self.accelerator.is_main_process: + print(f"Saving model checkpoint at iter {completed_steps}") + # Save the model state + self.save_model(self.save_dir / f"iter{completed_steps}_loss{current_loss:.4f}.pt") + # Save the final model after training + self.accelerator.wait_for_everyone() + if self.accelerator.is_main_process: + print("saving last checkpoint") + self.save_model(self.save_dir / f'checkpoint_last.pt') + + # same as above but for accelerate + def _accelarate_get_loss_pred_from_single_batch(self, batch): + """ + Computes the loss and predictions for a single batch of data. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + + Returns: + loss: The computed loss for the batch. + logits: The raw model predictions (logits). + loss_dict: A dictionary containing the total loss. + + The method: + - Separates the input sequences and target sequences from the batch. + - Moves the data to the appropriate device. + - Applies mixed precision (FP16) if applicable. + - Computes the logits using the model and calculates the loss using the specified loss function. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with torch.cuda.amp.autocast(): + logits = self.model(input_seq, target) + loss = self.loss_fn(logits, target, mask) + else: + logits = self.model(input_seq, None) + loss = self.loss_fn(logits, target, mask) + loss_dict = {'total': loss.item()} + return loss, logits, loss_dict + + + + def _train_by_single_batch(self, batch): + """ + Trains the model on a single batch of data. + + Args: + batch: A batch of data, typically consisting of input sequences and corresponding targets. + + Returns: + loss.item(): The total loss for this batch. + loss_dict: A dictionary containing information about the loss and other relevant metrics. + + The method: + - Calls `_get_loss_pred_from_single_batch` to compute the loss and predictions. + - Resets the optimizer's gradients. + - Depending on whether mixed precision (FP16) is used, it scales the loss and applies gradient clipping before stepping the optimizer. + - Updates the learning rate scheduler if applicable. + - Records the time taken for the training step and the current learning rate in the `loss_dict`. 
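        Note: the FP16 branch assumes a torch.cuda.amp.GradScaler is available as `self.scaler`; it is not
        created in the `__init__` shown above, which relies on Accelerate for mixed precision instead.
        The pattern used is the standard one:

            self.scaler.scale(loss).backward()
            self.scaler.unscale_(self.optimizer)   # so clip_grad_norm_ sees unscaled gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            self.scaler.step(self.optimizer)
            self.scaler.update()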
+ """ + start_time = time.time() + loss, _, loss_dict = self._get_loss_pred_from_single_batch(batch) + self.optimizer.zero_grad() + if self.use_fp16: + self.scaler.scale(loss).backward() + self.scaler.unscale_(self.optimizer) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + self.optimizer.step() + if not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and self.scheduler is not None: + self.scheduler.step() + loss_dict['time'] = time.time() - start_time + loss_dict['lr'] = self.optimizer.param_groups[0]['lr'] + return loss.item(), loss_dict + + def _get_loss_pred_from_single_batch(self, batch): + """ + Computes the loss and predictions for a single batch of data. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + + Returns: + loss: The computed loss for the batch. + logits: The raw model predictions (logits). + loss_dict: A dictionary containing the total loss. + + The method: + - Separates the input sequences and target sequences from the batch. + - Moves the data to the appropriate device. + - Applies mixed precision (FP16) if applicable. + - Computes the logits using the model and calculates the loss using the specified loss function. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with self.accelerator.autocast(): + logits = self.model(input_seq, target) + loss = self.loss_fn(logits, target, mask) + else: + logits = self.model(input_seq, None) + loss = self.loss_fn(logits, target, mask) + loss_dict = {'total': loss.item()} + return loss, logits, loss_dict + + def _get_valid_loss_and_acc_from_batch(self, batch, train=False): + """ + Computes validation loss and accuracy from a single batch. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + train (bool): Indicator whether the function is being used in training mode. + + Returns: + validation_loss: Total validation loss for the batch. + num_tokens: The number of valid tokens in the batch. + loss_dict: A dictionary containing the loss and relevant metrics. + None: Placeholder for future implementation. + num_correct_guess: Number of correctly predicted tokens. + + The method: + - Calls `_get_loss_pred_from_single_batch` to compute the loss and predictions. + - Computes token-level accuracy by comparing predicted tokens with the targets. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + loss, logits, loss_dict = self._get_loss_pred_from_single_batch(batch) + prob = torch.softmax(logits, dim=-1) + num_tokens = torch.sum(mask) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + + selected_tokens = torch.argmax(prob, dim=-1) * mask + shifted_tgt_with_mask = target * mask + num_correct_guess = torch.sum(selected_tokens == shifted_tgt_with_mask) - torch.sum(mask == 0) + + validation_loss = loss.item() * num_tokens + num_correct_guess = num_correct_guess.item() + return validation_loss, num_tokens, loss_dict, None, num_correct_guess + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess): + """ + Computes training accuracy. 
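        In this base trainer it is simply num_correct_guess / num_tokens; the REMI and
        compound-token subclasses override it to also report per-feature accuracies.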
+ + Args: + num_tokens: Total number of tokens processed. + num_tokens_by_feature: Number of tokens for each feature (not used here). + num_correct_guess: Number of correctly predicted tokens. + + Returns: + Training accuracy, computed as the ratio of correct predictions to the total number of tokens. + """ + return num_correct_guess / num_tokens + + def validate(self, external_loader=None): + """ + Validates the model on a dataset. + + Args: + external_loader (DataLoader): If provided, an external DataLoader can be used for validation. + + Returns: + total_validation_loss: Average validation loss over all batches. + total_num_correct_guess: Total number of correct predictions divided by the number of tokens (accuracy). + validation_metric_dict: Dictionary of validation metrics averaged over all batches. + + The method: + - Iterates through the validation data loader, calculating the loss and accuracy for each batch. + - Aggregates the results over all batches and returns the overall validation metrics. + """ + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_correct_guess = 0 + total_num_tokens = 0 + validation_metric_dict = defaultdict(float) + with torch.inference_mode(): + for batch in tqdm(loader, leave=False): + validation_loss, num_tokens, loss_dict, _, num_correct_guess = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_tokens + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + validation_metric_dict[key] += value * num_tokens + for key in validation_metric_dict.keys(): + validation_metric_dict[key] /= total_num_tokens + + return total_validation_loss / total_num_tokens, total_num_correct_guess / total_num_tokens, validation_metric_dict + + def _make_midi_from_generated_output(self, generated_output, iter, seed, condition=None): + """ + Generates a MIDI file and logs output from the generated sequence. + + Args: + generated_output: The sequence of notes generated by the model. + iter: The current iteration of the training process. + seed: The seed used for generating the sequence. + condition: Optional condition input for generating conditional output. + + The method: + - Converts the generated output into a MIDI file and logs it. + - Optionally logs additional error metrics and figures for analysis. 
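        Outputs are written under `self.valid_out_dir`, with a "cond_" prefix when a condition is given:

            {prefix}generated_output_{iter}_seed_{seed}.pkl  (raw generated token sequence)
            {prefix}midi_decoded_{iter}_seed_{seed}.mid      (decoded MIDI file)

        The wandb log then references a .png (score image) and .mp3 (rendered audio) with the same stem.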
+ """ + if condition is not None: + path_addition = "cond_" + else: + path_addition = "" + with open(self.valid_out_dir / f"{path_addition}generated_output_{iter}_seed_{seed}.pkl", 'wb') as f: + pickle.dump(generated_output, f) + self.midi_decoder(generated_output, self.valid_out_dir / f"{path_addition}midi_decoded_{iter}_seed_{seed}.mid") + if self.make_log: + log_dict = {} + log_dict[f'{path_addition}gen_score'] = wandb.Image(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.png')) + log_dict[f'{path_addition}gen_audio'] = wandb.Audio(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.mp3')) + wandb.log(log_dict, step=(iter+seed)) + print(f"{path_addition}inference is logged: Iter {iter} / seed {seed}") + return generated_output + + @torch.inference_mode() + def inference_and_log(self, iter, num_uncond_generation=5, num_cond_generation=5, max_seq_len=10000): + """ + Generates and logs both unconditional and conditional output sequences. + + Args: + iter: The current iteration. + num_uncond_generation: Number of unconditional sequences to generate. + num_cond_generation: Number of conditional sequences to generate. + max_seq_len: Maximum sequence length to generate. + + The method: + - Generates unconditional and conditional sequences using the model's generation function. + - Converts the sequences into MIDI files and logs the generated results. + """ + self.model.eval() + for i in range(num_uncond_generation): + try: + start_time = time.time() + uncond_generated_output = self.model.module.generate(manual_seed=i, max_seq_len=max_seq_len, condition=None, \ + sampling_method=self.sampling_method, threshold=self.sampling_threshold, temperature=self.sampling_temperature) + if len(uncond_generated_output) == 0: continue + print(f"unconditional generation time_{iter}: {time.time() - start_time:.4f}") + print(f"unconditional length of generated_output: {uncond_generated_output.shape[1]}") + self._make_midi_from_generated_output(uncond_generated_output, iter, i, None) + except Exception as e: + print(e) + condition_list = [x[1] for x in self.valid_set.data_list[:num_cond_generation] ] + for i in range(num_cond_generation): + condition = self.valid_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + try: + start_time = time.time() + generated_output = self.model.module.generate(manual_seed=i, max_seq_len=max_seq_len, condition=condition, \ + sampling_method=self.sampling_method, threshold=self.sampling_threshold, temperature=self.sampling_temperature) + if len(generated_output) == 0: continue + print(f"conditional generation time_{iter}: {time.time() - start_time:.4f}") + print(f"conditional length of generated_output: {generated_output.shape[1]}") + self._make_midi_from_generated_output(generated_output, iter+num_uncond_generation, i, condition) + except Exception as e: + print(e) + + def _rename_dict(self, adict, prefix='train'): + ''' + Renames the keys in a dictionary by adding a prefix. 
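        For example, with prefix='train', {'total': 0.91, 'time': 0.37} becomes {'train.total': 0.91, 'train.time': 0.37}.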
+ ''' + keys = list(adict.keys()) + for key in keys: + adict[f'{prefix}.{key}'] = adict.pop(key) + return dict(adict) + +class LanguageModelTrainer4REMI(LanguageModelTrainer): + def __init__(self, model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config): + super().__init__(model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config) + + def _get_loss_pred_from_single_batch(self, batch, valid=False): + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with self.accelerator.autocast(): + logits = self.model(input_seq, target) + if not valid: + total_loss, loss_dict = self.loss_fn(logits, target, mask, None) + return total_loss, logits, {'total':total_loss.item()} + else: + total_loss, loss_dict = self.loss_fn(logits, target, mask, self.vocab) + loss_dict['total'] = total_loss.item() + return total_loss, logits, loss_dict + else: + logits = self.model(input_seq, target) + if not valid: + total_loss, loss_dict = self.loss_fn(logits, target, mask, None) + return total_loss, logits, {'total':total_loss.item()} + else: + total_loss, loss_dict = self.loss_fn(logits, target, mask, self.vocab) + loss_dict['total'] = total_loss.item() + return total_loss, logits, loss_dict + + def _get_valid_loss_and_acc_from_batch(self, batch, train=False): + segment, mask, caption,encoded_caption = batch + mask = mask[:, :-1] + _, target = segment[:, :-1], segment[:, 1:] + loss, logits, loss_dict = self._get_loss_pred_from_single_batch(batch, valid=True) + prob = torch.softmax(logits, dim=-1) + num_nonmask_tokens = torch.sum(mask) # [b, t] + target = target.to(self.device) # [b, t] + mask = mask.to(self.device) + + prob_with_mask = torch.argmax(prob, dim=-1) * mask # [b, t] + shifted_tgt_with_mask = target * mask # [b, t] + + correct_guess_by_feature = defaultdict(int) + num_tokens_by_feature = defaultdict(int) + tokens_idx = prob_with_mask.flatten(0,1) # [b*t] + answers_idx = shifted_tgt_with_mask.flatten(0,1) # [b*t] + if self.vocab.encoding_scheme == 'remi': + eos_idx = 2 + for feature in self.vocab.feature_list: + feature_mask = self.vocab.total_mask[feature].to(self.device) # [327,] + mask_for_target = feature_mask[answers_idx] # [b*t] + if feature == 'type': # because Bar token is 0, we need to add 1 to calculate accuracy + valid_pred = (tokens_idx+1) * mask_for_target + valid_answers = (answers_idx+1) * mask_for_target + eos_mask = valid_answers != eos_idx # because EOS is also working as a padding + correct_guess_by_feature[feature] += torch.sum(valid_pred[eos_mask] == valid_answers[eos_mask]).item() - torch.sum(mask_for_target[eos_mask] == 0).item() + num_tokens_by_feature[feature] += torch.sum(mask_for_target[eos_mask]).item() + else: + valid_pred = tokens_idx * mask_for_target # [b, t] + valid_answers = answers_idx * mask_for_target # [b, t] + correct_guess_by_feature[feature] += torch.sum(valid_pred == valid_answers).item() - torch.sum(mask_for_target == 0).item() + num_tokens_by_feature[feature] += torch.sum(mask_for_target).item() + validation_loss = loss.item() * num_nonmask_tokens.item() 
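        # Note: the batch loss is rescaled by the number of non-masked tokens here so that
        # validate() can simply sum it over batches and divide by the total token count,
        # i.e. the reported validation loss is token-weighted rather than batch-weighted.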
+ return validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess_by_feature): + total_num_correct_guess = 0 + total_num_tokens = 0 + acc_dict = {} + for feature, num_correct_guess in num_correct_guess_by_feature.items(): + if feature == 'type': + continue + total_num_correct_guess += num_correct_guess + total_num_tokens += num_tokens_by_feature[feature] + if num_tokens_by_feature[feature] == 0: + continue + acc_dict[f"{feature}_acc"] = num_correct_guess / num_tokens_by_feature[feature] + total_accuracy = total_num_correct_guess / total_num_tokens + acc_dict['total_acc'] = total_accuracy + return acc_dict + + def validate(self, external_loader=None): + ''' + total_num_tokens: for calculating loss, nonmask tokens + total_num_valid_tokens: for calculating accuracy, valid tokens + ''' + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_tokens = 0 + total_num_valid_tokens = 0 + total_num_correct_guess = 0 + validation_metric_dict = defaultdict(float) + total_num_tokens_by_feature = defaultdict(int) + total_num_correct_guess_dict = defaultdict(int) + with torch.inference_mode(): + for num_iter, batch in enumerate(tqdm(loader, leave=False)): + if num_iter == len(self.valid_loader): + if loader is not self.valid_loader: # when validate with train_loader + break + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, num_correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_nonmask_tokens.item() + for key, num_tokens in num_tokens_by_feature.items(): + total_num_tokens_by_feature[key] += num_tokens + if key == 'type': + continue + total_num_valid_tokens += num_tokens # num tokens are all the same for each musical type, torch.sum(mask) + for key, num_correct_guess in num_correct_guess_by_feature.items(): + total_num_correct_guess_dict[key] += num_correct_guess + if key == 'type': + continue + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + if key == 'total': + validation_metric_dict[key] += value * num_nonmask_tokens + else: + feature_name = key.split('_')[0] + validation_metric_dict[key] += value * num_tokens_by_feature[feature_name] + + for key in validation_metric_dict.keys(): + if key == 'total': + validation_metric_dict[key] /= total_num_tokens + else: + feature_name = key.split('_')[0] + if total_num_tokens_by_feature[feature_name] == 0: + continue + validation_metric_dict[key] /= total_num_tokens_by_feature[feature_name] + + for key in total_num_tokens_by_feature.keys(): + num_tokens = total_num_tokens_by_feature[key] + num_correct = total_num_correct_guess_dict[key] + if num_tokens == 0: + continue + validation_metric_dict[f'{key}_acc'] = num_correct / num_tokens + return total_validation_loss / total_num_tokens, total_num_correct_guess / total_num_valid_tokens, validation_metric_dict + +class LanguageModelTrainer4CompoundToken(LanguageModelTrainer): + def __init__(self, model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config): + super().__init__(model, 
optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config) + + ''' + About ignore_token and conti_token: + During validation, tokens with this "conti" value are ignored when calculating accuracy or other metrics, + ensuring that repeated values don't unfairly skew the results. + This is especially relevant for features like beat, chord, tempo, and instrument where repeated tokens may have a specific musical meaning. + + We used ignore_token and conti_token to fairly compare compound token based encoding with REMI encoding. + ''' + + def _get_num_valid_and_correct_tokens(self, prob, ground_truth, mask, ignore_token=None, conti_token=None): + valid_prob = torch.argmax(prob, dim=-1) * mask + valid_ground_truth = ground_truth * mask + + if ignore_token is None and conti_token is None: + num_valid_tokens = torch.sum(mask) + num_correct_tokens = torch.sum(valid_prob == valid_ground_truth) - torch.sum(mask == 0) + elif ignore_token is not None and conti_token is None: + ignore_mask = valid_ground_truth != ignore_token # batch x seq_len + num_valid_tokens = torch.sum(ignore_mask) + num_correct_tokens = torch.sum(valid_prob[ignore_mask] == valid_ground_truth[ignore_mask]) # by using mask, the tensor becomes 1d + elif ignore_token is not None and conti_token is not None: + ignore_conti_mask = (valid_ground_truth != ignore_token) & (valid_ground_truth != conti_token) + num_valid_tokens = torch.sum(ignore_conti_mask) + num_correct_tokens = torch.sum(valid_prob[ignore_conti_mask] == valid_ground_truth[ignore_conti_mask]) + return num_correct_tokens.item(), num_valid_tokens.item() + + def _get_loss_pred_from_single_batch(self, batch, valid=False): + # print(batch) + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + encoded_caption = encoded_caption.to(self.device) + if self.use_fp16: + if self.config.use_diff is True: + with self.accelerator.autocast(): + # breakpoint() + (logits_dict, (masked_indices, p_mask)),input_dict = self.model(input_seq, target,context=encoded_caption) + if self.config.use_dispLoss == True: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid, input_dict=input_dict,lambda_weight=self.config.lambda_weight,tau=self.config.tau) + else: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid) + else: + with self.accelerator.autocast(): + logits_dict,_ = self.model(input_seq, target,context=encoded_caption) + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, valid) + else: + if self.config.use_diff is True: + # breakpoint() + if self.config.use_dispLoss == True: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid, input_dict=input_dict,lambda_weight=self.config.lambda_weight) + else: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid) + else: + logits_dict, input_Dict = self.model(input_seq, target,context=encoded_caption) + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, valid) + if valid: + loss_dict['total'] = total_loss.item() + else: + loss_dict = {'total':total_loss.item()} + return total_loss, logits_dict, loss_dict + + def 
_get_valid_loss_and_acc_from_batch(self, batch, train=False): + ''' + in this method, valid means handled with both ignore token and mask + when valid tokens with only mask, it is called num_nonmask_tokens + + input_seq, target: batch x seq_len x num_features + mask: batch x seq_len, 0 for padding + prob: batch x seq_len x total_vocab_size + ''' + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + total_loss, logits_dict, loss_dict = self._get_loss_pred_from_single_batch(batch, valid=True) + probs_dict = {key:torch.softmax(value, dim=-1) for key, value in logits_dict.items()} + num_nonmask_tokens = torch.sum(mask) + input_seq = input_seq.to(self.device) + target = add_conti_in_valid(target, self.config.nn_params.encoding_scheme).to(self.device) + mask = mask[:, :-1].to(self.device) + + correct_guess_by_feature = defaultdict(int) + num_tokens_by_feature = defaultdict(int) + for idx, key in enumerate(self.vocab.feature_list): + if key == 'type': + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=None, conti_token=None) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=9999) + elif key == 'beat': + # NB's beat vocab has Ignore and CONTI token + # CP's beat vocab has Ignore and BAR token, we exclude BAR token in accuracy calculation for parity with NB + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=9999) + else: + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=None) + correct_guess_by_feature[key] = num_correct_tokens + num_tokens_by_feature[key] = num_valid_tokens + validation_loss = total_loss.item() * num_nonmask_tokens.item() + return validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess_by_feature): + total_num_correct_guess = 0 + total_num_tokens = 0 + acc_dict = {} + for feature, num_correct_guess in num_correct_guess_by_feature.items(): + if feature == 'type': + continue + total_num_correct_guess += num_correct_guess + total_num_tokens += num_tokens_by_feature[feature] + acc_dict[f"{feature}_acc"] = num_correct_guess / num_tokens_by_feature[feature] + total_accuracy = total_num_correct_guess / total_num_tokens + acc_dict['total_acc'] = total_accuracy + return acc_dict + + def validate(self, external_loader=None): + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_correct_guess = 0 + total_num_tokens = 0 + total_num_valid_tokens = 0 + validation_metric_dict = defaultdict(float) + total_num_tokens_by_feature = defaultdict(int) + total_num_correct_guess_dict = defaultdict(int) + + with torch.inference_mode(): + ''' + mask is used to calculate loss, accuracy + validation_loss: sum of loss for valid tokens conditioned on mask + num_nonmask_tokens: sum of tokens conditioned on mask + num_tokens_by_feature: sum of valid tokens(handle ignore) for each musical features + 
num_correct_guess_by_feature: sum of correct tokens(handle ignore) for each musical features + ''' + for num_iter, batch in tqdm(enumerate(loader), leave=False): + if num_iter == len(self.valid_loader): + if loader is not self.valid_loader: # when validate with train_loader + break + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, num_correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_nonmask_tokens + for key, num_tokens in num_tokens_by_feature.items(): + total_num_tokens_by_feature[key] += num_tokens + if key == 'type': # because cp and nb have different number of type tokens, we don't want to calculate accuracy for type token + continue + total_num_valid_tokens += num_tokens # num tokens are all the same for each musical type, torch.sum(mask) + for key, num_correct_guess in num_correct_guess_by_feature.items(): + total_num_correct_guess_dict[key] += num_correct_guess + if key == 'type': + continue + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + if key == 'total': + validation_metric_dict[key] += value * num_nonmask_tokens + else: + # if torch.isnan(value): # in case num valid tokens is 0 because of mask + # continue + feature_name = key.split('_')[0] + validation_metric_dict[key] += value * num_tokens_by_feature[feature_name] + + for key in validation_metric_dict.keys(): + if key == 'total': + validation_metric_dict[key] /= total_num_tokens + else: + feature_name = key.split('_')[0] + if total_num_tokens_by_feature[feature_name] == 0: + continue + validation_metric_dict[key] /= total_num_tokens_by_feature[feature_name] + for (key_t, num_tokens), (key_c, num_correct) in zip(total_num_tokens_by_feature.items(), total_num_correct_guess_dict.items()): + validation_metric_dict[f'{key_c}_acc'] = num_correct / num_tokens + + return total_validation_loss / (total_num_tokens + 1), total_num_correct_guess / (1+total_num_valid_tokens), validation_metric_dict + + def _make_midi_from_generated_output(self, generated_output, iter, seed, condition=None): + if self.config.data_params.first_pred_feature != 'type' and self.config.nn_params.encoding_scheme == 'nb': + generated_output = reverse_shift_and_pad_for_tensor(generated_output, self.config.data_params.first_pred_feature) + if condition is not None: + path_addition = "cond_" + else: + path_addition = "" + + # save generated_output as pickle + with open(self.valid_out_dir / f"{path_addition}generated_output_{iter}_seed_{seed}.pkl", 'wb') as f: + pickle.dump(generated_output, f) + self.midi_decoder(generated_output, self.valid_out_dir / f"{path_addition}midi_decoded_{iter}_seed_{seed}.mid") + if self.make_log and self.infer_and_log: + log_dict = {} + log_dict[f'{path_addition}gen_score'] = wandb.Image(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.png')) + log_dict[f'{path_addition}gen_audio'] = wandb.Audio(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.mp3')) + wandb.log(log_dict, step=(iter+seed)) + print(f"{path_addition}inference is logged: Iter {iter} / seed {seed}") + diff --git a/Amadeus/transformer_utils.py b/Amadeus/transformer_utils.py new file mode 100644 index 0000000..3982341 --- /dev/null +++ b/Amadeus/transformer_utils.py @@ -0,0 +1,949 @@ +import torch +import torch.nn as nn + +from x_transformers import Decoder, Encoder, PrefixDecoder, CrossAttender +from transformers import T5EncoderModel +from data_representation.vocab_utils 
import LangTokenVocab + +class PosEncoding(nn.Module): + def __init__(self, emb_size, max_t): + super().__init__() + self.emb_size =emb_size + self.max_t = max_t + self.register_buffer('encoding', self._prepare_emb()) + + def _prepare_emb(self): + dim_axis = 10000**(torch.arange(self.emb_size//2) * 2 / self.emb_size) # 10000 ** (normalized values between 0~1 num_emb_dim) + timesteps = torch.arange(self.max_t) + pos_enc_in = timesteps.unsqueeze(1) / dim_axis.unsqueeze(0) + pos_enc_sin = torch.sin(pos_enc_in) # x values for sin are between 0 ~ 1 so the values could never be the same + pos_enc_cos = torch.cos(pos_enc_in) + + pos_enc = torch.stack([pos_enc_sin, pos_enc_cos], dim=-1).reshape([self.max_t, self.emb_size]) + return pos_enc + + def forward(self, x): + return self.encoding[x] + +class ResidualLayerNormModule(nn.Module): + def __init__(self, submodule): + super().__init__() + self.submodule = submodule + self.layer_norm = nn.LayerNorm(self.submodule.input_size) + + def forward(self, x, mask=None, y=None): + if y is not None: + res_x = self.submodule(x, y, mask) + elif mask is not None: + res_x = self.submodule(x, mask) + else: + res_x = self.submodule(x) + x = x + res_x + return self.layer_norm(x) + +class SingleEmbedding(nn.Module): + def __init__( + self, + vocab, + dim_model, + ): + ''' + Embedding layer for REMI + ''' + super().__init__() + vocab_size = vocab.get_vocab_size() + self.embedding = nn.Embedding(vocab_size, dim_model) + + def forward(self, x): + return self.embedding(x) + +class MultiEmbedding(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int, + ): + super().__init__() + ''' + Embedding layer for compound tokens + ''' + self.vocab_size = vocab.get_vocab_size() + self.feature_list = vocab.feature_list + self.dim_model = dim_model + self.layers = [] + + self._make_emb_layers() + self._init_params() + self._make_emb_boundaries_by_key() + + def _init_params(self): + # apply kaiming init + for layer in self.layers: + if isinstance(layer, nn.Embedding): + nn.init.kaiming_normal_(layer.weight) + + def _make_emb_layers(self): + vocab_sizes = [self.vocab_size[key] for key in self.feature_list] + self.embedding_sizes = [self.dim_model for _ in self.feature_list] + for vocab_size, embedding_size in zip(vocab_sizes, self.embedding_sizes): + if embedding_size != 0: + self.layers.append(nn.Embedding(vocab_size, embedding_size)) + self.layers = nn.ModuleList(self.layers) + + def _make_emb_boundaries_by_key(self): + ''' + This function returns dict of boundaries for each embedding layer + ''' + self.emb_boundary_by_key = {} + start_idx = 0 + for key, emb_size in zip(self.feature_list, self.embedding_sizes): + if emb_size != 0: + self.emb_boundary_by_key[key] = (start_idx, start_idx + emb_size) + start_idx += emb_size + + def forward(self, x): + emb = torch.cat([module(x[..., i]) for i, module in enumerate(self.layers)], dim=-1) + return emb + + def __len__(self): + return len(self.layers) + + def get_emb_by_key(self, key, token): + layer_idx = self.feature_list.index(key) + return self.layers[layer_idx](token) + +class SummationEmbedder(MultiEmbedding): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + + def forward(self, seq): + emb_list = [module(seq[..., i]) for i, module in enumerate(self.layers)] + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + output = torch.sum(stacked_emb, dim=2) # B x T x emb_size + return output + +class AverageEmbedder(MultiEmbedding): + def 
__init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + + def forward(self, seq): + emb_list = [module(seq[..., i]) for i, module in enumerate(self.layers)] + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + output = torch.mean(stacked_emb, dim=2) # B x T x emb_size + return output + +class SelfAttentionEmbedder(MultiEmbedding): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + self.dropout = 0.1 + + self.transformer_encoder = Encoder( + dim = dim_model, + depth = 1, + heads = 8, + attn_dropout = self.dropout, + ff_dropout = self.dropout, + attn_flash = True) + + self.cls_embedding = nn.Parameter(torch.zeros(1, 1, self.dim_model), requires_grad=True) + + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff() + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn() + + def _add_dropout_after_attn(self): + for layer in self.transformer_encoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(self.dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(self.dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self): + for layer in self.transformer_encoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(self.dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_encoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def _apply_window_on_input_vec(self, embeddings): + window_size = 1 + zero_vec = torch.zeros(embeddings.shape[0], window_size-1, embeddings.shape[2], embeddings.shape[3]).to(embeddings.device) # B x (window_size-1) x num_features x emb_size + window_applied_input_vec = torch.cat([zero_vec, embeddings], dim=1) # B x (T+window_size-1) x num_features x emb_size + window_applied_input_vec = window_applied_input_vec.unfold(1, window_size, 1) # B x T x window_size x emb_size x num_features + window_applied_input_vec = window_applied_input_vec.transpose(3, 4) # B x T x window_size x num_features x emb_size + window_applied_input_vec = window_applied_input_vec.reshape(embeddings.shape[0]*embeddings.shape[1], -1, embeddings.shape[3]) # (B*T) x (num_features*window_size) x emb_size + return window_applied_input_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # (num_features*window_size+1) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x (num_features*window_size+1) + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x (num_features*window_size+1) x emb_size + return tgt_pos + + def forward(self, input_tokens): + ''' + input_tokens: B x T x num_features + ''' + # prepare input vector + emb_list = [module(input_tokens[..., i]) for i, module in enumerate(self.layers)] # B x T x 1 x emb_size + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + # apply window + stacked_emb = self._apply_window_on_input_vec(stacked_emb) + # add CLS + cls = self.cls_embedding.repeat(stacked_emb.shape[0], 1, 1) # (B*T) x 1 x 
emb_size + input_emb = torch.cat([stacked_emb, cls], dim=1) # (B*T) x (num_features*window_size+1) x emb_size + output = self.transformer_encoder(input_emb) # (B*T) x (num_features*window_size+1) x emb_size + # extract CLS + output = output[:, -1, :].reshape((input_tokens.shape[0], input_tokens.shape[1], -1)) # B x T x emb_size + return output + +class RVQMultiEmbedding(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__() + self.vocab_size = vocab.get_vocab_size() + self.dim_model = dim_model + self.features = vocab.feature_list + self.layers = [] + self._make_emb_layers() + + def _make_emb_layers(self): + vocab_sizes = [self.vocab_size[key] for key in self.features] + self.embedding_sizes = [self.dim_model for _ in self.features] + for vocab_size, embedding_size in zip(vocab_sizes, self.embedding_sizes): + if embedding_size != 0: + self.layers.append(nn.Embedding(vocab_size, embedding_size)) + self.layers = nn.ModuleList(self.layers) + + def forward(self, x): + embeddings = torch.zeros(x.shape[0], x.shape[1], self.dim_model).to(x.device) + emb_list = [module(x[:, (idx+1)%4::4]) for idx, module in enumerate(self.layers)] + for idx, emb in enumerate(emb_list): + embeddings[:, (idx+1)%4::4] = emb + return embeddings + + def get_emb_by_key(self, key:str, token:torch.Tensor): + layer_idx = self.features.index(key) + return self.layers[layer_idx](token) + +class XtransformerDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class 
XtransformerCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class XtransformerLargeCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-large') + # frozen text encoder + for param 
in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class NewCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False, 
+ use_rmsnorm=True, + ff_swish = True, # set this to True + ff_glu = True, # set to true to use for all feedforwards + ) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class NewCrossAttendwithRoPEDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False, + use_rmsnorm=True, + rotary_pos_emb = True, + ff_swish = True, # set this to True + ff_glu = True, # set to true to use for all feedforwards + ) + # add final dropout + print('Applying Xavier 
Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class XtransformerPrefixDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = PrefixDecoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in 
self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None): + assert context is not None, 'context should be provided for prefix decoder' + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class XtransformerPretrainingDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + 
layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None, context_embedding=None): + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class XtransformerFinetuningDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + ).last_hidden_state + else: + context = context_embedding + + # concatenate context with seq + seq = torch.cat([context, seq], dim=1) 
# B x (T+context_length) x emb_size + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + # cut to only return the seq part + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + # cut to only return the seq part + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec, intermediates + else: + # cut to only return the seq part + hidden_vec = self.transformer_decoder(seq) + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec + +class XtransformerLargeFinetuningDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-large') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + ).last_hidden_state + else: + context = context_embedding + + # concatenate context with seq + seq = torch.cat([context, seq], dim=1) # B x (T+context_length) x emb_size + if cache is not None: # 
implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + # cut to only return the seq part + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + # cut to only return the seq part + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec, intermediates + else: + # cut to only return the seq part + hidden_vec = self.transformer_decoder(seq) + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec \ No newline at end of file diff --git a/SongEval/.DS_Store b/SongEval/.DS_Store new file mode 100644 index 0000000..ee3d00a Binary files /dev/null and b/SongEval/.DS_Store differ diff --git a/SongEval/LICENSE b/SongEval/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/SongEval/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/SongEval/README.md b/SongEval/README.md new file mode 100644 index 0000000..72807ae --- /dev/null +++ b/SongEval/README.md @@ -0,0 +1,88 @@ +# 🎵 SongEval: A Benchmark Dataset for Song Aesthetics Evaluation + +[![Hugging Face Dataset](https://img.shields.io/badge/HuggingFace-Dataset-blue)](https://huggingface.co/datasets/ASLP-lab/SongEval) +[![Arxiv Paper](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/pdf/2505.10793) +[![License: CC BY-NC-SA 4.0](https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/) + + +This repository provides a **trained aesthetic evaluation toolkit** based on [SongEval](https://huggingface.co/datasets/ASLP-lab/SongEval), the first large-scale, open-source dataset for human-perceived song aesthetics. The toolkit enables **automatic scoring of generated song** across five perceptual aesthetic dimensions aligned with professional musician judgments. + +--- + +## 🌟 Key Features + +- 🧠 **Pretrained neural models** for perceptual aesthetic evaluation +- 🎼 Predicts **five aesthetic dimensions**: + - Overall Coherence + - Memorability + - Naturalness of Vocal Breathing and Phrasing + - Clarity of Song Structure + - Overall Musicality + +- 🎧 Accepts **full-length songs** (vocals + accompaniment) as input +- ⚙️ Simple inference interface + +--- + +## 📦 Installation + +Clone the repository and install dependencies: + +```bash +git clone https://github.com/ASLP-lab/SongEval.git +cd SongEval +pip install -r requirements.txt +``` + +## 🚀 Quick Start + +- Evaluate a single audio file: + +```bash +python eval.py -i /path/to/audio.mp3 -o /path/to/output +``` + +- Evaluate a list of audio files: + +```bash +python eval.py -i /path/to/audio_list.txt -o /path/to/output +``` + +- Evaluate all audio files in a directory: + +```bash +python eval.py -i /path/to/audio_directory -o /path/to/output +``` + +- Force evaluation on CPU (⚠️ CPU evaluation may be significantly slower) : + + +```bash +python eval.py -i /path/to/audio.wav -o /path/to/output --use_cpu True +``` + + +## 🙏 Acknowledgement +This project is mainly organized by the audio, speech and language processing lab [(ASLP@NPU)](http://www.npu-aslp.org/). 
+ +We sincerely thank the **Shanghai Conservatory of Music** for their expert guidance on music theory, aesthetics, and annotation design. +Meanwhile, we thank AISHELL for helping with the organization of the song annotations. + 

![Shanghai Conservatory of Music Logo](assets/logo.png)

+ +## 📑 License +This project is released under the CC BY-NC-SA 4.0 license. + +You are free to use, modify, and build upon it for non-commercial purposes, with attribution. + +## 📚 Citation +If you use this toolkit or the SongEval dataset, please cite the following: +``` +@article{yao2025songeval, + title = {SongEval: A Benchmark Dataset for Song Aesthetics Evaluation}, + author = {Yao, Jixun and Ma, Guobin and Xue, Huixin and Chen, Huakang and Hao, Chunbo and Jiang, Yuepeng and Liu, Haohe and Yuan, Ruibin and Xu, Jin and Xue, Wei and others}, + journal = {arXiv preprint arXiv:2505.10793}, + year={2025} +} + +``` diff --git a/SongEval/assets/logo.png b/SongEval/assets/logo.png new file mode 100644 index 0000000..d521073 Binary files /dev/null and b/SongEval/assets/logo.png differ diff --git a/SongEval/clap_score.py b/SongEval/clap_score.py new file mode 100644 index 0000000..912357a --- /dev/null +++ b/SongEval/clap_score.py @@ -0,0 +1,184 @@ +import os +import requests +from tqdm import tqdm +import torch +import numpy as np +import laion_clap +from clap_module.factory import load_state_dict +import librosa +import pyloudnorm as pyln + +# following documentation from https://github.com/LAION-AI/CLAP +def int16_to_float32(x): + return (x / 32767.0).astype(np.float32) + +def float32_to_int16(x): + x = np.clip(x, a_min=-1., a_max=1.) + return (x * 32767.).astype(np.int16) + + +def clap_score(id2text, audio_path, audio_files_extension='.wav', clap_model='music_speech_audioset_epoch_15_esc_89.98.pt'): + """ + Cosine similarity is computed between the LAION-CLAP text embedding of the given prompt and + the LAION-CLAP audio embedding of the generated audio. LION-CLAP: https://github.com/LAION-AI/CLAP + + This evaluation script assumes that audio_path files are identified with the ids in id2text. + + clap_score() evaluates all ids in id2text. + + GPU-based computation. + + Select one of the following models from https://github.com/LAION-AI/CLAP: + - music_speech_audioset_epoch_15_esc_89.98.pt (used by musicgen) + - music_audioset_epoch_15_esc_90.14.pt + - music_speech_epoch_15_esc_89.25.pt + - 630k-audioset-fusion-best.pt (our default, with "fusion" to handle longer inputs) + + Params: + -- id2text: dictionary with the mapping between id (generated audio filenames in audio_path) + and text (prompt used to generate audio). clap_score() evaluates all ids in id2text. + -- audio_path: path where the generated audio files to evaluate are available. + -- audio_files_extension: files extension (default .wav) in eval_path. + -- clap_model: choose one of the above clap_models (default: '630k-audioset-fusion-best.pt'). 
+ Returns: + -- CLAP-LION score + """ + # load model + if clap_model == 'music_speech_audioset_epoch_15_esc_89.98.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_audioset_epoch_15_esc_89.98.pt' + clap_path = 'load/clap_score/music_speech_audioset_epoch_15_esc_89.98.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == 'music_audioset_epoch_15_esc_90.14.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt' + clap_path = 'load/clap_score/music_audioset_epoch_15_esc_90.14.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == 'music_speech_epoch_15_esc_89.25.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_epoch_15_esc_89.25.pt' + clap_path = 'load/clap_score/music_speech_epoch_15_esc_89.25.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == '630k-audioset-fusion-best.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/630k-audioset-fusion-best.pt' + clap_path = 'load/clap_score/630k-audioset-fusion-best.pt' + model = laion_clap.CLAP_Module(enable_fusion=True, device='cuda') + else: + raise ValueError('clap_model not implemented') + + # download clap_model if not already downloaded + if not os.path.exists(clap_path): + print('Downloading ', clap_model, '...') + os.makedirs(os.path.dirname(clap_path), exist_ok=True) + + response = requests.get(url, stream=True) + total_size = int(response.headers.get('content-length', 0)) + + with open(clap_path, 'wb') as file: + with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar: + for data in response.iter_content(chunk_size=8192): + file.write(data) + progress_bar.update(len(data)) + + # fixing CLAP-LION issue, see: https://github.com/LAION-AI/CLAP/issues/118 + pkg = load_state_dict(clap_path) + pkg.pop('text_branch.embeddings.position_ids', None) + model.model.load_state_dict(pkg) + model.eval() + + if not os.path.isdir(audio_path): + raise ValueError('audio_path does not exist') + + if id2text: + print('[EXTRACTING TEXT EMBEDDINGS] ') + batch_size = 64 + text_emb = {} + for i in tqdm(range(0, len(id2text), batch_size)): + batch_ids = list(id2text.keys())[i:i+batch_size] + batch_texts = [id2text[id] for id in batch_ids] + with torch.no_grad(): + embeddings = model.get_text_embedding(batch_texts, use_tensor=True) + for id, emb in zip(batch_ids, embeddings): + text_emb[id] = emb + + else: + raise ValueError('Must specify id2text') + + print('[EVALUATING GENERATIONS] ', audio_path) + score = 0 + count = 0 + for id in tqdm(id2text.keys()): + file_path = os.path.join(audio_path, str(id)+audio_files_extension) + with torch.no_grad(): + audio, _ = librosa.load(file_path, sr=48000, mono=True) # sample rate should be 48000 + audio = pyln.normalize.peak(audio, -1.0) + audio = audio.reshape(1, -1) # unsqueeze (1,T) + audio = torch.from_numpy(int16_to_float32(float32_to_int16(audio))).float() + audio_embeddings = model.get_audio_embedding_from_data(x = audio, use_tensor=True) + cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_emb[id].unsqueeze(0), dim=1, eps=1e-8)[0] + score += cosine_sim + count += 1 + + return score / count if count > 0 else 0 + + +if __name__ == "__main__": + + import pandas as pd + import json + import argparse + parser = argparse.ArgumentParser(description='Compute CLAP score for 
generated audio files.') + parser.add_argument('--clap_model', type=str, default='630k-audioset-fusion-best.pt', + help='CLAP model to use for evaluation. Options: music_speech_audioset_epoch_15_esc_89.98.pt, music_audioset_epoch_15_esc_90.14.pt, music_speech_epoch_15_esc_89.25.pt, 630k-audioset-fusion-best.pt (default: 630k-audioset-fusion-best.pt)') + parser.add_argument('--root_path', type=str, default='../wandb/run-20250627_172105-xpe7nh5n-worseInstr/generated_samples_text_conditioned_top_p_threshold_0.99_temperature_1.15_8', + help='Path to the directory containing generated audio files and id2text mapping.') + args = parser.parse_args() + clap_model = args.clap_model + root_path = args.root_path + json_file_path = os.path.join(root_path, 'name2prompt.jsonl') + generated_path = os.path.join(root_path, 'prompt_music') + if not os.path.exists(generated_path): + generated_path = root_path # if no 'music' subfolder, use root_path directly + + with open(json_file_path, 'r') as f: + id2text_dict = {} + for line in f: + item = json.loads(line) + for k, v in item.items(): + id2text_dict[k] = v[0] + print('length of id2text:', len(id2text_dict)) + # id2text = {k+'_1': v[0] for k, v in id2text_dict.items()} # assuming each key has a list of prompts, we take the first one + id2text = {} + for k, v in id2text_dict.items(): + if isinstance(v, list): + id2text[k] = v[0] + # check if k exists as a wav file + if os.path.exists(os.path.join(generated_path, str(k)+'.wav')): + id2text[k] = v[0] + else: + # find k_*, k_1, k_2, ... and check if they exist + for i in range(0, 10): # assuming no more than 10 variations + if os.path.exists(os.path.join(generated_path, str(k)+'_'+str(i)+'.wav')): + new_key = str(k) + '_' + str(i) + id2text[new_key] = v[0] + print('length of id2text after checking wav files:', len(id2text)) + # check that each wav file exists + new_id2text = {} + for id in id2text.keys(): + file_path = os.path.join(generated_path, str(id)+'.wav') + if os.path.exists(file_path): + new_id2text[id] = id2text[id] + else: + print(f"Warning: {file_path} does not exist, skipping this id.") + print('length of new_id2text:', len(new_id2text)) + + """ + IMPORTANT: the audios in generated_path should have the same ids as in id2text. + For musiccaps, you can load id2text as above and each generated_path audio file + corresponds to a prompt (text description) in musiccaps. Files are named with ids, as follows: + - your_model_outputs_folder/_-kssA-FOzU.wav + - your_model_outputs_folder/_0-2meOf9qY.wav + - your_model_outputs_folder/_1woPC5HWSg.wav + ... 
+ - your_model_outputs_folder/ZzyWbehtt0M.wav + """ + + clp = clap_score(new_id2text, generated_path, audio_files_extension='.wav') + print('CLAP score (cosine similarity):', clp) \ No newline at end of file diff --git a/SongEval/config.yaml b/SongEval/config.yaml new file mode 100644 index 0000000..f30b498 --- /dev/null +++ b/SongEval/config.yaml @@ -0,0 +1,6 @@ +generator: + _target_: model.Generator + in_features: 1024 + ffd_hidden_size: 4096 + num_classes: 5 + attn_layer_num: 4 \ No newline at end of file diff --git a/SongEval/controlability.py b/SongEval/controlability.py new file mode 100644 index 0000000..1cb19aa --- /dev/null +++ b/SongEval/controlability.py @@ -0,0 +1,456 @@ +import json + +generate_path = 'Text2midi/muzic/musecoco/2-attribute2music_model/generation/0505/linear_mask-1billion-attribute2music/infer_test/topk15-t1.0-ngram0/all_midis' +# generate_path = 'Text2midi/t2m-inferalign/text2midi_infer_output' +# generate_path = 'wandb/no-disp-no-ciem/text_condi_top_p_t0.99_temp1.25' +test_set_json = "dataset/midicaps/train.json" + +generated_eval_json_path = f"{generate_path}/eval.json" +generated_name2prompt_jsonl_path = f"{generate_path}/name2prompt.jsonl" + +# 1. Read the test set and build a mapping from prompt to entry +with open(test_set_json, 'r') as f: + test_set = [] + for line in f: + if not line.strip(): + continue + item = json.loads(line.strip()) + test_set.append(item) +prompt2item = {item['caption']: item for item in test_set if item['test_set'] is True} +print(f"Number of prompts in test set: {len(prompt2item)}") +# 2. Read name2prompt.jsonl and build a mapping from name to prompt +name2prompt = {} +with open(generated_name2prompt_jsonl_path, 'r') as f: + for line in f: + obj = json.loads(line) + name2prompt.update({k: v[0] for k, v in obj.items() if isinstance(v, list) and len(v) > 0}) +# 3. Read eval.json +with open(generated_eval_json_path, 'r') as f: + eval_items = [] + for line in f: + if not line.strip(): + continue + item = json.loads(line.strip()) + eval_items.append(item) + +# 4. For each name, find its prompt, make sure the prompt is in the test set, then find the matching entry in eval.json +results = [] +# turn the name of eval_items into relative name +for item in eval_items: + item['name'] = item['name'].split('/')[-1] # assume name is a path; take the last component as the relative name + # drop everything after the second underscore + if '_' in item['name']: + item['name'] = item['name'].split('.')[0].split('_')[0] + '_' + item['name'].split('.')[0].split('_')[1] + # print(f"Processed eval item name: {item['name']}") + +for name, prompt in name2prompt.items(): + if prompt not in prompt2item: + print(f"Prompt not found in test set: {prompt}") + continue + # find the matching entry in eval.json (assuming eval.json entries have a 'name' field) + eval_entry = next((item for item in eval_items if item.get('name') == name), None) + if eval_entry is None: + print(f"Eval entry not found for name: {name}") + continue + # original (ground-truth) entry + original_entry = prompt2item[prompt] + results.append({ + 'name': name, + 'prompt': prompt, + 'eval_entry': eval_entry, + 'original_entry': original_entry + }) +print(f"Number of results: {len(results)}") +print(f"Sample result: {results[0] if results else 'No results'}") + +def calculate_TBT_score(results): + """ + • Tempo Bin with Tolerance (TBT): The predicted bpm falls into the ground truth tempo bin or +a neighboring one. 
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'tempo' in eval_entry and 'tempo' in original_entry: + eval_tempo = eval_entry['tempo'][0] if isinstance(eval_entry['tempo'], list) else eval_entry['tempo'] + original_tempo = original_entry['tempo'] + if original_tempo is None or eval_tempo is None: + continue # 如果原始条目没有 tempo,跳过 + # 检查 eval_tempo 是否在 original_tempo 的范围内 + if original_tempo - 10 <= eval_tempo <= original_tempo + 15: + correct += 1 + total += 1 + TB_score = correct / total if total > 0 else 0 + print(f"TB Score: {TB_score:.4f} (Correct: {correct}, Total: {total})") + return TB_score + +def calculate_CK_score(results): + """ + • Correct Key (CK): The predicted key matches the ground truth key. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'key' in eval_entry and 'key' in original_entry: + eval_key = eval_entry['key'][0] if isinstance(eval_entry['key'], list) else eval_entry['key'] + eval_key = eval_key if eval_key is not None else "C major" # 默认值为 C 大调 + original_key = original_entry['key'] if original_entry['key'] is not None else "C major" # 默认值为 C 大调 + if original_key is None or eval_key is None: + continue + if eval_key == original_key: + correct += 1 + total += 1 + CK_score = correct / total if total > 0 else 0 + print(f"CK Score: {CK_score:.4f} (Correct: {correct}, Total: {total})") + return CK_score +def calculate_CKD_score(results): + """ + Correct Key with Duplicates (CKD): The predicted key matches the ground truth key or an equivalent key (i.e., a major key and its relative minor). + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'key' in eval_entry and 'key' in original_entry: + eval_key = eval_entry['key'][0] if isinstance(eval_entry['key'], list) else eval_entry['key'] + if eval_key is None: + eval_key = "C major" # 默认值为 C 大调 + original_key = original_entry['key'] if original_entry['key'] is not None else "C major" + if original_key is None or eval_key is None: + continue # 如果原始条目没有 key,跳过 + # 检查 eval_key 是否与 original_key 相同或是其相对小调 + if eval_key == original_key or (eval_key.split(' ')[0] == original_key.split(' ')[0]): + correct += 1 + total += 1 + CKD_score = correct / total if total > 0 else 0 + print(f"CKD Score: {CKD_score:.4f} (Correct: {correct}, Total: {total})") + return CKD_score + +def calculate_CTS_score(results): + """ + • Correct Time Signature (CTS): The predicted time signature matches the ground truth time signature. 
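The CKD check above treats two keys as equivalent when they merely share a tonic (so 'C major' and 'C minor' would match), while the metric description pairs a major key with its relative minor. A hedged sketch of that stricter reading follows; the relative-key table is standard music theory, not taken from this repository.

# Illustrative sketch only: relative major/minor equivalence for a CKD-style check.
RELATIVE_MINOR = {
    "C major": "A minor", "G major": "E minor", "D major": "B minor",
    "A major": "F# minor", "E major": "C# minor", "B major": "G# minor",
    "F# major": "D# minor", "Db major": "Bb minor", "Ab major": "F minor",
    "Eb major": "C minor", "Bb major": "G minor", "F major": "D minor",
}

def keys_equivalent(pred_key: str, gt_key: str) -> bool:
    """True if the keys match exactly or form a relative major/minor pair."""
    if pred_key == gt_key:
        return True
    return RELATIVE_MINOR.get(pred_key) == gt_key or RELATIVE_MINOR.get(gt_key) == pred_key

assert keys_equivalent("C major", "A minor")
assert not keys_equivalent("C major", "C minor")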
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'time_signature' in eval_entry and 'time_signature' in original_entry: + eval_time_signature = eval_entry['time_signature'][0] if isinstance(eval_entry['time_signature'], list) else eval_entry['time_signature'] + original_time_signature = original_entry['time_signature'] + if original_time_signature is None or eval_time_signature is None: + continue # 如果原始条目没有 time signature,跳过 + if eval_time_signature == original_time_signature: + correct += 1 + else: + # 检查是否为相同的节拍(如 4/4 和 2/2) + eval_numerator, eval_denominator = map(int, eval_time_signature.split('/')) + original_numerator, original_denominator = map(int, original_time_signature.split('/')) + if (eval_numerator == original_numerator and eval_denominator == original_denominator) or \ + (eval_numerator * 2 == original_numerator and eval_denominator == original_denominator): + correct += 1 + total += 1 + CTS_score = correct / total if total > 0 else 0 + print(f"CTS Score: {CTS_score:.4f} (Correct: {correct}, Total: {total})") + return CTS_score + +def calculate_ECM_score(results): + """ + Exact Chord Match (ECM): The predicted + chord sequence matches the ground truth exactly + in terms of order, chord root, and chord type, with + tolerance for missing and excess chord instances. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'chord_summary' in eval_entry and 'chord_summary' in original_entry: + eval_chord_summary = eval_entry['chord_summary'][0] if isinstance(eval_entry['chord_summary'], list) else eval_entry['chord_summary'] + original_chord_summary = original_entry['chord_summary'] + if original_chord_summary is None or eval_chord_summary is None: + continue + # 检查 eval_chord_summary 是否包含 original_chord_summary,两个都是列表,每个元素是一个字符串 + if eval_chord_summary == original_chord_summary: + correct += 1 + total += 1 + ECM_score = correct / total if total > 0 else 0 + print(f"ECM Score: {ECM_score:.4f} (Correct: {correct}, Total: {total})") + return ECM_score + +def calculate_CMO_score(results): + """ + • Chord Match in any Order (CMO): The portion of predicted chord sequence matching the +ground truth chord root and type, in any order + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'chords' in eval_entry and 'chord_summary' in original_entry: + eval_chords_seq = eval_entry['chords'] + # remove the confidence score from eval_chords_seq + if isinstance(eval_chords_seq, list) and len(eval_chords_seq) > 0 and isinstance(eval_chords_seq[0], list): + eval_chords_seq = [chord[0] for chord in eval_chords_seq] + original_chord_summary = original_entry['chord_summary'] + if original_chord_summary is None or eval_chords_seq is None: + continue + # 检查 eval_chords_seq 是否包含 original_chord_summary,两个都是列表 + eval_chords_set = set(eval_chords_seq) # [['C', 0.464399092], ['G', 2.879274376]] + original_chord_set = set(original_chord_summary) # ['G', 'C'] + if original_chord_set.issubset(eval_chords_set): + correct += 1 + else: + if original_chord_set == eval_chords_set: + correct += 1 + total += 1 + CMO_score = correct / total if total > 0 else 0 + print(f"CMO Score: {CMO_score:.4f} (Correct: {correct}, Total: {total})") + return CMO_score + +def calculate_CI_score(results): + """ + •Correct Instrument (CI): The predicted instrument matches 
the ground truth instrument. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mapped_instruments_summary' in eval_entry and 'instrument_summary' in original_entry: + eval_instrument = eval_entry['mapped_instruments_summary'] if isinstance(eval_entry['mapped_instruments'], list) else eval_entry['mapped_instruments'] + original_instrument = original_entry['instrument_summary'] + if original_instrument is None or eval_instrument is None: + continue + # 检查 eval_instrument 是否包含 original_instrument + if isinstance(eval_instrument, list): + eval_instrument_set = set(eval_instrument) + original_instrument_set = set(original_instrument) + if original_instrument_set.issubset(eval_instrument_set): + correct += 1 + else: + if eval_instrument == original_instrument: + correct += 1 + total += 1 + CI_score = correct / total if total > 0 else 0 + print(f"CI Score: {CI_score:.4f} (Correct: {correct}, Total: {total})") + return CI_score + +def calculate_CI_top1_score(results): + """ + •Correct Instrument Top-1 (CI_top1): The predicted instrument matches the ground truth instrument + or is one of the top 3 predicted instruments. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mapped_instruments_summary' in eval_entry and 'instrument_summary' in original_entry: + eval_instrument = eval_entry['mapped_instruments_summary'] if isinstance(eval_entry['mapped_instruments'], list) else eval_entry['mapped_instruments'] + original_instrument = original_entry['instrument_summary'] + if original_instrument is None or eval_instrument is None: + continue + # 检查 eval_instrument 是否包含 original_instrument中的一个元素 + if isinstance(eval_instrument, list): + eval_instrument_set = set(eval_instrument) + original_instrument_set = set(original_instrument) + for inst in original_instrument_set: + if inst in eval_instrument_set: + correct += 1 + break + else: + if eval_instrument == original_instrument: + correct += 1 + total += 1 + CI_top1_score = correct / total if total > 0 else 0 + print(f"CI Top-1 Score: {CI_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CI_top1_score + +def calculate_CG_score(results): + """ + • Correct Genre (CG): The predicted genre matches the ground truth genre. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'genre' in eval_entry and 'genre' in original_entry: + eval_genre = eval_entry['genre'][0] if isinstance(eval_entry['genre'], list) else eval_entry['genre'] + original_genre = original_entry['genre'] + if original_genre is None or eval_genre is None: + continue + # 检查 eval_genre 是否包含 original_genre + if isinstance(eval_genre, list): + eval_genre_set = set(eval_genre) + original_genre_set = set(original_genre) + if original_genre_set.issubset(eval_genre_set): + correct += 1 + else: + if eval_genre == original_genre: + correct += 1 + total += 1 + CG_score = correct / total if total > 0 else 0 + print(f"CG Score: {CG_score:.4f} (Correct: {correct}, Total: {total})") + return CG_score + +def calculate_CG_top1_score(results): + """ + • Correct Genre Top-1 (CG_top1): The predicted genre matches the ground truth genre or is one of the top 3 predicted genres. 
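The *_top1 variants (CI_top1 above, CG_top1 and CM_top1 around it) count a prediction as correct as soon as any ground-truth label appears among the predicted labels. A compact sketch of that rule, with made-up labels:

# Illustrative only: the "any overlap" rule used by the *_top1 scores.
def any_label_overlap(predicted, ground_truth) -> bool:
    """True if at least one ground-truth label was predicted."""
    return bool(set(ground_truth) & set(predicted))

print(any_label_overlap(["piano", "violin", "drums"], ["violin", "flute"]))  # True
print(any_label_overlap(["piano"], ["flute"]))                               # False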
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'genre' in eval_entry and 'genre' in original_entry: + eval_genre = eval_entry['genre'][0] if isinstance(eval_entry['genre'], list) else eval_entry['genre'] + original_genre = original_entry['genre'] + if original_genre is None or eval_genre is None: + continue + # 检查 eval_genre 是否包含 original_genre中的一个元素 + if isinstance(eval_genre, list): + eval_genre_set = set(eval_genre) + original_genre_set = set(original_genre) + for gen in original_genre_set: + if gen in eval_genre_set: + correct += 1 + break + else: + if eval_genre == original_genre: + correct += 1 + total += 1 + CG_top1_score = correct / total if total > 0 else 0 + print(f"CG Top-1 Score: {CG_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CG_top1_score + +def calculate_CM_score(results): + """ + • Correct Mood (CM): The predicted mood matches the ground truth mood. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mood' in eval_entry and 'mood' in original_entry: + eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood'] + original_mood = original_entry['mood'] + if original_mood is None or eval_mood is None: + continue + # 检查 eval_mood 是否包含 original_mood + if isinstance(eval_mood, list): + eval_mood_set = set(eval_mood) + original_mood_set = set(original_mood) + if original_mood_set.issubset(eval_mood_set): + correct += 1 + else: + if eval_mood == original_mood: + correct += 1 + total += 1 + CM_score = correct / total if total > 0 else 0 + print(f"CM Score: {CM_score:.4f} (Correct: {correct}, Total: {total})") + return CM_score + +def calculate_CM_top1_score(results): + """ + • Correct Mood Top-1 (CM_top1): The predicted mood matches the ground truth mood or is one of the top 3 predicted moods. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mood' in eval_entry and 'mood' in original_entry: + eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood'] + original_mood = original_entry['mood'] + if original_mood is None or eval_mood is None: + continue + # 检查 eval_mood 是否包含 original_mood中的一个元素 + if isinstance(eval_mood, list): + eval_mood_set = set(eval_mood) + original_mood_set = set(original_mood) + for mood in original_mood_set: + if mood in eval_mood_set: + correct += 1 + break + else: + if eval_mood == original_mood: + correct += 1 + total += 1 + CM_top1_score = correct / total if total > 0 else 0 + print(f"CM Top-1 Score: {CM_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CM_top1_score + +def calculate_CM_top3_score(results): + """ + • Correct Mood Top-3 (CM_top3): The predicted mood matches the ground truth mood or is one of the top 3 predicted moods. 
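The CM_top3 implementation that follows (its docstring repeats the Top-1 wording) actually counts a hit when all ground-truth moods are predicted, if there are at most three of them, or when at least three of them are predicted otherwise. A toy illustration of that rule, with invented mood labels:

# Illustrative only: mirror of the CM_top3 rule implemented just below.
def cm_top3_hit(predicted, ground_truth) -> bool:
    pred, gt = set(predicted), set(ground_truth)
    if len(gt) <= 3:
        return gt.issubset(pred)
    return len(gt & pred) >= 3

print(cm_top3_hit(["calm", "happy", "epic"], ["calm", "happy"]))    # True: both ground-truth moods predicted
print(cm_top3_hit(["calm"], ["calm", "happy", "epic", "dark"]))     # False: only 1 of 4 matched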
+    """
+    correct = 0
+    total = 0
+    for result in results:
+        eval_entry = result['eval_entry']
+        original_entry = result['original_entry']
+        if 'mood' in eval_entry and 'mood' in original_entry:
+            eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood']
+            original_mood = original_entry['mood']
+            if original_mood is None or eval_mood is None:
+                continue
+            # check whether eval_mood contains at least 3 of the moods in original_mood
+            if isinstance(eval_mood, list):
+                eval_mood_set = set(eval_mood)
+                original_mood_set = set(original_mood)
+                if len(original_mood_set) <= 3 and original_mood_set.issubset(eval_mood_set):
+                    correct += 1
+                elif len(original_mood_set) > 3:
+                    match_num = sum(1 for mood in original_mood_set if mood in eval_mood_set)
+                    if match_num >= 3:
+                        correct += 1
+            else:
+                if eval_mood == original_mood:
+                    correct += 1
+            total += 1
+    CM_top3_score = correct / total if total > 0 else 0
+    print(f"CM Top-3 Score: {CM_top3_score:.4f} (Correct: {correct}, Total: {total})")
+    return CM_top3_score
+
+def calculate_all_scores(results):
+    """
+    Calculate all scores and return them as a dictionary.
+    """
+    scores = {
+        'TBT_score': calculate_TBT_score(results),
+        'CK_score': calculate_CK_score(results),
+        'CKD_score': calculate_CKD_score(results),
+        'CTS_score': calculate_CTS_score(results),
+        'ECM_score': calculate_ECM_score(results),
+        'CMO_score': calculate_CMO_score(results),
+        'CI_score': calculate_CI_score(results),
+        'CI_top1_score': calculate_CI_top1_score(results),
+        'CG_score': calculate_CG_score(results),
+        'CG_top1_score': calculate_CG_top1_score(results),
+        'CM_score': calculate_CM_score(results),
+        'CM_top1_score': calculate_CM_top1_score(results),
+        'CM_top3_score': calculate_CM_top3_score(results)
+    }
+    return scores
+if __name__ == "__main__":
+    scores = calculate_all_scores(results)
+    print("All Scores:")
+    for score_name, score_value in scores.items():
+        print(f"{score_name}: {score_value:.4f}")
+
+    # Save the results to a JSON file
+    output_file = f"{generate_path}/results.json"
+    with open(output_file, 'w') as f:
+        json.dump(scores, f, indent=4)
+    print(f"Results saved to {output_file}")
+
diff --git a/SongEval/ebr.py b/SongEval/ebr.py
new file mode 100644
index 0000000..4b7a4f8
--- /dev/null
+++ b/SongEval/ebr.py
@@ -0,0 +1,103 @@
+import argparse
+import glob
+import os
+import pandas as pd
+import muspy
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+def compute_midi_metrics(file_path):
+    """Compute music metrics for a single MIDI file."""
+    try:
+        music = muspy.read(file_path)
+        scale_consistency = muspy.scale_consistency(music)
+        pitch_entropy = muspy.pitch_entropy(music)
+        pitch_class_entropy = muspy.pitch_class_entropy(music)
+        empty_beat_rate = muspy.empty_beat_rate(music)
+        groove_consistency = muspy.groove_consistency(music, 12)
+        metrics = {
+            'scale_consistency': scale_consistency,
+            'pitch_entropy': pitch_entropy,
+            'pitch_class_entropy': pitch_class_entropy,
+            'empty_beat_rate': empty_beat_rate,
+            'groove_consistency': groove_consistency,
+            'filename': os.path.basename(file_path)
+        }
+        return metrics
+    except Exception as e:
+        print(f"Error while processing {os.path.basename(file_path)}: {str(e)}")
+        return None
+
+def compute_directory_metrics(directory_path, num_workers=8):
+    """Compute music metrics for all MIDI files in a directory (multi-threaded)."""
+    midi_files = []
+    for root, _, files in os.walk(directory_path):
+        for file in files:
+            if file.lower().endswith(('.mid', '.midi')):
+                midi_files.append(os.path.join(root, file))
+    if not midi_files:
+        print("No MIDI files found in the directory or its subfolders")
+        return None
+
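compute_midi_metrics above wraps a handful of muspy objective metrics; a minimal standalone usage sketch, assuming muspy is installed and some local file such as example.mid exists:

# Illustrative only: the same muspy metrics computed for one (hypothetical) file.
import muspy

music = muspy.read("example.mid")  # hypothetical input path
print("scale consistency:  ", muspy.scale_consistency(music))
print("pitch entropy:      ", muspy.pitch_entropy(music))
print("pitch class entropy:", muspy.pitch_class_entropy(music))
print("empty beat rate:    ", muspy.empty_beat_rate(music))
print("groove consistency: ", muspy.groove_consistency(music, 12))  # 12 = measure resolution, as above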
+    all_metrics = []
+    average_metrics = {
+        'scale_consistency': 0,
+        'pitch_entropy': 0,
+        'pitch_class_entropy': 0,
+        'empty_beat_rate': 0,
+        'groove_consistency': 0
+    }
+    current_num = 0
+    total_scale_consistency = 0
+    total_pitch_entropy = 0
+    total_pitch_class_entropy = 0
+    total_empty_beat_rate = 0
+    total_groove_consistency = 0
+    print(f"Processing directory: {directory_path}")
+    print(f"Found {len(midi_files)} MIDI files:")
+
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = {executor.submit(compute_midi_metrics, midi_file): midi_file for midi_file in midi_files}
+        for future in tqdm(as_completed(futures), total=len(midi_files), desc="processing"):
+            metrics = future.result()
+
+            if metrics is not None:
+                current_num += 1
+                total_scale_consistency += metrics['scale_consistency']
+                total_pitch_entropy += metrics['pitch_entropy']
+                total_pitch_class_entropy += metrics['pitch_class_entropy']
+                total_empty_beat_rate += metrics['empty_beat_rate']
+                total_groove_consistency += metrics['groove_consistency']
+                average_metrics['scale_consistency'] = total_scale_consistency / current_num
+                average_metrics['pitch_entropy'] = total_pitch_entropy / current_num
+                average_metrics['pitch_class_entropy'] = total_pitch_class_entropy / current_num
+                average_metrics['empty_beat_rate'] = total_empty_beat_rate / current_num
+                average_metrics['groove_consistency'] = total_groove_consistency / current_num
+                print("current_metrics:", metrics)
+
+                all_metrics.append(metrics)
+
+    if not all_metrics:
+        print("All files failed to process")
+        return None
+
+    df = pd.DataFrame(all_metrics)
+    output_csv = os.path.join(directory_path, "midi_metrics_report.csv")
+    df.to_csv(output_csv, index=False)
+    avg_metrics = df.mean(numeric_only=True)
+    return df, avg_metrics
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compute music metrics for all MIDI files in a directory")
+    parser.add_argument("path", type=str, help="Path to a directory containing MIDI files")
+    parser.add_argument("--threads", type=int, default=1, help="Number of worker threads (default: 1)")
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.path):
+        print(f"Error: path '{args.path}' does not exist or is not a directory")
+    else:
+        ret = compute_directory_metrics(args.path, num_workers=args.threads)
+        if ret is not None:
+            result, averages = ret
+            print("\n计算完成!
结果已保存到 midi_metrics_report.csv") + print("\n平均指标值:") + print(averages.to_string()) diff --git a/SongEval/eval.py b/SongEval/eval.py new file mode 100644 index 0000000..0d1c0d3 --- /dev/null +++ b/SongEval/eval.py @@ -0,0 +1,150 @@ +import glob +import os +import json +import librosa +import numpy as np +import torch +import argparse +from muq import MuQ +from hydra.utils import instantiate +from omegaconf import OmegaConf +from safetensors.torch import load_file +from tqdm import tqdm + + + +class Synthesizer(object): + + def __init__(self, + checkpoint_path, + input_path, + output_dir, + use_cpu: bool = False): + + self.checkpoint_path = checkpoint_path + self.input_path = input_path + self.output_dir = output_dir + os.makedirs(self.output_dir, exist_ok=True) + self.device = torch.device('cuda') if (torch.cuda.is_available() and (not use_cpu)) else torch.device('cpu') + + @torch.no_grad() + def setup(self): + + train_config = OmegaConf.load(os.path.join(os.path.dirname(self.checkpoint_path), '../config.yaml')) + model = instantiate(train_config.generator).to(self.device).eval() + state_dict = load_file(self.checkpoint_path, device="cpu") + model.load_state_dict(state_dict, strict=False) + + self.model = model + self.muq = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter") + self.muq = self.muq.to(self.device).eval() + self.result_dcit = {} + + @torch.no_grad() + def synthesis(self): + if os.path.isfile(self.input_path): + if self.input_path.endswith(('.wav', '.mp3')): + lines = [] + lines.append(self.input_path) + else: + with open(self.input_path, "r") as f: + lines = [line for line in f] + input_files = [{ + "input_path": line.strip(), + } for line in lines] + print(f"input filelst: {self.input_path}") + elif os.path.isdir(self.input_path): + input_files = [{ + "input_path": file, + }for file in glob.glob(os.path.join(self.input_path, '*')) if file.lower().endswith(('.wav', '.mp3'))] + else: + raise ValueError(f"input_path {self.input_path} is not a file or directory") + + + for input in tqdm(input_files): + try: + self.handle(**input) + except Exception as e: + print(e) + continue + # add average + avg_values = {} + for key in self.result_dcit[list(self.result_dcit.keys())[0]].keys(): + avg_values[key] = round(np.mean([self.result_dcit[fid][key] for fid in self.result_dcit]), 4) + self.result_dcit['average'] = avg_values + # save result + with open(os.path.join(self.output_dir, "result.json") , "w")as f: + json.dump(self.result_dcit, f, indent=4, ensure_ascii=False) + + @torch.no_grad() + def handle(self, input_path): + + fid = os.path.basename(input_path).split('.')[0] + if input_path.endswith('.npy'): + input = np.load(input_path) + + # check ssl + if len(input.shape) == 3 and input.shape[0] != 1: + print('ssl_shape error', input_path) + return + if np.isnan(input).any(): + print('ssl nan', input_path) + return + + input = torch.from_numpy(input).to(self.device) + if len(input.shape) == 2: + input = input.unsqueeze(0) + + if input_path.endswith(('.wav', '.mp3')): + wav, sr = librosa.load(input_path, sr=24000) + audio = torch.tensor(wav).unsqueeze(0).to(self.device) + output = self.muq(audio, output_hidden_states=True) + input = output["hidden_states"][6] + + values = {} + scores_g = self.model(input).squeeze(0) + values['Coherence'] = round(scores_g[0].item(), 4) + values['Musicality'] = round(scores_g[1].item(), 4) + values['Memorability'] = round(scores_g[2].item(), 4) + values['Clarity'] = round(scores_g[3].item(), 4) + values['Naturalness'] = round(scores_g[4].item(), 4) + + + 
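The five Generator outputs just above are mapped one-to-one onto the aspect names; a compact sketch of that mapping with a made-up score vector:

# Illustrative only: turning the 5-dim score vector into the per-file result dict.
import torch

ASPECTS = ["Coherence", "Musicality", "Memorability", "Clarity", "Naturalness"]
scores_g = torch.tensor([2.08, 2.00, 1.93, 2.02, 1.99])  # invented example output of the scoring head
values = {name: round(score.item(), 4) for name, score in zip(ASPECTS, scores_g)}
print(values)  # {'Coherence': 2.08, ..., 'Naturalness': 1.99}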
self.result_dcit[fid] = values + # delete + del input, output, scores_g, values,audio, wav, sr + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", "--input_path", + type=str, + required=True, + help="Input audio: path to a single file, a text file listing audio paths, or a directory of audio files." + ) + parser.add_argument( + "-o", "--output_dir", + type=str, + required=True, + help="Output directory for generated results (will be created if it doesn't exist)." + ) + parser.add_argument( + "--use_cpu", + type=str, + help="Force CPU mode even if a GPU is available.", + default=False + ) + + args = parser.parse_args() + + ckpt_path = "ckpt/model.safetensors" + + synthesizer = Synthesizer(checkpoint_path=ckpt_path, + input_path=args.input_path, + output_dir=args.output_dir, + use_cpu=args.use_cpu) + + synthesizer.setup() + + synthesizer.synthesis() \ No newline at end of file diff --git a/SongEval/generate-batch_easy.py b/SongEval/generate-batch_easy.py new file mode 100644 index 0000000..05e5c38 --- /dev/null +++ b/SongEval/generate-batch_easy.py @@ -0,0 +1,404 @@ +import sys +import os +from pathlib import Path +from multiprocessing import Process,set_start_method +import torch +import argparse +from omegaconf import OmegaConf +import json +from collections import defaultdict + +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, + prepare_model_and_dataset_from_config, + get_best_ckpt_path_and_config, + Evaluator +) +from transformers import T5Tokenizer, T5EncoderModel + +from Amadeus import model_zoo +from Amadeus.symbolic_encoding import data_utils +from Amadeus.model_zoo import AmadeusModel +from Amadeus.symbolic_encoding.data_utils import TuneCompiler +from Amadeus.symbolic_encoding.compile_utils import shift_and_pad +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from Amadeus.symbolic_encoding import decoding_utils +from Amadeus.train_utils import adjust_prediction_order +from data_representation import vocab_utils +from data_representation.vocab_utils import LangTokenVocab + + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-generation_type", + type=str, + choices=('conditioned', 'unconditioned', 'text-conditioned'), + default='unconditioned', + help="generation type", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-num_samples", + type=int, + default=30, + help="number of samples to generate", + ) + parser.add_argument( + "-num_target_measure", + type=int, + default=4, + help="number of target measures for conditioned generation", + ) + parser.add_argument( + "-choose_selected_tunes", + action='store_true', + help="generate samples from selected tunes, only for SOD dataset", + ) + parser.add_argument( + "-generate_length", + type=int, + default=1024, + help="length of the generated sequence", + ) + parser.add_argument( + "-num_processes", + type=int, + default=2, + help="number of processes to use", + ) + parser.add_argument( + "-gpu_ids", + type=str, + default="0,5", + help="comma-separated list of GPU IDs to use 
(e.g., '0,1,2,3')", + ) + parser.add_argument( + "-prompt", + type=str, + default="With a rhythm of 100 BPM, this classical piece in 1/4 time signature in the key of Eb major creates a classical mood using String Ensemble, Pizzicato Strings, Tremolo Strings, Trumpet, Timpani.", + help="prompt for generation, only used for conditioned generation", + ) + parser.add_argument( + "-prompt_file", + type=str, + default="dataset/midicaps/train.json", + help="file containing prompts for text-conditioned generation", + ) + return parser + +def load_resources(wandb_exp_dir, device): + """Load model and dataset resources for a process""" + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, wandb_exp_dir) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + # Load checkpoint to specified device + ckpt = torch.load(ckpt_path, map_location=device) + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + # Prepare dataset for prompts + condition_list = [x[1] for x in test_set.data_list] + dataset_for_prompt = [] + for i in range(len(condition_list)): + condition = test_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + dataset_for_prompt.append((condition, condition_list[i])) + + return config, model, dataset_for_prompt, vocab + +def conditioned_worker(process_idx, gpu_id, args, data_slice): + """Worker process for conditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"cond_{args.num_target_measure}m_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Process assigned data slice + for idx, (tune_in_idx, tune_name) in enumerate(data_slice): + batch_dir = base_path / f"process_{process_idx}_batch_{idx}" + batch_dir.mkdir(parents=True, exist_ok=True) + evaluator.generate_samples_with_prompt( + batch_dir, + args.num_target_measure, + tune_in_idx, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length + ) +def generate_samples_unconditioned(config, vocab, model, device,save_dir, num_samples, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset) + + for i in range(num_samples): + generated_sample = model.generate(0, 
generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature)
+        if encoding_scheme == 'nb':
+            generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature)
+        decoder(generated_sample, output_path=str(save_dir / f"{uid}_{i}.mid"))
+
+def generate_samples_with_text_prompt(config, vocab, model, device, save_dir, prompt, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072, uid=1):
+    encoding_scheme = config.nn_params.encoding_scheme
+    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
+    encoder = T5EncoderModel.from_pretrained('google/flan-t5-large').to(device)
+    print(f"Using T5EncoderModel for text prompt: {prompt}")
+    context = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(device)
+    context = encoder(**context).last_hidden_state
+    in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4}
+    try:
+        in_beat_resolution = in_beat_resolution_dict[config.dataset]
+    except KeyError:
+        in_beat_resolution = 4  # Default resolution if dataset is not found
+    midi_decoder_dict = {'remi': 'MidiDecoder4REMI', 'cp': 'MidiDecoder4CP', 'nb': 'MidiDecoder4NB'}
+    decoder_name = midi_decoder_dict[encoding_scheme]
+    decoder = getattr(decoding_utils, decoder_name)(vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset)
+
+    generated_sample = model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context)
+    if encoding_scheme == 'nb':
+        generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature)
+    # Open the jsonl file and count its lines to determine the current prompt index
+    jsonl_path = save_dir / "name2prompt.jsonl"
+    if jsonl_path.exists():
+        with open(jsonl_path, 'r') as f:
+            current_idx = sum(1 for _ in f)
+    else:
+        current_idx = 0
+
+    name = f"prompt_{current_idx}"
+    name2prompt_dict = defaultdict(list)
+    name2prompt_dict[name].append(prompt)
+    with open(jsonl_path, 'a') as f:
+        f.write(json.dumps(name2prompt_dict) + '\n')
+    decoder(generated_sample, output_path=str(save_dir / f"{name}_{uid}.mid"))
+
+def unconditioned_worker(process_idx, gpu_id, args, num_samples):
+    """Worker process for unconditioned generation"""
+    torch.cuda.set_device(gpu_id)
+    device = torch.device(f'cuda:{gpu_id}')
+
+    # Load resources with proper device
+    config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device)
+
+    # Create output directory with process index
+    base_path = Path('wandb') / args.wandb_exp_dir / \
+        f"uncond_{args.sampling_method}_t{args.threshold}_temp{args.temperature}"
+    base_path.mkdir(parents=True, exist_ok=True)
+
+    # Generate assigned number of samples
+    batch_dir = base_path
+    generate_samples_unconditioned(
+        config,
+        vocab,
+        model,
+        device,
+        batch_dir,
+        num_samples,
+        config.data_params.first_pred_feature,
+        args.sampling_method,
+        args.threshold,
+        args.temperature,
+        generation_length=args.generate_length,
+        uid=f"{process_idx}"
+    )
+
+def text_conditioned_worker(process_idx, gpu_id, args, num_samples, data_slice):
+    """Worker process for text-conditioned generation"""
+    torch.cuda.set_device(gpu_id)
+    device = torch.device(f'cuda:{gpu_id}')
+
+    # Load resources with proper device
+    config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device)
+
+    # Create output directory with
process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"text_condi_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + # Generate assigned number of samples + batch_dir = base_path + for idx, tune_name in enumerate(data_slice): + print(f"Process {process_idx} generating samples for tune: {tune_name}") + generate_samples_with_text_prompt( + config, + vocab, + model, + device, + batch_dir, + prompt=tune_name, + first_pred_feature=config.data_params.first_pred_feature, + sampling_method=args.sampling_method, + threshold=args.threshold, + temperature=args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}_{idx}" + ) +def main(): + # use spawn method for multiprocessing + set_start_method('spawn', force=True) + args = get_argument_parser().parse_args() + gpu_ids = list(map(int, args.gpu_ids.split(','))) + + # Validate GPU availability + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if len(gpu_ids) == 0: + raise ValueError("At least one GPU must be specified") + + # Validate process count + if args.num_processes < 1: + raise ValueError("Number of processes must be at least 1") + if len(gpu_ids) < args.num_processes: + print(f"Warning: More processes ({args.num_processes}) than GPUs ({len(gpu_ids)}), some GPUs will be shared") + + # Prepare data slices for processes + processes = [] + try: + if args.generation_type == 'conditioned': + # Prepare selected tunes + wandb_dir = Path('wandb') / args.wandb_exp_dir + if not wandb_dir.exists(): + raise FileNotFoundError(f"Experiment {args.wandb_exp_dir} not found") + + # Load test set to get selected tunes (dummy load to get dataset info) + dummy_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + _, test_set, _ = prepare_model_and_dataset_from_config( + wandb_dir / "files" / "config.yaml", + wandb_dir / "files" / "metadata.json", + wandb_dir / "files" / "vocab.json" + ) + + if args.choose_selected_tunes and test_set.dataset == 'SOD': + selected_tunes = ['Requiem_orch', 'magnificat_bwv-243_8_orch', + "Clarinet Concert in A Major: 2nd Movement, Adagio_orch"] + else: + selected_tunes = [name for _, name in test_set.data_list][:args.num_samples] + + # Split selected data across processes + selected_data = [d for d in test_set.data_list if d[1] in selected_tunes] + chunk_size = (len(selected_data) + args.num_processes - 1) // args.num_processes + + for i in range(args.num_processes): + start_idx = i * chunk_size + end_idx = min((i+1)*chunk_size, len(selected_data)) + data_slice = selected_data[start_idx:end_idx] + + if not data_slice: + continue + + gpu_id = gpu_ids[i % len(gpu_ids)] + p = Process( + target=conditioned_worker, + args=(i, gpu_id, args, data_slice) + ) + processes.append(p) + p.start() + + elif args.generation_type == 'unconditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + p = Process( + target=unconditioned_worker, + args=(i, gpu_id, args, samples) + ) + processes.append(p) + p.start() + elif args.generation_type == 'text-conditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + # Load prompts from file + prompt_name_list = [] + with open(args.prompt_file, 'r') as f: + for 
line in f: + if not line.strip(): + continue + prompt_data = json.loads(line.strip()) + prompt_text = prompt_data['caption'] + if prompt_data['test_set'] is True: + prompt_name_list.append(prompt_text) + print("length of prompt_name_list:", len(prompt_name_list)) + if len(prompt_name_list) >= args.num_samples: + print(f"Reached the limit of {args.num_samples} prompts.") + break + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + # Split prompt names across processes + start_idx = i * (len(prompt_name_list) // args.num_processes) + end_idx = (i + 1) * (len(prompt_name_list) // args.num_processes) + data_slice = prompt_name_list[start_idx:end_idx] + + p = Process( + target=text_conditioned_worker, + args=(i, gpu_id, args, samples, data_slice) + ) + processes.append(p) + p.start() + # Wait for all processes to complete + for p in processes: + p.join() + + except Exception as e: + print(f"Error in main process: {str(e)}") + for p in processes: + p.terminate() + raise + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SongEval/matrics.py b/SongEval/matrics.py new file mode 100644 index 0000000..80de842 --- /dev/null +++ b/SongEval/matrics.py @@ -0,0 +1,68 @@ +import argparse +import os +import shutil +import tempfile +import numpy as np +import torch +from audioldm_eval import EvaluationHelper, EvaluationHelperParallel +import torch.multiprocessing as mp + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--generation_path", type=str, required=True, help="Path to generated audio files") + parser.add_argument("--target_path", type=str, required=True, help="Path to reference audio files") + parser.add_argument("--force_paired", action="store_true", help="Force pairing by randomly selecting reference files") + parser.add_argument("--gpu_mode", choices=["single", "multi"], default="single", help="Evaluation mode") + parser.add_argument("--num_gpus", type=int, default=2, help="Number of GPUs for multi-GPU mode") + args = parser.parse_args() + + # Handle forced pairing + target_eval_path = args.target_path + temp_dir = None + if args.force_paired: + print(f"Using forced pairing with reference files from {args.target_path}") + temp_dir = tempfile.mkdtemp() + target_eval_path = temp_dir + + # Collect generated filenames + gen_files = [] + for root, _, files in os.walk(args.generation_path): + for file in files: + if file.endswith(".wav"): + gen_files.append(file) + print(f"Found {len(gen_files)} generated files in {args.generation_path}") + # Collect all reference files + ref_files = [] + for root, _, files in os.walk(args.target_path): + for file in files: + if file.endswith(".wav"): + ref_files.append(os.path.join(root, file)) + + # Select random references matching the count + selected_refs = np.random.choice(ref_files, len(gen_files), replace=False) + print(f"Selected {len(selected_refs)} reference files for evaluation.") + # Copy selected references to temp dir with generated filenames + for gen_file, ref_path in zip(gen_files, selected_refs): + shutil.copy(ref_path, os.path.join(temp_dir, gen_file)) + + + device = torch.device(f"cuda:{0}") if args.gpu_mode == "single" else None + + try: + if args.gpu_mode == "single": + print("Running single GPU evaluation...") + evaluator = EvaluationHelper(16000, device) + metrics = evaluator.main(args.generation_path, target_eval_path) + else: + print(f"Running multi-GPU evaluation on 
{args.num_gpus} GPUs...") + evaluator = EvaluationHelperParallel(16000, args.num_gpus) + metrics = evaluator.main(args.generation_path, target_eval_path) + print("Evaluation completed.") + + finally: + # Clean up temporary directory + if temp_dir and os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SongEval/model.py b/SongEval/model.py new file mode 100644 index 0000000..7dc8713 --- /dev/null +++ b/SongEval/model.py @@ -0,0 +1,66 @@ +from einops import rearrange +import numpy as np +import torch +import torch.nn as nn + + +class Generator(nn.Module): + + def __init__(self, + in_features, + ffd_hidden_size, + num_classes, + attn_layer_num, + + ): + super(Generator, self).__init__() + + self.attn = nn.ModuleList( + [ + nn.MultiheadAttention( + embed_dim=in_features, + num_heads=8, + dropout=0.2, + batch_first=True, + ) + for _ in range(attn_layer_num) + ] + ) + + self.ffd = nn.Sequential( + nn.Linear(in_features, ffd_hidden_size), + nn.ReLU(), + nn.Linear(ffd_hidden_size, in_features) + ) + + self.dropout = nn.Dropout(0.2) + + self.fc = nn.Linear(in_features * 2, num_classes) + + self.proj = nn.Tanh() + + + def forward(self, ssl_feature, judge_id=None): + ''' + ssl_feature: [B, T, D] + output: [B, num_classes] + ''' + + B, T, D = ssl_feature.shape + + ssl_feature = self.ffd(ssl_feature) + + tmp_ssl_feature = ssl_feature + + for attn in self.attn: + tmp_ssl_feature, _ = attn(tmp_ssl_feature, tmp_ssl_feature, tmp_ssl_feature) + + ssl_feature = self.dropout(torch.concat([torch.mean(tmp_ssl_feature, dim=1), torch.max(ssl_feature, dim=1)[0]], dim=1)) # B, 2D + + x = self.fc(ssl_feature) # B, num_classes + + x = self.proj(x) * 2.0 + 3 + + return x + + diff --git a/SongEval/requirements.txt b/SongEval/requirements.txt new file mode 100644 index 0000000..b7e5e7d --- /dev/null +++ b/SongEval/requirements.txt @@ -0,0 +1,4 @@ +librosa==0.11.0 +torch==2.7.0 +muq==0.1.0 +hydra-core==1.3.2 \ No newline at end of file diff --git a/SongEval/result.json b/SongEval/result.json new file mode 100644 index 0000000..6b6b1a9 --- /dev/null +++ b/SongEval/result.json @@ -0,0 +1,3397 @@ +{ + "3_95": { + "Coherence": 2.0777, + "Musicality": 2.0003, + "Memorability": 1.9263, + "Clarity": 2.0167, + "Naturalness": 1.9908 + }, + "1_67": { + "Coherence": 2.2713, + "Musicality": 2.1316, + "Memorability": 2.0851, + "Clarity": 2.011, + "Naturalness": 2.0983 + }, + "1_71": { + "Coherence": 1.9365, + "Musicality": 1.7492, + "Memorability": 1.8149, + "Clarity": 1.7948, + "Naturalness": 1.8447 + }, + "3_11": { + "Coherence": 2.2496, + "Musicality": 1.8815, + "Memorability": 2.0675, + "Clarity": 2.0203, + "Naturalness": 1.9953 + }, + "0_25": { + "Coherence": 2.4701, + "Musicality": 2.4306, + "Memorability": 2.3398, + "Clarity": 2.3215, + "Naturalness": 2.2693 + }, + "3_28": { + "Coherence": 2.1966, + "Musicality": 2.0181, + "Memorability": 2.0707, + "Clarity": 2.0365, + "Naturalness": 2.0673 + }, + "0_13": { + "Coherence": 1.8047, + "Musicality": 1.6372, + "Memorability": 1.6814, + "Clarity": 1.6422, + "Naturalness": 1.6898 + }, + "4_56": { + "Coherence": 2.0725, + "Musicality": 2.0063, + "Memorability": 2.0271, + "Clarity": 2.0444, + "Naturalness": 2.0402 + }, + "1_77": { + "Coherence": 2.0153, + "Musicality": 2.0462, + "Memorability": 1.9804, + "Clarity": 1.9786, + "Naturalness": 2.024 + }, + "2_75": { + "Coherence": 2.6489, + "Musicality": 2.4531, + "Memorability": 2.4532, + "Clarity": 2.4901, + "Naturalness": 2.3846 + }, + "3_16": { + 
"Coherence": 2.1249, + "Musicality": 1.9554, + "Memorability": 1.9534, + "Clarity": 1.935, + "Naturalness": 1.9855 + }, + "1_41": { + "Coherence": 2.4529, + "Musicality": 2.5158, + "Memorability": 2.3429, + "Clarity": 2.4205, + "Naturalness": 2.377 + }, + "3_76": { + "Coherence": 2.6071, + "Musicality": 2.5502, + "Memorability": 2.5093, + "Clarity": 2.3261, + "Naturalness": 2.3095 + }, + "3_25": { + "Coherence": 2.5939, + "Musicality": 2.7286, + "Memorability": 2.6561, + "Clarity": 2.4598, + "Naturalness": 2.4987 + }, + "0_63": { + "Coherence": 2.9322, + "Musicality": 2.7272, + "Memorability": 2.7328, + "Clarity": 2.7141, + "Naturalness": 2.8544 + }, + "3_97": { + "Coherence": 2.2356, + "Musicality": 2.1371, + "Memorability": 2.0707, + "Clarity": 2.1362, + "Naturalness": 2.2475 + }, + "3_67": { + "Coherence": 2.0389, + "Musicality": 1.9455, + "Memorability": 2.0118, + "Clarity": 1.82, + "Naturalness": 1.9762 + }, + "4_38": { + "Coherence": 2.4616, + "Musicality": 2.3333, + "Memorability": 2.173, + "Clarity": 2.4206, + "Naturalness": 2.2885 + }, + "1_5": { + "Coherence": 1.757, + "Musicality": 1.7005, + "Memorability": 1.8469, + "Clarity": 1.7476, + "Naturalness": 1.8144 + }, + "0_98": { + "Coherence": 2.539, + "Musicality": 2.4862, + "Memorability": 2.5355, + "Clarity": 2.4826, + "Naturalness": 2.5772 + }, + "4_6": { + "Coherence": 2.5418, + "Musicality": 2.3204, + "Memorability": 2.4124, + "Clarity": 2.2771, + "Naturalness": 2.3653 + }, + "1_75": { + "Coherence": 2.0407, + "Musicality": 2.0109, + "Memorability": 1.9676, + "Clarity": 1.917, + "Naturalness": 1.8748 + }, + "3_89": { + "Coherence": 2.3063, + "Musicality": 2.1369, + "Memorability": 2.1295, + "Clarity": 2.226, + "Naturalness": 2.2245 + }, + "2_54": { + "Coherence": 1.98, + "Musicality": 1.8938, + "Memorability": 1.8962, + "Clarity": 1.9052, + "Naturalness": 1.8847 + }, + "0_18": { + "Coherence": 2.068, + "Musicality": 1.9906, + "Memorability": 1.9896, + "Clarity": 1.9079, + "Naturalness": 1.9479 + }, + "2_88": { + "Coherence": 2.4045, + "Musicality": 2.2651, + "Memorability": 2.2801, + "Clarity": 2.2443, + "Naturalness": 2.1988 + }, + "1_23": { + "Coherence": 2.2784, + "Musicality": 2.1312, + "Memorability": 2.1006, + "Clarity": 2.2199, + "Naturalness": 2.2498 + }, + "4_92": { + "Coherence": 2.3807, + "Musicality": 2.1529, + "Memorability": 2.1955, + "Clarity": 2.0735, + "Naturalness": 2.1025 + }, + "0_49": { + "Coherence": 1.8054, + "Musicality": 1.6688, + "Memorability": 1.7004, + "Clarity": 1.6797, + "Naturalness": 1.7418 + }, + "2_90": { + "Coherence": 1.8965, + "Musicality": 1.7718, + "Memorability": 1.7087, + "Clarity": 1.743, + "Naturalness": 1.7686 + }, + "3_1": { + "Coherence": 2.5091, + "Musicality": 2.3787, + "Memorability": 2.309, + "Clarity": 2.3365, + "Naturalness": 2.5147 + }, + "2_50": { + "Coherence": 2.1869, + "Musicality": 2.3706, + "Memorability": 2.1068, + "Clarity": 2.0057, + "Naturalness": 2.1438 + }, + "3_70": { + "Coherence": 2.3004, + "Musicality": 2.3632, + "Memorability": 2.1409, + "Clarity": 2.125, + "Naturalness": 2.1482 + }, + "0_37": { + "Coherence": 2.5766, + "Musicality": 2.5612, + "Memorability": 2.3669, + "Clarity": 2.3994, + "Naturalness": 2.3825 + }, + "1_12": { + "Coherence": 1.1409, + "Musicality": 1.1304, + "Memorability": 1.1616, + "Clarity": 1.122, + "Naturalness": 1.1375 + }, + "0_86": { + "Coherence": 2.5979, + "Musicality": 2.5692, + "Memorability": 2.3593, + "Clarity": 2.4961, + "Naturalness": 2.5166 + }, + "4_99": { + "Coherence": 2.4895, + "Musicality": 2.391, + "Memorability": 
2.3179, + "Clarity": 2.4036, + "Naturalness": 2.3048 + }, + "2_77": { + "Coherence": 2.1994, + "Musicality": 2.0788, + "Memorability": 2.0804, + "Clarity": 1.8985, + "Naturalness": 1.9426 + }, + "2_3": { + "Coherence": 1.8327, + "Musicality": 1.878, + "Memorability": 1.727, + "Clarity": 1.8213, + "Naturalness": 1.7748 + }, + "0_62": { + "Coherence": 1.7433, + "Musicality": 1.6047, + "Memorability": 1.7248, + "Clarity": 1.7229, + "Naturalness": 1.7643 + }, + "2_42": { + "Coherence": 2.5783, + "Musicality": 2.4933, + "Memorability": 2.352, + "Clarity": 2.3632, + "Naturalness": 2.5246 + }, + "0_60": { + "Coherence": 2.4633, + "Musicality": 2.2678, + "Memorability": 2.3175, + "Clarity": 2.3106, + "Naturalness": 2.3029 + }, + "3_99": { + "Coherence": 2.3416, + "Musicality": 2.2106, + "Memorability": 1.9726, + "Clarity": 2.0669, + "Naturalness": 2.3347 + }, + "3_80": { + "Coherence": 2.2788, + "Musicality": 2.3309, + "Memorability": 2.1834, + "Clarity": 2.2162, + "Naturalness": 2.2746 + }, + "2_58": { + "Coherence": 2.2654, + "Musicality": 2.1978, + "Memorability": 1.9416, + "Clarity": 2.1375, + "Naturalness": 2.0765 + }, + "0_46": { + "Coherence": 2.014, + "Musicality": 2.0622, + "Memorability": 2.0347, + "Clarity": 1.947, + "Naturalness": 2.0524 + }, + "1_38": { + "Coherence": 2.3565, + "Musicality": 2.2671, + "Memorability": 2.2796, + "Clarity": 2.1579, + "Naturalness": 2.1791 + }, + "1_98": { + "Coherence": 2.2469, + "Musicality": 2.1249, + "Memorability": 1.9923, + "Clarity": 2.0832, + "Naturalness": 2.1617 + }, + "0_5": { + "Coherence": 1.9986, + "Musicality": 1.8451, + "Memorability": 1.8903, + "Clarity": 1.8648, + "Naturalness": 2.0043 + }, + "0_61": { + "Coherence": 2.0186, + "Musicality": 1.9662, + "Memorability": 1.7885, + "Clarity": 1.8509, + "Naturalness": 1.8734 + }, + "4_68": { + "Coherence": 1.3889, + "Musicality": 1.4006, + "Memorability": 1.4449, + "Clarity": 1.3977, + "Naturalness": 1.3989 + }, + "3_52": { + "Coherence": 2.2957, + "Musicality": 2.302, + "Memorability": 2.3302, + "Clarity": 2.1986, + "Naturalness": 2.3071 + }, + "0_30": { + "Coherence": 2.4437, + "Musicality": 2.3724, + "Memorability": 2.2421, + "Clarity": 2.2268, + "Naturalness": 2.3019 + }, + "4_65": { + "Coherence": 2.0702, + "Musicality": 1.8917, + "Memorability": 1.9231, + "Clarity": 1.8066, + "Naturalness": 1.8056 + }, + "4_78": { + "Coherence": 2.5228, + "Musicality": 2.5301, + "Memorability": 2.2555, + "Clarity": 2.3036, + "Naturalness": 2.3016 + }, + "3_41": { + "Coherence": 2.3243, + "Musicality": 2.0625, + "Memorability": 2.2123, + "Clarity": 2.013, + "Naturalness": 2.1384 + }, + "3_4": { + "Coherence": 2.1711, + "Musicality": 2.1947, + "Memorability": 2.0476, + "Clarity": 2.0618, + "Naturalness": 2.1072 + }, + "0_34": { + "Coherence": 2.4183, + "Musicality": 2.2883, + "Memorability": 2.0825, + "Clarity": 2.1587, + "Naturalness": 2.3078 + }, + "2_67": { + "Coherence": 2.3483, + "Musicality": 2.1847, + "Memorability": 2.09, + "Clarity": 2.1366, + "Naturalness": 2.1259 + }, + "3_96": { + "Coherence": 2.1818, + "Musicality": 2.2395, + "Memorability": 2.2308, + "Clarity": 2.0452, + "Naturalness": 2.1293 + }, + "0_66": { + "Coherence": 2.0316, + "Musicality": 1.8951, + "Memorability": 2.0299, + "Clarity": 1.8193, + "Naturalness": 1.9089 + }, + "4_53": { + "Coherence": 2.3561, + "Musicality": 2.4071, + "Memorability": 2.2317, + "Clarity": 2.336, + "Naturalness": 2.3277 + }, + "2_23": { + "Coherence": 2.3174, + "Musicality": 2.3013, + "Memorability": 2.3749, + "Clarity": 2.3659, + "Naturalness": 2.2551 + }, 
+ "2_28": { + "Coherence": 2.1358, + "Musicality": 2.0423, + "Memorability": 1.9649, + "Clarity": 1.9885, + "Naturalness": 2.0853 + }, + "2_29": { + "Coherence": 2.0729, + "Musicality": 2.0636, + "Memorability": 2.1252, + "Clarity": 1.9903, + "Naturalness": 1.9401 + }, + "4_21": { + "Coherence": 2.3339, + "Musicality": 2.0501, + "Memorability": 1.9084, + "Clarity": 1.9564, + "Naturalness": 1.9006 + }, + "3_46": { + "Coherence": 2.0809, + "Musicality": 2.0113, + "Memorability": 1.9666, + "Clarity": 1.8907, + "Naturalness": 2.0141 + }, + "0_77": { + "Coherence": 2.0897, + "Musicality": 2.098, + "Memorability": 2.2421, + "Clarity": 2.1986, + "Naturalness": 2.0956 + }, + "4_86": { + "Coherence": 2.491, + "Musicality": 2.5162, + "Memorability": 2.32, + "Clarity": 2.3499, + "Naturalness": 2.4308 + }, + "2_4": { + "Coherence": 1.8087, + "Musicality": 1.7177, + "Memorability": 1.5451, + "Clarity": 1.7168, + "Naturalness": 1.6058 + }, + "0_88": { + "Coherence": 1.779, + "Musicality": 1.8651, + "Memorability": 1.7381, + "Clarity": 1.7405, + "Naturalness": 1.7942 + }, + "3_79": { + "Coherence": 2.4717, + "Musicality": 2.2998, + "Memorability": 2.382, + "Clarity": 2.2195, + "Naturalness": 2.1466 + }, + "3_77": { + "Coherence": 2.1888, + "Musicality": 2.0327, + "Memorability": 2.0769, + "Clarity": 1.9536, + "Naturalness": 2.2245 + }, + "1_74": { + "Coherence": 2.3644, + "Musicality": 2.3894, + "Memorability": 2.2264, + "Clarity": 2.2723, + "Naturalness": 2.3437 + }, + "0_75": { + "Coherence": 2.0146, + "Musicality": 1.9708, + "Memorability": 1.9617, + "Clarity": 1.8868, + "Naturalness": 1.9818 + }, + "1_28": { + "Coherence": 2.281, + "Musicality": 2.2385, + "Memorability": 2.1841, + "Clarity": 2.1268, + "Naturalness": 2.1962 + }, + "2_76": { + "Coherence": 1.9583, + "Musicality": 1.6395, + "Memorability": 1.7161, + "Clarity": 1.7739, + "Naturalness": 1.6589 + }, + "2_7": { + "Coherence": 1.7494, + "Musicality": 1.7288, + "Memorability": 1.6475, + "Clarity": 1.696, + "Naturalness": 1.6172 + }, + "2_63": { + "Coherence": 2.142, + "Musicality": 2.1304, + "Memorability": 1.8713, + "Clarity": 1.9539, + "Naturalness": 1.9687 + }, + "1_55": { + "Coherence": 2.454, + "Musicality": 2.5627, + "Memorability": 2.3505, + "Clarity": 2.2649, + "Naturalness": 2.3556 + }, + "1_57": { + "Coherence": 2.1894, + "Musicality": 1.944, + "Memorability": 2.0374, + "Clarity": 1.8814, + "Naturalness": 1.9284 + }, + "4_94": { + "Coherence": 1.839, + "Musicality": 1.7626, + "Memorability": 1.6898, + "Clarity": 1.6964, + "Naturalness": 1.734 + }, + "3_24": { + "Coherence": 2.2606, + "Musicality": 2.1091, + "Memorability": 2.0483, + "Clarity": 2.1631, + "Naturalness": 2.1043 + }, + "2_15": { + "Coherence": 2.2367, + "Musicality": 2.2431, + "Memorability": 2.1904, + "Clarity": 2.1406, + "Naturalness": 2.1539 + }, + "0_8": { + "Coherence": 2.4336, + "Musicality": 2.4236, + "Memorability": 2.5433, + "Clarity": 2.5246, + "Naturalness": 2.4657 + }, + "3_14": { + "Coherence": 1.9237, + "Musicality": 1.856, + "Memorability": 1.8864, + "Clarity": 1.7281, + "Naturalness": 1.8647 + }, + "1_88": { + "Coherence": 2.1056, + "Musicality": 2.0166, + "Memorability": 2.0529, + "Clarity": 1.9988, + "Naturalness": 2.0217 + }, + "1_8": { + "Coherence": 2.2008, + "Musicality": 1.8821, + "Memorability": 2.0604, + "Clarity": 2.0161, + "Naturalness": 2.0145 + }, + "1_0": { + "Coherence": 2.3727, + "Musicality": 2.4407, + "Memorability": 2.1786, + "Clarity": 2.2065, + "Naturalness": 2.172 + }, + "3_92": { + "Coherence": 2.3062, + "Musicality": 2.3895, + 
"Memorability": 2.282, + "Clarity": 2.1578, + "Naturalness": 2.1791 + }, + "0_73": { + "Coherence": 2.1344, + "Musicality": 2.0625, + "Memorability": 1.8351, + "Clarity": 1.9275, + "Naturalness": 1.8729 + }, + "4_81": { + "Coherence": 2.0691, + "Musicality": 1.9451, + "Memorability": 2.1633, + "Clarity": 2.1333, + "Naturalness": 2.0668 + }, + "2_92": { + "Coherence": 2.4402, + "Musicality": 2.29, + "Memorability": 2.3331, + "Clarity": 2.1726, + "Naturalness": 2.1892 + }, + "1_66": { + "Coherence": 2.7032, + "Musicality": 2.6471, + "Memorability": 2.4789, + "Clarity": 2.4478, + "Naturalness": 2.5586 + }, + "3_44": { + "Coherence": 2.1894, + "Musicality": 2.0205, + "Memorability": 2.0975, + "Clarity": 1.9897, + "Naturalness": 2.0595 + }, + "0_4": { + "Coherence": 2.6016, + "Musicality": 2.4601, + "Memorability": 2.3464, + "Clarity": 2.3192, + "Naturalness": 2.2662 + }, + "2_93": { + "Coherence": 1.9954, + "Musicality": 1.9995, + "Memorability": 1.9576, + "Clarity": 1.9181, + "Naturalness": 2.053 + }, + "0_64": { + "Coherence": 2.4691, + "Musicality": 2.6417, + "Memorability": 2.2994, + "Clarity": 2.2818, + "Naturalness": 2.4011 + }, + "3_40": { + "Coherence": 2.6494, + "Musicality": 2.5188, + "Memorability": 2.5081, + "Clarity": 2.5678, + "Naturalness": 2.4542 + }, + "4_29": { + "Coherence": 1.7034, + "Musicality": 1.7253, + "Memorability": 1.5533, + "Clarity": 1.588, + "Naturalness": 1.6467 + }, + "3_74": { + "Coherence": 2.3365, + "Musicality": 2.3321, + "Memorability": 2.1928, + "Clarity": 2.2858, + "Naturalness": 2.1803 + }, + "2_71": { + "Coherence": 1.7911, + "Musicality": 1.7773, + "Memorability": 1.785, + "Clarity": 1.6606, + "Naturalness": 1.7868 + }, + "2_45": { + "Coherence": 2.0612, + "Musicality": 2.2141, + "Memorability": 1.912, + "Clarity": 2.0512, + "Naturalness": 1.991 + }, + "1_58": { + "Coherence": 2.5347, + "Musicality": 2.4564, + "Memorability": 2.3709, + "Clarity": 2.3363, + "Naturalness": 2.4027 + }, + "2_24": { + "Coherence": 2.1706, + "Musicality": 2.073, + "Memorability": 2.0641, + "Clarity": 1.9986, + "Naturalness": 2.1331 + }, + "2_44": { + "Coherence": 2.4707, + "Musicality": 2.2782, + "Memorability": 2.2546, + "Clarity": 2.2037, + "Naturalness": 2.2995 + }, + "3_73": { + "Coherence": 2.2706, + "Musicality": 2.2547, + "Memorability": 2.0179, + "Clarity": 2.2202, + "Naturalness": 2.1244 + }, + "1_72": { + "Coherence": 1.5911, + "Musicality": 1.5279, + "Memorability": 1.6521, + "Clarity": 1.5473, + "Naturalness": 1.5907 + }, + "0_94": { + "Coherence": 1.8602, + "Musicality": 1.8602, + "Memorability": 1.7409, + "Clarity": 1.8094, + "Naturalness": 1.854 + }, + "3_0": { + "Coherence": 1.6971, + "Musicality": 1.5703, + "Memorability": 1.5548, + "Clarity": 1.5854, + "Naturalness": 1.6097 + }, + "0_95": { + "Coherence": 1.8323, + "Musicality": 1.8864, + "Memorability": 1.8162, + "Clarity": 1.8239, + "Naturalness": 1.8678 + }, + "1_20": { + "Coherence": 1.8756, + "Musicality": 1.9785, + "Memorability": 1.947, + "Clarity": 1.8313, + "Naturalness": 1.9069 + }, + "1_80": { + "Coherence": 1.9406, + "Musicality": 1.9605, + "Memorability": 1.928, + "Clarity": 1.9513, + "Naturalness": 2.054 + }, + "4_62": { + "Coherence": 1.5723, + "Musicality": 1.4671, + "Memorability": 1.4701, + "Clarity": 1.4424, + "Naturalness": 1.5156 + }, + "2_78": { + "Coherence": 2.4861, + "Musicality": 2.5491, + "Memorability": 2.3015, + "Clarity": 2.3086, + "Naturalness": 2.32 + }, + "0_65": { + "Coherence": 2.232, + "Musicality": 2.1086, + "Memorability": 2.2506, + "Clarity": 2.0765, + "Naturalness": 
2.0796 + }, + "4_66": { + "Coherence": 2.3562, + "Musicality": 2.3646, + "Memorability": 2.3862, + "Clarity": 2.2813, + "Naturalness": 2.2441 + }, + "4_40": { + "Coherence": 2.1639, + "Musicality": 2.0781, + "Memorability": 2.1157, + "Clarity": 2.1316, + "Naturalness": 2.1229 + }, + "4_93": { + "Coherence": 2.2189, + "Musicality": 2.2275, + "Memorability": 2.0691, + "Clarity": 2.1112, + "Naturalness": 2.0928 + }, + "3_88": { + "Coherence": 2.3956, + "Musicality": 2.3006, + "Memorability": 2.0355, + "Clarity": 2.1448, + "Naturalness": 2.1649 + }, + "0_15": { + "Coherence": 2.2185, + "Musicality": 2.2944, + "Memorability": 2.0519, + "Clarity": 2.1389, + "Naturalness": 2.0885 + }, + "1_96": { + "Coherence": 2.3422, + "Musicality": 2.3445, + "Memorability": 2.2867, + "Clarity": 2.1522, + "Naturalness": 2.1611 + }, + "0_74": { + "Coherence": 2.0854, + "Musicality": 2.0287, + "Memorability": 1.9169, + "Clarity": 1.9461, + "Naturalness": 2.1022 + }, + "0_54": { + "Coherence": 1.568, + "Musicality": 1.5066, + "Memorability": 1.5506, + "Clarity": 1.4892, + "Naturalness": 1.5473 + }, + "2_97": { + "Coherence": 2.0816, + "Musicality": 2.1353, + "Memorability": 1.9688, + "Clarity": 1.9818, + "Naturalness": 1.9441 + }, + "2_74": { + "Coherence": 1.9713, + "Musicality": 2.1465, + "Memorability": 1.9864, + "Clarity": 2.0631, + "Naturalness": 2.0106 + }, + "0_27": { + "Coherence": 2.8013, + "Musicality": 2.7117, + "Memorability": 2.4925, + "Clarity": 2.6482, + "Naturalness": 2.6713 + }, + "0_33": { + "Coherence": 2.3914, + "Musicality": 2.432, + "Memorability": 2.2417, + "Clarity": 2.4649, + "Naturalness": 2.4498 + }, + "1_2": { + "Coherence": 2.2006, + "Musicality": 2.1381, + "Memorability": 2.0668, + "Clarity": 2.0994, + "Naturalness": 2.1751 + }, + "4_5": { + "Coherence": 2.2028, + "Musicality": 2.2548, + "Memorability": 2.094, + "Clarity": 2.104, + "Naturalness": 2.0769 + }, + "1_82": { + "Coherence": 2.1139, + "Musicality": 2.0486, + "Memorability": 1.9668, + "Clarity": 2.0287, + "Naturalness": 1.983 + }, + "0_38": { + "Coherence": 2.2971, + "Musicality": 2.4286, + "Memorability": 2.334, + "Clarity": 2.2721, + "Naturalness": 2.3503 + }, + "4_59": { + "Coherence": 2.5923, + "Musicality": 2.5951, + "Memorability": 2.6329, + "Clarity": 2.5983, + "Naturalness": 2.5336 + }, + "3_71": { + "Coherence": 2.3849, + "Musicality": 2.0994, + "Memorability": 2.1678, + "Clarity": 2.1002, + "Naturalness": 2.1303 + }, + "2_35": { + "Coherence": 2.2882, + "Musicality": 2.1923, + "Memorability": 2.1158, + "Clarity": 2.0651, + "Naturalness": 2.2521 + }, + "3_86": { + "Coherence": 2.5946, + "Musicality": 2.5866, + "Memorability": 2.3881, + "Clarity": 2.4527, + "Naturalness": 2.3642 + }, + "2_14": { + "Coherence": 2.5263, + "Musicality": 2.3986, + "Memorability": 2.2717, + "Clarity": 2.2873, + "Naturalness": 2.3597 + }, + "3_42": { + "Coherence": 1.9662, + "Musicality": 1.9749, + "Memorability": 1.9184, + "Clarity": 1.8202, + "Naturalness": 1.9593 + }, + "3_48": { + "Coherence": 1.8617, + "Musicality": 1.8093, + "Memorability": 1.8166, + "Clarity": 1.8295, + "Naturalness": 1.8602 + }, + "3_31": { + "Coherence": 2.4623, + "Musicality": 2.4417, + "Memorability": 2.1803, + "Clarity": 2.2882, + "Naturalness": 2.2492 + }, + "1_32": { + "Coherence": 2.6004, + "Musicality": 2.5097, + "Memorability": 2.3734, + "Clarity": 2.3893, + "Naturalness": 2.4 + }, + "4_7": { + "Coherence": 2.297, + "Musicality": 2.3344, + "Memorability": 2.3356, + "Clarity": 2.1336, + "Naturalness": 2.2263 + }, + "3_93": { + "Coherence": 2.2957, + 
"Musicality": 2.1905, + "Memorability": 2.1554, + "Clarity": 2.0281, + "Naturalness": 2.207 + }, + "4_42": { + "Coherence": 2.2719, + "Musicality": 2.252, + "Memorability": 2.0813, + "Clarity": 2.1453, + "Naturalness": 2.1588 + }, + "0_91": { + "Coherence": 2.141, + "Musicality": 2.0821, + "Memorability": 2.0982, + "Clarity": 1.983, + "Naturalness": 1.9613 + }, + "3_27": { + "Coherence": 1.8279, + "Musicality": 1.7477, + "Memorability": 1.7982, + "Clarity": 1.6296, + "Naturalness": 1.7693 + }, + "3_54": { + "Coherence": 2.3305, + "Musicality": 2.3382, + "Memorability": 2.4071, + "Clarity": 2.2438, + "Naturalness": 2.2249 + }, + "1_24": { + "Coherence": 2.185, + "Musicality": 2.36, + "Memorability": 2.2286, + "Clarity": 2.1736, + "Naturalness": 2.1503 + }, + "0_14": { + "Coherence": 2.1015, + "Musicality": 2.1473, + "Memorability": 2.031, + "Clarity": 2.0198, + "Naturalness": 2.0559 + }, + "3_2": { + "Coherence": 2.0178, + "Musicality": 1.9007, + "Memorability": 2.0232, + "Clarity": 1.9892, + "Naturalness": 2.0074 + }, + "3_10": { + "Coherence": 2.3047, + "Musicality": 2.365, + "Memorability": 2.3817, + "Clarity": 2.208, + "Naturalness": 2.2809 + }, + "3_91": { + "Coherence": 2.2553, + "Musicality": 2.2049, + "Memorability": 2.2052, + "Clarity": 2.0175, + "Naturalness": 2.1619 + }, + "0_2": { + "Coherence": 2.1873, + "Musicality": 2.1483, + "Memorability": 1.9162, + "Clarity": 2.0543, + "Naturalness": 2.1214 + }, + "4_3": { + "Coherence": 2.4972, + "Musicality": 2.3255, + "Memorability": 2.3075, + "Clarity": 2.2674, + "Naturalness": 2.4217 + }, + "0_28": { + "Coherence": 2.4347, + "Musicality": 2.4768, + "Memorability": 2.2624, + "Clarity": 2.2847, + "Naturalness": 2.3786 + }, + "3_5": { + "Coherence": 1.989, + "Musicality": 1.9532, + "Memorability": 1.8144, + "Clarity": 1.8353, + "Naturalness": 1.9482 + }, + "3_84": { + "Coherence": 2.2007, + "Musicality": 2.0882, + "Memorability": 1.9826, + "Clarity": 1.8819, + "Naturalness": 1.9807 + }, + "4_18": { + "Coherence": 2.2287, + "Musicality": 2.1163, + "Memorability": 2.0292, + "Clarity": 2.0164, + "Naturalness": 2.0719 + }, + "3_49": { + "Coherence": 2.2148, + "Musicality": 1.9555, + "Memorability": 1.8934, + "Clarity": 1.9539, + "Naturalness": 2.0419 + }, + "4_70": { + "Coherence": 1.9702, + "Musicality": 2.0574, + "Memorability": 1.9396, + "Clarity": 1.8764, + "Naturalness": 1.8685 + }, + "0_68": { + "Coherence": 1.9775, + "Musicality": 1.9256, + "Memorability": 1.9359, + "Clarity": 1.9169, + "Naturalness": 1.9153 + }, + "2_11": { + "Coherence": 2.1669, + "Musicality": 2.02, + "Memorability": 2.0331, + "Clarity": 2.0185, + "Naturalness": 2.1166 + }, + "4_49": { + "Coherence": 2.595, + "Musicality": 2.3318, + "Memorability": 2.038, + "Clarity": 2.3474, + "Naturalness": 2.2379 + }, + "1_93": { + "Coherence": 2.309, + "Musicality": 2.175, + "Memorability": 2.2298, + "Clarity": 2.0778, + "Naturalness": 2.1573 + }, + "0_89": { + "Coherence": 2.6534, + "Musicality": 2.697, + "Memorability": 2.5988, + "Clarity": 2.448, + "Naturalness": 2.3883 + }, + "3_66": { + "Coherence": 2.1432, + "Musicality": 2.1136, + "Memorability": 2.125, + "Clarity": 2.0783, + "Naturalness": 2.3627 + }, + "0_21": { + "Coherence": 2.5321, + "Musicality": 2.4141, + "Memorability": 2.3183, + "Clarity": 2.261, + "Naturalness": 2.2971 + }, + "0_1": { + "Coherence": 1.5446, + "Musicality": 1.5321, + "Memorability": 1.5399, + "Clarity": 1.471, + "Naturalness": 1.4825 + }, + "2_79": { + "Coherence": 2.5557, + "Musicality": 2.5329, + "Memorability": 2.52, + "Clarity": 2.489, + 
"Naturalness": 2.6073 + }, + "4_22": { + "Coherence": 1.8591, + "Musicality": 1.9253, + "Memorability": 1.8709, + "Clarity": 1.8276, + "Naturalness": 1.8137 + }, + "1_56": { + "Coherence": 2.3548, + "Musicality": 2.3963, + "Memorability": 2.2703, + "Clarity": 2.327, + "Naturalness": 2.2915 + }, + "1_39": { + "Coherence": 1.9871, + "Musicality": 1.689, + "Memorability": 1.92, + "Clarity": 1.7833, + "Naturalness": 1.8184 + }, + "1_86": { + "Coherence": 2.2606, + "Musicality": 2.2865, + "Memorability": 2.2581, + "Clarity": 2.3054, + "Naturalness": 2.2113 + }, + "0_85": { + "Coherence": 1.9186, + "Musicality": 1.9081, + "Memorability": 1.9889, + "Clarity": 1.7794, + "Naturalness": 1.8494 + }, + "3_61": { + "Coherence": 1.9655, + "Musicality": 1.9839, + "Memorability": 1.8955, + "Clarity": 1.9394, + "Naturalness": 1.9518 + }, + "0_92": { + "Coherence": 2.2782, + "Musicality": 2.4049, + "Memorability": 2.2771, + "Clarity": 2.1731, + "Naturalness": 2.2203 + }, + "0_16": { + "Coherence": 1.9612, + "Musicality": 2.0146, + "Memorability": 1.7677, + "Clarity": 1.8591, + "Naturalness": 2.0399 + }, + "3_22": { + "Coherence": 2.1536, + "Musicality": 2.0755, + "Memorability": 1.9017, + "Clarity": 1.9414, + "Naturalness": 2.0436 + }, + "0_79": { + "Coherence": 2.1513, + "Musicality": 2.1437, + "Memorability": 1.9734, + "Clarity": 1.9043, + "Naturalness": 2.0233 + }, + "0_71": { + "Coherence": 2.4485, + "Musicality": 2.3952, + "Memorability": 2.3545, + "Clarity": 2.3959, + "Naturalness": 2.3662 + }, + "3_26": { + "Coherence": 2.0978, + "Musicality": 2.1113, + "Memorability": 2.0101, + "Clarity": 2.0855, + "Naturalness": 2.0965 + }, + "0_47": { + "Coherence": 2.2464, + "Musicality": 2.3119, + "Memorability": 2.4551, + "Clarity": 2.1364, + "Naturalness": 2.1958 + }, + "0_70": { + "Coherence": 2.1887, + "Musicality": 2.0891, + "Memorability": 2.1789, + "Clarity": 1.8599, + "Naturalness": 1.9961 + }, + "2_82": { + "Coherence": 2.0149, + "Musicality": 1.9833, + "Memorability": 1.984, + "Clarity": 1.8742, + "Naturalness": 1.9466 + }, + "1_37": { + "Coherence": 2.2924, + "Musicality": 2.2435, + "Memorability": 2.2556, + "Clarity": 2.1273, + "Naturalness": 2.2784 + }, + "3_32": { + "Coherence": 2.3229, + "Musicality": 2.2298, + "Memorability": 2.1514, + "Clarity": 2.1442, + "Naturalness": 2.1044 + }, + "2_40": { + "Coherence": 2.2995, + "Musicality": 2.3528, + "Memorability": 2.179, + "Clarity": 2.1627, + "Naturalness": 2.294 + }, + "3_15": { + "Coherence": 1.7409, + "Musicality": 1.6818, + "Memorability": 1.4963, + "Clarity": 1.6333, + "Naturalness": 1.6013 + }, + "1_42": { + "Coherence": 2.6129, + "Musicality": 2.5839, + "Memorability": 2.4966, + "Clarity": 2.429, + "Naturalness": 2.3687 + }, + "0_51": { + "Coherence": 2.143, + "Musicality": 2.2888, + "Memorability": 2.1488, + "Clarity": 2.1636, + "Naturalness": 2.1598 + }, + "1_16": { + "Coherence": 2.2393, + "Musicality": 2.247, + "Memorability": 2.2046, + "Clarity": 2.248, + "Naturalness": 2.2468 + }, + "2_49": { + "Coherence": 2.0881, + "Musicality": 2.2131, + "Memorability": 1.8822, + "Clarity": 2.0395, + "Naturalness": 1.9794 + }, + "1_84": { + "Coherence": 2.359, + "Musicality": 2.2389, + "Memorability": 2.2024, + "Clarity": 2.2048, + "Naturalness": 2.2222 + }, + "0_32": { + "Coherence": 2.3477, + "Musicality": 2.2993, + "Memorability": 2.1484, + "Clarity": 2.2419, + "Naturalness": 2.2541 + }, + "2_68": { + "Coherence": 2.3435, + "Musicality": 2.4809, + "Memorability": 2.3231, + "Clarity": 2.1379, + "Naturalness": 2.2375 + }, + "2_31": { + "Coherence": 
2.2846, + "Musicality": 2.1809, + "Memorability": 2.2819, + "Clarity": 2.137, + "Naturalness": 2.2179 + }, + "2_6": { + "Coherence": 2.4279, + "Musicality": 2.2806, + "Memorability": 2.0665, + "Clarity": 2.2548, + "Naturalness": 2.1858 + }, + "4_34": { + "Coherence": 2.433, + "Musicality": 2.3738, + "Memorability": 2.3242, + "Clarity": 2.3203, + "Naturalness": 2.2842 + }, + "3_20": { + "Coherence": 2.4709, + "Musicality": 2.2068, + "Memorability": 2.3481, + "Clarity": 2.1515, + "Naturalness": 2.1738 + }, + "2_26": { + "Coherence": 2.761, + "Musicality": 2.9227, + "Memorability": 2.764, + "Clarity": 2.6298, + "Naturalness": 2.6233 + }, + "1_68": { + "Coherence": 2.5299, + "Musicality": 2.5349, + "Memorability": 2.3781, + "Clarity": 2.246, + "Naturalness": 2.4304 + }, + "2_25": { + "Coherence": 2.1776, + "Musicality": 2.0596, + "Memorability": 2.1202, + "Clarity": 1.9972, + "Naturalness": 2.0727 + }, + "1_69": { + "Coherence": 1.5309, + "Musicality": 1.461, + "Memorability": 1.6637, + "Clarity": 1.4651, + "Naturalness": 1.6969 + }, + "3_69": { + "Coherence": 2.1159, + "Musicality": 1.9865, + "Memorability": 1.8409, + "Clarity": 1.944, + "Naturalness": 1.8649 + }, + "4_19": { + "Coherence": 2.3554, + "Musicality": 2.29, + "Memorability": 2.2956, + "Clarity": 2.237, + "Naturalness": 2.3469 + }, + "4_4": { + "Coherence": 1.9869, + "Musicality": 1.7675, + "Memorability": 1.8982, + "Clarity": 1.7451, + "Naturalness": 1.7462 + }, + "4_57": { + "Coherence": 2.4678, + "Musicality": 2.5522, + "Memorability": 2.335, + "Clarity": 2.4827, + "Naturalness": 2.4377 + }, + "4_75": { + "Coherence": 2.1265, + "Musicality": 2.2157, + "Memorability": 2.1694, + "Clarity": 2.1359, + "Naturalness": 2.2256 + }, + "0_72": { + "Coherence": 2.3192, + "Musicality": 2.1476, + "Memorability": 2.0943, + "Clarity": 2.1586, + "Naturalness": 2.1182 + }, + "2_55": { + "Coherence": 1.8826, + "Musicality": 1.7765, + "Memorability": 1.8529, + "Clarity": 1.8804, + "Naturalness": 1.8853 + }, + "2_18": { + "Coherence": 2.302, + "Musicality": 2.2897, + "Memorability": 2.3549, + "Clarity": 2.112, + "Naturalness": 2.2913 + }, + "3_65": { + "Coherence": 2.3258, + "Musicality": 2.1564, + "Memorability": 2.199, + "Clarity": 2.1968, + "Naturalness": 2.177 + }, + "3_63": { + "Coherence": 2.8712, + "Musicality": 2.8692, + "Memorability": 2.8522, + "Clarity": 2.7607, + "Naturalness": 2.6322 + }, + "4_25": { + "Coherence": 2.312, + "Musicality": 2.2323, + "Memorability": 2.0557, + "Clarity": 2.1357, + "Naturalness": 2.2097 + }, + "4_52": { + "Coherence": 1.9228, + "Musicality": 1.8919, + "Memorability": 1.8349, + "Clarity": 1.8561, + "Naturalness": 1.9062 + }, + "4_1": { + "Coherence": 2.1892, + "Musicality": 2.1107, + "Memorability": 2.15, + "Clarity": 2.1533, + "Naturalness": 2.1818 + }, + "2_94": { + "Coherence": 2.7352, + "Musicality": 2.63, + "Memorability": 2.3537, + "Clarity": 2.4471, + "Naturalness": 2.5075 + }, + "2_32": { + "Coherence": 2.0399, + "Musicality": 2.0722, + "Memorability": 2.1334, + "Clarity": 1.9989, + "Naturalness": 2.1709 + }, + "2_20": { + "Coherence": 2.4868, + "Musicality": 2.4854, + "Memorability": 2.3528, + "Clarity": 2.4241, + "Naturalness": 2.4396 + }, + "4_16": { + "Coherence": 2.2808, + "Musicality": 2.1759, + "Memorability": 2.1629, + "Clarity": 2.0831, + "Naturalness": 2.2275 + }, + "4_61": { + "Coherence": 1.8089, + "Musicality": 1.8357, + "Memorability": 1.8118, + "Clarity": 1.783, + "Naturalness": 1.7906 + }, + "2_33": { + "Coherence": 2.2052, + "Musicality": 2.0665, + "Memorability": 2.0094, + 
"Clarity": 2.102, + "Naturalness": 2.0444 + }, + "2_10": { + "Coherence": 2.2482, + "Musicality": 2.1705, + "Memorability": 1.9102, + "Clarity": 2.0577, + "Naturalness": 2.0466 + }, + "2_0": { + "Coherence": 1.8268, + "Musicality": 1.7328, + "Memorability": 1.7469, + "Clarity": 1.6844, + "Naturalness": 1.8355 + }, + "2_83": { + "Coherence": 2.4584, + "Musicality": 2.3891, + "Memorability": 2.2064, + "Clarity": 2.244, + "Naturalness": 2.3245 + }, + "2_16": { + "Coherence": 2.1736, + "Musicality": 1.9615, + "Memorability": 1.9975, + "Clarity": 1.9478, + "Naturalness": 1.9718 + }, + "0_31": { + "Coherence": 2.7925, + "Musicality": 2.5672, + "Memorability": 2.793, + "Clarity": 2.6695, + "Naturalness": 2.7267 + }, + "4_32": { + "Coherence": 2.2045, + "Musicality": 2.0841, + "Memorability": 2.108, + "Clarity": 2.0678, + "Naturalness": 2.0925 + }, + "1_76": { + "Coherence": 1.927, + "Musicality": 1.9184, + "Memorability": 2.0669, + "Clarity": 1.9465, + "Naturalness": 1.9366 + }, + "1_43": { + "Coherence": 2.3344, + "Musicality": 2.4682, + "Memorability": 2.3157, + "Clarity": 2.3336, + "Naturalness": 2.2014 + }, + "1_54": { + "Coherence": 1.9669, + "Musicality": 1.9557, + "Memorability": 1.9318, + "Clarity": 1.8269, + "Naturalness": 1.8148 + }, + "4_43": { + "Coherence": 2.5014, + "Musicality": 2.5679, + "Memorability": 2.2708, + "Clarity": 2.4154, + "Naturalness": 2.4212 + }, + "4_77": { + "Coherence": 2.0359, + "Musicality": 2.0298, + "Memorability": 1.7116, + "Clarity": 1.8409, + "Naturalness": 1.8259 + }, + "0_58": { + "Coherence": 1.9356, + "Musicality": 1.947, + "Memorability": 1.9151, + "Clarity": 1.8816, + "Naturalness": 1.954 + }, + "2_36": { + "Coherence": 2.3709, + "Musicality": 2.3627, + "Memorability": 2.3401, + "Clarity": 2.2308, + "Naturalness": 2.2637 + }, + "1_65": { + "Coherence": 2.4607, + "Musicality": 2.4541, + "Memorability": 2.1764, + "Clarity": 2.235, + "Naturalness": 2.2667 + }, + "0_26": { + "Coherence": 2.2933, + "Musicality": 2.1993, + "Memorability": 2.1394, + "Clarity": 2.1676, + "Naturalness": 2.1097 + }, + "2_65": { + "Coherence": 2.0945, + "Musicality": 2.0679, + "Memorability": 1.8917, + "Clarity": 2.0363, + "Naturalness": 1.9987 + }, + "2_52": { + "Coherence": 2.2121, + "Musicality": 2.3237, + "Memorability": 2.2339, + "Clarity": 2.2382, + "Naturalness": 2.2416 + }, + "0_93": { + "Coherence": 2.1196, + "Musicality": 1.9865, + "Memorability": 1.9946, + "Clarity": 1.9056, + "Naturalness": 1.9408 + }, + "4_63": { + "Coherence": 2.5936, + "Musicality": 2.4613, + "Memorability": 2.4061, + "Clarity": 2.37, + "Naturalness": 2.403 + }, + "0_78": { + "Coherence": 2.1615, + "Musicality": 1.9972, + "Memorability": 1.9241, + "Clarity": 2.0307, + "Naturalness": 2.0021 + }, + "4_89": { + "Coherence": 2.0578, + "Musicality": 2.1219, + "Memorability": 1.988, + "Clarity": 1.9906, + "Naturalness": 1.9254 + }, + "4_76": { + "Coherence": 2.2024, + "Musicality": 2.2634, + "Memorability": 2.2442, + "Clarity": 2.2597, + "Naturalness": 2.2427 + }, + "1_62": { + "Coherence": 2.4221, + "Musicality": 2.2359, + "Memorability": 2.1999, + "Clarity": 2.2348, + "Naturalness": 2.1607 + }, + "3_30": { + "Coherence": 2.1491, + "Musicality": 2.1072, + "Memorability": 2.0558, + "Clarity": 2.0153, + "Naturalness": 1.9973 + }, + "4_0": { + "Coherence": 1.716, + "Musicality": 1.7595, + "Memorability": 1.5804, + "Clarity": 1.6338, + "Naturalness": 1.6699 + }, + "2_57": { + "Coherence": 1.6038, + "Musicality": 1.6037, + "Memorability": 1.5694, + "Clarity": 1.5594, + "Naturalness": 1.5852 + }, + "0_44": { 
+ "Coherence": 2.1459, + "Musicality": 2.0758, + "Memorability": 1.8967, + "Clarity": 1.932, + "Naturalness": 1.9503 + }, + "3_72": { + "Coherence": 2.4176, + "Musicality": 2.4026, + "Memorability": 2.3219, + "Clarity": 2.2086, + "Naturalness": 2.2489 + }, + "1_59": { + "Coherence": 2.2601, + "Musicality": 2.1113, + "Memorability": 1.9935, + "Clarity": 1.9936, + "Naturalness": 2.0203 + }, + "1_92": { + "Coherence": 1.9006, + "Musicality": 1.8884, + "Memorability": 1.9651, + "Clarity": 1.8136, + "Naturalness": 1.7773 + }, + "1_6": { + "Coherence": 1.5059, + "Musicality": 1.4867, + "Memorability": 1.5298, + "Clarity": 1.4985, + "Naturalness": 1.5145 + }, + "3_43": { + "Coherence": 2.8295, + "Musicality": 2.7372, + "Memorability": 2.6963, + "Clarity": 2.6524, + "Naturalness": 2.5917 + }, + "1_85": { + "Coherence": 2.1031, + "Musicality": 1.9526, + "Memorability": 2.0542, + "Clarity": 1.9236, + "Naturalness": 2.0189 + }, + "4_79": { + "Coherence": 1.7147, + "Musicality": 1.6407, + "Memorability": 1.6568, + "Clarity": 1.6287, + "Naturalness": 1.6046 + }, + "4_69": { + "Coherence": 2.4221, + "Musicality": 2.2589, + "Memorability": 2.4544, + "Clarity": 2.2849, + "Naturalness": 2.3705 + }, + "1_97": { + "Coherence": 1.9869, + "Musicality": 1.9427, + "Memorability": 1.7226, + "Clarity": 1.872, + "Naturalness": 1.8986 + }, + "1_11": { + "Coherence": 2.2454, + "Musicality": 2.2705, + "Memorability": 1.9677, + "Clarity": 2.1494, + "Naturalness": 2.1263 + }, + "0_12": { + "Coherence": 2.1644, + "Musicality": 2.1354, + "Memorability": 2.0599, + "Clarity": 2.0154, + "Naturalness": 2.0486 + }, + "0_43": { + "Coherence": 2.3703, + "Musicality": 2.3369, + "Memorability": 2.0079, + "Clarity": 2.1595, + "Naturalness": 2.2462 + }, + "1_10": { + "Coherence": 2.8041, + "Musicality": 2.828, + "Memorability": 2.6825, + "Clarity": 2.6604, + "Naturalness": 2.5805 + }, + "3_45": { + "Coherence": 1.494, + "Musicality": 1.4763, + "Memorability": 1.5155, + "Clarity": 1.4437, + "Naturalness": 1.4177 + }, + "0_50": { + "Coherence": 1.952, + "Musicality": 2.0015, + "Memorability": 1.9009, + "Clarity": 1.954, + "Naturalness": 2.0498 + }, + "1_45": { + "Coherence": 2.0283, + "Musicality": 2.0581, + "Memorability": 1.9296, + "Clarity": 1.8813, + "Naturalness": 1.9802 + }, + "1_78": { + "Coherence": 1.4743, + "Musicality": 1.479, + "Memorability": 1.4332, + "Clarity": 1.4293, + "Naturalness": 1.4072 + }, + "2_62": { + "Coherence": 2.1068, + "Musicality": 1.9854, + "Memorability": 2.1011, + "Clarity": 2.0366, + "Naturalness": 2.1211 + }, + "0_67": { + "Coherence": 2.5199, + "Musicality": 2.6554, + "Memorability": 2.3816, + "Clarity": 2.4124, + "Naturalness": 2.482 + }, + "4_23": { + "Coherence": 2.1429, + "Musicality": 2.1081, + "Memorability": 2.0469, + "Clarity": 1.9779, + "Naturalness": 2.1301 + }, + "1_21": { + "Coherence": 1.6967, + "Musicality": 1.546, + "Memorability": 1.6842, + "Clarity": 1.5955, + "Naturalness": 1.6175 + }, + "2_91": { + "Coherence": 1.7161, + "Musicality": 1.647, + "Memorability": 1.6533, + "Clarity": 1.6559, + "Naturalness": 1.5782 + }, + "0_87": { + "Coherence": 2.4759, + "Musicality": 2.3796, + "Memorability": 2.2335, + "Clarity": 2.2174, + "Naturalness": 2.3454 + }, + "2_73": { + "Coherence": 1.9303, + "Musicality": 1.9323, + "Memorability": 1.8936, + "Clarity": 1.7804, + "Naturalness": 1.8337 + }, + "4_84": { + "Coherence": 2.2965, + "Musicality": 2.2682, + "Memorability": 2.0356, + "Clarity": 2.1209, + "Naturalness": 2.1255 + }, + "0_81": { + "Coherence": 2.505, + "Musicality": 2.2662, + 
"Memorability": 2.3091, + "Clarity": 2.3909, + "Naturalness": 2.386 + }, + "0_41": { + "Coherence": 2.4277, + "Musicality": 2.459, + "Memorability": 2.4047, + "Clarity": 2.3466, + "Naturalness": 2.2622 + }, + "4_55": { + "Coherence": 2.0315, + "Musicality": 2.0147, + "Memorability": 1.9808, + "Clarity": 1.9455, + "Naturalness": 2.0539 + }, + "2_48": { + "Coherence": 1.5202, + "Musicality": 1.4762, + "Memorability": 1.5298, + "Clarity": 1.554, + "Naturalness": 1.6199 + }, + "1_51": { + "Coherence": 1.4029, + "Musicality": 1.4056, + "Memorability": 1.391, + "Clarity": 1.4451, + "Naturalness": 1.4142 + }, + "0_11": { + "Coherence": 2.2915, + "Musicality": 2.386, + "Memorability": 2.2595, + "Clarity": 2.2164, + "Naturalness": 2.328 + }, + "1_91": { + "Coherence": 2.2548, + "Musicality": 2.3364, + "Memorability": 2.2506, + "Clarity": 2.141, + "Naturalness": 2.2084 + }, + "2_80": { + "Coherence": 2.2379, + "Musicality": 2.3012, + "Memorability": 2.3898, + "Clarity": 2.1524, + "Naturalness": 2.1655 + }, + "0_19": { + "Coherence": 2.104, + "Musicality": 2.045, + "Memorability": 2.0121, + "Clarity": 1.947, + "Naturalness": 2.0452 + }, + "1_64": { + "Coherence": 2.2254, + "Musicality": 2.242, + "Memorability": 1.9814, + "Clarity": 2.1224, + "Naturalness": 2.099 + }, + "3_55": { + "Coherence": 2.62, + "Musicality": 2.5255, + "Memorability": 2.39, + "Clarity": 2.398, + "Naturalness": 2.4008 + }, + "3_37": { + "Coherence": 1.7342, + "Musicality": 1.7382, + "Memorability": 1.7473, + "Clarity": 1.6751, + "Naturalness": 1.7243 + }, + "1_9": { + "Coherence": 1.9233, + "Musicality": 1.8755, + "Memorability": 1.8503, + "Clarity": 1.8165, + "Naturalness": 1.9132 + }, + "0_39": { + "Coherence": 2.3557, + "Musicality": 2.1258, + "Memorability": 2.1873, + "Clarity": 2.1679, + "Naturalness": 2.228 + }, + "2_39": { + "Coherence": 2.0775, + "Musicality": 2.1469, + "Memorability": 1.9039, + "Clarity": 2.1632, + "Naturalness": 2.0126 + }, + "2_69": { + "Coherence": 2.8159, + "Musicality": 2.7529, + "Memorability": 2.5734, + "Clarity": 2.5901, + "Naturalness": 2.5003 + }, + "2_81": { + "Coherence": 1.7624, + "Musicality": 1.7138, + "Memorability": 1.6759, + "Clarity": 1.714, + "Naturalness": 1.6316 + }, + "3_58": { + "Coherence": 1.9916, + "Musicality": 1.8992, + "Memorability": 1.992, + "Clarity": 1.9266, + "Naturalness": 1.9128 + }, + "1_99": { + "Coherence": 1.945, + "Musicality": 2.1036, + "Memorability": 1.8971, + "Clarity": 1.8253, + "Naturalness": 1.8982 + }, + "0_99": { + "Coherence": 2.4951, + "Musicality": 2.3151, + "Memorability": 2.4236, + "Clarity": 2.4955, + "Naturalness": 2.4378 + }, + "3_17": { + "Coherence": 2.5045, + "Musicality": 2.3734, + "Memorability": 2.5493, + "Clarity": 2.4323, + "Naturalness": 2.4147 + }, + "3_50": { + "Coherence": 2.6205, + "Musicality": 2.6439, + "Memorability": 2.4681, + "Clarity": 2.6855, + "Naturalness": 2.5049 + }, + "4_91": { + "Coherence": 2.3077, + "Musicality": 2.3176, + "Memorability": 2.1234, + "Clarity": 2.1738, + "Naturalness": 2.3045 + }, + "3_56": { + "Coherence": 2.1082, + "Musicality": 2.011, + "Memorability": 2.0182, + "Clarity": 2.016, + "Naturalness": 2.1074 + }, + "3_82": { + "Coherence": 1.7938, + "Musicality": 1.9296, + "Memorability": 1.9261, + "Clarity": 1.7735, + "Naturalness": 1.9263 + }, + "4_17": { + "Coherence": 2.0995, + "Musicality": 2.0639, + "Memorability": 1.8537, + "Clarity": 1.9828, + "Naturalness": 1.9461 + }, + "4_72": { + "Coherence": 2.6618, + "Musicality": 2.4627, + "Memorability": 2.4604, + "Clarity": 2.3689, + "Naturalness": 2.3129 
+ }, + "4_10": { + "Coherence": 1.8977, + "Musicality": 1.8834, + "Memorability": 1.9509, + "Clarity": 1.8282, + "Naturalness": 1.899 + }, + "4_74": { + "Coherence": 1.8133, + "Musicality": 1.6628, + "Memorability": 1.6591, + "Clarity": 1.621, + "Naturalness": 1.857 + }, + "0_7": { + "Coherence": 2.3334, + "Musicality": 2.1764, + "Memorability": 2.4455, + "Clarity": 2.2542, + "Naturalness": 2.2119 + }, + "0_53": { + "Coherence": 1.9901, + "Musicality": 1.9633, + "Memorability": 1.9243, + "Clarity": 1.8607, + "Naturalness": 1.8191 + }, + "3_83": { + "Coherence": 2.1562, + "Musicality": 2.2071, + "Memorability": 1.9081, + "Clarity": 2.1533, + "Naturalness": 1.9599 + }, + "3_35": { + "Coherence": 2.0654, + "Musicality": 2.0104, + "Memorability": 1.9849, + "Clarity": 1.9511, + "Naturalness": 2.0346 + }, + "4_88": { + "Coherence": 2.2395, + "Musicality": 2.1779, + "Memorability": 2.1249, + "Clarity": 2.1083, + "Naturalness": 2.2168 + }, + "1_52": { + "Coherence": 1.9105, + "Musicality": 1.905, + "Memorability": 1.7679, + "Clarity": 1.8393, + "Naturalness": 1.8354 + }, + "1_95": { + "Coherence": 2.1638, + "Musicality": 2.106, + "Memorability": 2.2122, + "Clarity": 2.1135, + "Naturalness": 2.0643 + }, + "1_1": { + "Coherence": 1.8786, + "Musicality": 1.8373, + "Memorability": 1.8086, + "Clarity": 1.746, + "Naturalness": 1.9081 + }, + "3_51": { + "Coherence": 2.0104, + "Musicality": 2.0662, + "Memorability": 1.987, + "Clarity": 1.8889, + "Naturalness": 1.9966 + }, + "4_73": { + "Coherence": 2.2588, + "Musicality": 2.0573, + "Memorability": 2.3054, + "Clarity": 2.1202, + "Naturalness": 2.1212 + }, + "4_39": { + "Coherence": 2.1785, + "Musicality": 2.0144, + "Memorability": 1.9765, + "Clarity": 2.0063, + "Naturalness": 2.0919 + }, + "2_85": { + "Coherence": 2.2436, + "Musicality": 2.2585, + "Memorability": 2.1735, + "Clarity": 2.1191, + "Naturalness": 2.1095 + }, + "0_57": { + "Coherence": 2.2371, + "Musicality": 2.0397, + "Memorability": 2.1769, + "Clarity": 2.0651, + "Naturalness": 2.0733 + }, + "2_60": { + "Coherence": 1.8551, + "Musicality": 1.9177, + "Memorability": 2.0625, + "Clarity": 1.9154, + "Naturalness": 1.943 + }, + "4_90": { + "Coherence": 2.0694, + "Musicality": 2.1923, + "Memorability": 2.1006, + "Clarity": 1.8264, + "Naturalness": 1.9136 + }, + "1_53": { + "Coherence": 2.0543, + "Musicality": 2.0252, + "Memorability": 1.8597, + "Clarity": 1.9347, + "Naturalness": 1.908 + }, + "4_20": { + "Coherence": 2.5224, + "Musicality": 2.4856, + "Memorability": 2.3748, + "Clarity": 2.3196, + "Naturalness": 2.3066 + }, + "3_90": { + "Coherence": 2.6885, + "Musicality": 2.7295, + "Memorability": 2.3706, + "Clarity": 2.5435, + "Naturalness": 2.53 + }, + "0_82": { + "Coherence": 2.0804, + "Musicality": 1.9735, + "Memorability": 2.0409, + "Clarity": 2.0917, + "Naturalness": 2.0347 + }, + "2_95": { + "Coherence": 2.3756, + "Musicality": 2.4778, + "Memorability": 2.3212, + "Clarity": 2.2432, + "Naturalness": 2.3635 + }, + "0_40": { + "Coherence": 2.0068, + "Musicality": 1.9907, + "Memorability": 2.1615, + "Clarity": 2.0915, + "Naturalness": 2.0913 + }, + "1_87": { + "Coherence": 2.3678, + "Musicality": 2.2955, + "Memorability": 2.3364, + "Clarity": 2.5113, + "Naturalness": 2.2153 + }, + "3_33": { + "Coherence": 2.6268, + "Musicality": 2.71, + "Memorability": 2.6308, + "Clarity": 2.494, + "Naturalness": 2.5557 + }, + "4_30": { + "Coherence": 2.3005, + "Musicality": 2.1241, + "Memorability": 2.1483, + "Clarity": 2.1645, + "Naturalness": 2.1296 + }, + "3_3": { + "Coherence": 2.3968, + "Musicality": 
2.142, + "Memorability": 2.127, + "Clarity": 2.1015, + "Naturalness": 2.1594 + }, + "3_36": { + "Coherence": 1.8789, + "Musicality": 1.9, + "Memorability": 1.9269, + "Clarity": 1.9324, + "Naturalness": 1.9133 + }, + "4_80": { + "Coherence": 1.9391, + "Musicality": 1.7574, + "Memorability": 1.7275, + "Clarity": 1.7538, + "Naturalness": 1.7564 + }, + "2_51": { + "Coherence": 2.3966, + "Musicality": 2.1859, + "Memorability": 2.1455, + "Clarity": 2.0988, + "Naturalness": 2.1257 + }, + "4_58": { + "Coherence": 2.1579, + "Musicality": 2.1347, + "Memorability": 2.0173, + "Clarity": 2.0455, + "Naturalness": 2.1905 + }, + "1_34": { + "Coherence": 2.3832, + "Musicality": 2.1407, + "Memorability": 2.2762, + "Clarity": 2.0385, + "Naturalness": 2.055 + }, + "0_36": { + "Coherence": 1.6852, + "Musicality": 1.6678, + "Memorability": 1.4983, + "Clarity": 1.5262, + "Naturalness": 1.5535 + }, + "1_49": { + "Coherence": 2.0158, + "Musicality": 2.041, + "Memorability": 1.825, + "Clarity": 1.9561, + "Naturalness": 1.996 + }, + "3_29": { + "Coherence": 2.5665, + "Musicality": 2.4987, + "Memorability": 2.4091, + "Clarity": 2.3353, + "Naturalness": 2.318 + }, + "2_38": { + "Coherence": 2.1383, + "Musicality": 2.2452, + "Memorability": 2.0626, + "Clarity": 1.9986, + "Naturalness": 2.0202 + }, + "3_8": { + "Coherence": 1.8991, + "Musicality": 1.8809, + "Memorability": 1.7575, + "Clarity": 1.8112, + "Naturalness": 1.8553 + }, + "1_70": { + "Coherence": 2.684, + "Musicality": 2.4419, + "Memorability": 2.4613, + "Clarity": 2.4446, + "Naturalness": 2.5297 + }, + "0_24": { + "Coherence": 2.11, + "Musicality": 1.9138, + "Memorability": 1.8638, + "Clarity": 1.9175, + "Naturalness": 2.0126 + }, + "0_83": { + "Coherence": 2.1425, + "Musicality": 1.9797, + "Memorability": 2.0508, + "Clarity": 1.917, + "Naturalness": 1.9763 + }, + "0_90": { + "Coherence": 2.1196, + "Musicality": 2.2561, + "Memorability": 2.0771, + "Clarity": 2.202, + "Naturalness": 2.1228 + }, + "2_43": { + "Coherence": 2.287, + "Musicality": 2.2181, + "Memorability": 2.1297, + "Clarity": 2.1293, + "Naturalness": 2.1854 + }, + "4_11": { + "Coherence": 1.6293, + "Musicality": 1.6067, + "Memorability": 1.4963, + "Clarity": 1.5133, + "Naturalness": 1.5644 + }, + "2_5": { + "Coherence": 2.2671, + "Musicality": 2.1869, + "Memorability": 2.0988, + "Clarity": 2.0693, + "Naturalness": 2.0278 + }, + "2_22": { + "Coherence": 2.1326, + "Musicality": 1.9773, + "Memorability": 2.0836, + "Clarity": 1.9685, + "Naturalness": 1.9751 + }, + "1_15": { + "Coherence": 2.6055, + "Musicality": 2.61, + "Memorability": 2.2391, + "Clarity": 2.3558, + "Naturalness": 2.3996 + }, + "2_66": { + "Coherence": 2.0179, + "Musicality": 2.0401, + "Memorability": 1.9842, + "Clarity": 1.9368, + "Naturalness": 2.0828 + }, + "1_83": { + "Coherence": 2.0647, + "Musicality": 1.9912, + "Memorability": 2.007, + "Clarity": 2.0035, + "Naturalness": 1.8946 + }, + "4_31": { + "Coherence": 1.7548, + "Musicality": 1.7506, + "Memorability": 1.7613, + "Clarity": 1.7323, + "Naturalness": 1.813 + }, + "0_35": { + "Coherence": 2.406, + "Musicality": 2.4684, + "Memorability": 2.3046, + "Clarity": 2.2827, + "Naturalness": 2.2514 + }, + "4_46": { + "Coherence": 1.735, + "Musicality": 1.5574, + "Memorability": 1.6575, + "Clarity": 1.594, + "Naturalness": 1.6847 + }, + "4_82": { + "Coherence": 2.0916, + "Musicality": 1.9722, + "Memorability": 1.9405, + "Clarity": 1.9748, + "Naturalness": 1.9839 + }, + "4_60": { + "Coherence": 1.9311, + "Musicality": 1.8396, + "Memorability": 1.8892, + "Clarity": 1.8481, + 
"Naturalness": 1.8399 + }, + "3_57": { + "Coherence": 2.308, + "Musicality": 2.2293, + "Memorability": 2.1903, + "Clarity": 2.029, + "Naturalness": 2.1156 + }, + "1_46": { + "Coherence": 2.6872, + "Musicality": 2.7337, + "Memorability": 2.4985, + "Clarity": 2.6359, + "Naturalness": 2.7108 + }, + "0_52": { + "Coherence": 2.5576, + "Musicality": 2.5189, + "Memorability": 2.2952, + "Clarity": 2.3614, + "Naturalness": 2.3503 + }, + "2_70": { + "Coherence": 1.947, + "Musicality": 1.8895, + "Memorability": 1.8951, + "Clarity": 1.8664, + "Naturalness": 2.0238 + }, + "3_12": { + "Coherence": 1.6705, + "Musicality": 1.6687, + "Memorability": 1.6058, + "Clarity": 1.6089, + "Naturalness": 1.7315 + }, + "1_13": { + "Coherence": 1.9944, + "Musicality": 1.8802, + "Memorability": 1.8915, + "Clarity": 1.8086, + "Naturalness": 1.9355 + }, + "0_69": { + "Coherence": 2.1448, + "Musicality": 2.2876, + "Memorability": 2.3752, + "Clarity": 2.1484, + "Naturalness": 2.1355 + }, + "2_96": { + "Coherence": 2.1742, + "Musicality": 2.0379, + "Memorability": 2.1654, + "Clarity": 1.967, + "Naturalness": 2.1188 + }, + "1_29": { + "Coherence": 2.0899, + "Musicality": 2.0926, + "Memorability": 1.955, + "Clarity": 1.8637, + "Naturalness": 2.1318 + }, + "0_23": { + "Coherence": 2.3541, + "Musicality": 2.3899, + "Memorability": 2.3275, + "Clarity": 2.2506, + "Naturalness": 2.3021 + }, + "3_75": { + "Coherence": 1.7439, + "Musicality": 1.7416, + "Memorability": 1.745, + "Clarity": 1.7009, + "Naturalness": 1.8192 + }, + "0_22": { + "Coherence": 2.2646, + "Musicality": 2.322, + "Memorability": 2.1101, + "Clarity": 2.2849, + "Naturalness": 2.2111 + }, + "4_14": { + "Coherence": 2.007, + "Musicality": 1.6769, + "Memorability": 1.7567, + "Clarity": 1.773, + "Naturalness": 1.8146 + }, + "4_83": { + "Coherence": 1.8591, + "Musicality": 1.8238, + "Memorability": 1.797, + "Clarity": 1.7371, + "Naturalness": 1.8122 + }, + "4_87": { + "Coherence": 1.5046, + "Musicality": 1.468, + "Memorability": 1.5036, + "Clarity": 1.4899, + "Naturalness": 1.54 + }, + "1_22": { + "Coherence": 2.0131, + "Musicality": 1.9433, + "Memorability": 2.026, + "Clarity": 1.9211, + "Naturalness": 1.9316 + }, + "0_9": { + "Coherence": 1.8929, + "Musicality": 2.0008, + "Memorability": 1.9504, + "Clarity": 1.9416, + "Naturalness": 1.881 + }, + "4_8": { + "Coherence": 2.1362, + "Musicality": 1.8801, + "Memorability": 1.8276, + "Clarity": 1.824, + "Naturalness": 1.8764 + }, + "4_35": { + "Coherence": 2.5386, + "Musicality": 2.6107, + "Memorability": 2.4252, + "Clarity": 2.4234, + "Naturalness": 2.5142 + }, + "0_56": { + "Coherence": 2.1514, + "Musicality": 2.223, + "Memorability": 2.1618, + "Clarity": 2.0111, + "Naturalness": 2.0403 + }, + "2_46": { + "Coherence": 2.1267, + "Musicality": 2.2684, + "Memorability": 2.014, + "Clarity": 2.0737, + "Naturalness": 2.1822 + }, + "0_42": { + "Coherence": 2.2165, + "Musicality": 1.9245, + "Memorability": 2.0194, + "Clarity": 2.0125, + "Naturalness": 2.1712 + }, + "0_17": { + "Coherence": 1.9866, + "Musicality": 2.2045, + "Memorability": 1.8497, + "Clarity": 2.0364, + "Naturalness": 2.1013 + }, + "2_84": { + "Coherence": 2.0135, + "Musicality": 1.8381, + "Memorability": 1.9097, + "Clarity": 1.8395, + "Naturalness": 1.9822 + }, + "3_23": { + "Coherence": 1.627, + "Musicality": 1.8459, + "Memorability": 1.6816, + "Clarity": 1.6436, + "Naturalness": 1.7349 + }, + "2_27": { + "Coherence": 1.8048, + "Musicality": 1.8331, + "Memorability": 1.8633, + "Clarity": 1.8186, + "Naturalness": 1.9587 + }, + "4_13": { + "Coherence": 2.0373, + 
"Musicality": 2.0325, + "Memorability": 1.8568, + "Clarity": 1.956, + "Naturalness": 1.9767 + }, + "3_21": { + "Coherence": 2.0607, + "Musicality": 1.9899, + "Memorability": 2.0849, + "Clarity": 1.9912, + "Naturalness": 2.1108 + }, + "2_17": { + "Coherence": 2.2321, + "Musicality": 1.8576, + "Memorability": 1.8561, + "Clarity": 1.9274, + "Naturalness": 1.9511 + }, + "1_31": { + "Coherence": 2.4058, + "Musicality": 2.363, + "Memorability": 2.3252, + "Clarity": 2.2802, + "Naturalness": 2.3214 + }, + "2_99": { + "Coherence": 2.199, + "Musicality": 2.2032, + "Memorability": 2.181, + "Clarity": 2.2392, + "Naturalness": 2.1879 + }, + "1_19": { + "Coherence": 1.76, + "Musicality": 1.7322, + "Memorability": 1.7219, + "Clarity": 1.7162, + "Naturalness": 1.7226 + }, + "1_35": { + "Coherence": 2.8663, + "Musicality": 2.8794, + "Memorability": 2.8437, + "Clarity": 2.82, + "Naturalness": 2.7089 + }, + "4_15": { + "Coherence": 1.5884, + "Musicality": 1.6912, + "Memorability": 1.5345, + "Clarity": 1.5451, + "Naturalness": 1.5748 + }, + "4_98": { + "Coherence": 2.5377, + "Musicality": 2.4452, + "Memorability": 2.3973, + "Clarity": 2.4145, + "Naturalness": 2.3078 + }, + "4_97": { + "Coherence": 2.0159, + "Musicality": 1.9679, + "Memorability": 1.8909, + "Clarity": 1.8561, + "Naturalness": 1.9376 + }, + "3_62": { + "Coherence": 2.2726, + "Musicality": 2.1243, + "Memorability": 1.9979, + "Clarity": 2.0129, + "Naturalness": 2.169 + }, + "3_47": { + "Coherence": 1.9621, + "Musicality": 1.8414, + "Memorability": 1.7679, + "Clarity": 1.7602, + "Naturalness": 1.8394 + }, + "4_44": { + "Coherence": 1.9389, + "Musicality": 2.0183, + "Memorability": 1.9027, + "Clarity": 1.7985, + "Naturalness": 1.8496 + }, + "4_26": { + "Coherence": 2.3804, + "Musicality": 2.2318, + "Memorability": 2.2971, + "Clarity": 2.3846, + "Naturalness": 2.3424 + }, + "0_96": { + "Coherence": 2.1998, + "Musicality": 2.0293, + "Memorability": 2.1546, + "Clarity": 1.9797, + "Naturalness": 2.0439 + }, + "2_19": { + "Coherence": 2.6319, + "Musicality": 2.6034, + "Memorability": 2.674, + "Clarity": 2.4835, + "Naturalness": 2.5341 + }, + "3_34": { + "Coherence": 2.1468, + "Musicality": 2.0713, + "Memorability": 2.0913, + "Clarity": 2.0615, + "Naturalness": 2.1338 + }, + "3_53": { + "Coherence": 2.134, + "Musicality": 2.1984, + "Memorability": 2.235, + "Clarity": 2.1576, + "Naturalness": 2.1941 + }, + "1_4": { + "Coherence": 2.3963, + "Musicality": 2.1941, + "Memorability": 2.2128, + "Clarity": 2.2107, + "Naturalness": 2.1844 + }, + "4_96": { + "Coherence": 2.2618, + "Musicality": 2.232, + "Memorability": 2.2834, + "Clarity": 2.1766, + "Naturalness": 2.175 + }, + "4_67": { + "Coherence": 2.1472, + "Musicality": 2.2276, + "Memorability": 1.9249, + "Clarity": 2.1189, + "Naturalness": 2.0585 + }, + "3_18": { + "Coherence": 2.2926, + "Musicality": 2.0919, + "Memorability": 1.9636, + "Clarity": 2.1313, + "Naturalness": 2.0226 + }, + "2_34": { + "Coherence": 1.8336, + "Musicality": 1.7908, + "Memorability": 1.8167, + "Clarity": 1.8019, + "Naturalness": 1.7783 + }, + "1_33": { + "Coherence": 1.9092, + "Musicality": 1.8481, + "Memorability": 1.899, + "Clarity": 1.8472, + "Naturalness": 1.8423 + }, + "1_14": { + "Coherence": 2.0287, + "Musicality": 1.6931, + "Memorability": 1.8084, + "Clarity": 1.8911, + "Naturalness": 1.8841 + }, + "4_45": { + "Coherence": 2.2605, + "Musicality": 2.3191, + "Memorability": 1.9416, + "Clarity": 2.2267, + "Naturalness": 2.0908 + }, + "4_51": { + "Coherence": 2.3854, + "Musicality": 2.3819, + "Memorability": 2.3657, + "Clarity": 
2.2707, + "Naturalness": 2.356 + }, + "0_20": { + "Coherence": 2.4207, + "Musicality": 2.1803, + "Memorability": 2.3646, + "Clarity": 2.3664, + "Naturalness": 2.2901 + }, + "1_90": { + "Coherence": 2.1604, + "Musicality": 1.9937, + "Memorability": 1.9726, + "Clarity": 2.0761, + "Naturalness": 2.0193 + }, + "3_6": { + "Coherence": 2.0842, + "Musicality": 2.0169, + "Memorability": 1.9929, + "Clarity": 1.9499, + "Naturalness": 1.9548 + }, + "3_68": { + "Coherence": 2.1774, + "Musicality": 2.1836, + "Memorability": 2.2703, + "Clarity": 2.0859, + "Naturalness": 2.1572 + }, + "4_64": { + "Coherence": 2.2637, + "Musicality": 2.1571, + "Memorability": 2.1312, + "Clarity": 2.1039, + "Naturalness": 2.1308 + }, + "1_7": { + "Coherence": 2.127, + "Musicality": 2.1189, + "Memorability": 2.101, + "Clarity": 2.0108, + "Naturalness": 2.0466 + }, + "2_56": { + "Coherence": 2.255, + "Musicality": 2.2398, + "Memorability": 2.0856, + "Clarity": 2.2421, + "Naturalness": 2.0982 + }, + "2_64": { + "Coherence": 2.3961, + "Musicality": 2.3965, + "Memorability": 2.3291, + "Clarity": 2.2876, + "Naturalness": 2.3993 + }, + "4_2": { + "Coherence": 2.0556, + "Musicality": 2.0101, + "Memorability": 1.9861, + "Clarity": 1.9554, + "Naturalness": 1.8409 + }, + "0_48": { + "Coherence": 1.6236, + "Musicality": 1.5744, + "Memorability": 1.613, + "Clarity": 1.654, + "Naturalness": 1.5972 + }, + "4_33": { + "Coherence": 2.3151, + "Musicality": 2.311, + "Memorability": 2.0714, + "Clarity": 2.1852, + "Naturalness": 2.2574 + }, + "3_59": { + "Coherence": 2.2179, + "Musicality": 2.1151, + "Memorability": 2.1836, + "Clarity": 2.1124, + "Naturalness": 2.1609 + }, + "1_30": { + "Coherence": 2.3185, + "Musicality": 2.2906, + "Memorability": 2.2103, + "Clarity": 2.1885, + "Naturalness": 2.3091 + }, + "2_87": { + "Coherence": 2.2864, + "Musicality": 2.0823, + "Memorability": 2.132, + "Clarity": 2.1362, + "Naturalness": 2.1958 + }, + "1_27": { + "Coherence": 2.2529, + "Musicality": 2.2283, + "Memorability": 2.1236, + "Clarity": 2.2069, + "Naturalness": 2.1388 + }, + "1_50": { + "Coherence": 2.3244, + "Musicality": 2.3636, + "Memorability": 2.2035, + "Clarity": 2.2259, + "Naturalness": 2.2104 + }, + "4_54": { + "Coherence": 1.5802, + "Musicality": 1.6693, + "Memorability": 1.7025, + "Clarity": 1.6213, + "Naturalness": 1.7422 + }, + "3_38": { + "Coherence": 2.149, + "Musicality": 1.9559, + "Memorability": 1.8837, + "Clarity": 2.0031, + "Naturalness": 2.1194 + }, + "3_60": { + "Coherence": 2.1493, + "Musicality": 2.1146, + "Memorability": 1.8566, + "Clarity": 2.0343, + "Naturalness": 2.0217 + }, + "0_80": { + "Coherence": 2.2452, + "Musicality": 2.0877, + "Memorability": 1.9421, + "Clarity": 2.0476, + "Naturalness": 2.0674 + }, + "0_10": { + "Coherence": 2.3701, + "Musicality": 2.1845, + "Memorability": 2.1567, + "Clarity": 2.0152, + "Naturalness": 2.2248 + }, + "4_37": { + "Coherence": 1.8832, + "Musicality": 1.7516, + "Memorability": 1.8802, + "Clarity": 1.9111, + "Naturalness": 1.9214 + }, + "2_47": { + "Coherence": 2.2542, + "Musicality": 2.4492, + "Memorability": 2.208, + "Clarity": 2.2574, + "Naturalness": 2.2519 + }, + "0_84": { + "Coherence": 2.0714, + "Musicality": 1.9126, + "Memorability": 2.0429, + "Clarity": 1.961, + "Naturalness": 2.0466 + }, + "2_98": { + "Coherence": 1.8055, + "Musicality": 1.8217, + "Memorability": 1.7747, + "Clarity": 1.6343, + "Naturalness": 1.6715 + }, + "1_26": { + "Coherence": 1.9664, + "Musicality": 1.95, + "Memorability": 1.9236, + "Clarity": 1.8303, + "Naturalness": 1.8775 + }, + "4_50": { + 
"Coherence": 1.8427, + "Musicality": 1.73, + "Memorability": 1.6585, + "Clarity": 1.7139, + "Naturalness": 1.6576 + }, + "0_0": { + "Coherence": 2.0397, + "Musicality": 1.9672, + "Memorability": 1.946, + "Clarity": 1.8068, + "Naturalness": 1.9073 + }, + "4_47": { + "Coherence": 1.8557, + "Musicality": 1.8835, + "Memorability": 1.837, + "Clarity": 1.7625, + "Naturalness": 1.9051 + }, + "2_13": { + "Coherence": 1.6357, + "Musicality": 1.5882, + "Memorability": 1.6177, + "Clarity": 1.4985, + "Naturalness": 1.5918 + }, + "4_36": { + "Coherence": 1.9727, + "Musicality": 1.8362, + "Memorability": 1.9173, + "Clarity": 1.8182, + "Naturalness": 1.9177 + }, + "4_27": { + "Coherence": 1.5064, + "Musicality": 1.5201, + "Memorability": 1.5053, + "Clarity": 1.5035, + "Naturalness": 1.4923 + }, + "4_9": { + "Coherence": 2.1439, + "Musicality": 2.1297, + "Memorability": 2.0104, + "Clarity": 2.0009, + "Naturalness": 2.04 + }, + "1_89": { + "Coherence": 2.0242, + "Musicality": 2.0556, + "Memorability": 1.9212, + "Clarity": 1.8167, + "Naturalness": 2.0016 + }, + "1_73": { + "Coherence": 2.1793, + "Musicality": 2.2498, + "Memorability": 2.1285, + "Clarity": 2.1383, + "Naturalness": 2.1396 + }, + "2_86": { + "Coherence": 2.3578, + "Musicality": 2.2023, + "Memorability": 2.1193, + "Clarity": 2.2041, + "Naturalness": 2.1863 + }, + "3_39": { + "Coherence": 2.4611, + "Musicality": 2.3943, + "Memorability": 2.3378, + "Clarity": 2.3163, + "Naturalness": 2.3382 + }, + "0_29": { + "Coherence": 2.346, + "Musicality": 2.1596, + "Memorability": 2.1985, + "Clarity": 2.295, + "Naturalness": 2.1203 + }, + "1_18": { + "Coherence": 1.8233, + "Musicality": 1.7384, + "Memorability": 1.6232, + "Clarity": 1.6176, + "Naturalness": 1.7529 + }, + "3_7": { + "Coherence": 2.4063, + "Musicality": 2.4915, + "Memorability": 2.2263, + "Clarity": 2.4099, + "Naturalness": 2.3619 + }, + "1_17": { + "Coherence": 2.4014, + "Musicality": 2.2059, + "Memorability": 2.187, + "Clarity": 2.1376, + "Naturalness": 2.1224 + }, + "4_41": { + "Coherence": 1.914, + "Musicality": 1.9452, + "Memorability": 1.8625, + "Clarity": 1.9696, + "Naturalness": 1.9795 + }, + "1_61": { + "Coherence": 2.006, + "Musicality": 1.9873, + "Memorability": 1.9519, + "Clarity": 1.8659, + "Naturalness": 1.9498 + }, + "3_13": { + "Coherence": 2.3618, + "Musicality": 2.3475, + "Memorability": 2.2027, + "Clarity": 2.2215, + "Naturalness": 2.3608 + }, + "3_9": { + "Coherence": 2.385, + "Musicality": 2.3741, + "Memorability": 2.3, + "Clarity": 2.3678, + "Naturalness": 2.4543 + }, + "2_30": { + "Coherence": 2.0684, + "Musicality": 1.8582, + "Memorability": 2.06, + "Clarity": 1.9585, + "Naturalness": 2.0117 + }, + "2_53": { + "Coherence": 1.9078, + "Musicality": 1.8967, + "Memorability": 1.7585, + "Clarity": 2.0203, + "Naturalness": 1.8391 + }, + "2_41": { + "Coherence": 1.8686, + "Musicality": 1.9613, + "Memorability": 1.804, + "Clarity": 1.8032, + "Naturalness": 1.861 + }, + "1_81": { + "Coherence": 2.2679, + "Musicality": 2.1949, + "Memorability": 2.1999, + "Clarity": 2.0734, + "Naturalness": 2.2605 + }, + "3_98": { + "Coherence": 2.3907, + "Musicality": 2.4633, + "Memorability": 2.1563, + "Clarity": 2.161, + "Naturalness": 2.4192 + }, + "2_89": { + "Coherence": 2.0126, + "Musicality": 2.0098, + "Memorability": 1.9861, + "Clarity": 1.9218, + "Naturalness": 1.9113 + }, + "4_28": { + "Coherence": 2.0495, + "Musicality": 1.8543, + "Memorability": 1.8595, + "Clarity": 1.8491, + "Naturalness": 1.9354 + }, + "2_12": { + "Coherence": 2.2923, + "Musicality": 2.3921, + "Memorability": 
2.2181, + "Clarity": 2.1807, + "Naturalness": 2.1864 + }, + "1_3": { + "Coherence": 1.6278, + "Musicality": 1.7068, + "Memorability": 1.6259, + "Clarity": 1.7255, + "Naturalness": 1.7471 + }, + "3_78": { + "Coherence": 2.9628, + "Musicality": 2.8431, + "Memorability": 2.654, + "Clarity": 2.6517, + "Naturalness": 2.7332 + }, + "3_87": { + "Coherence": 1.8677, + "Musicality": 1.847, + "Memorability": 1.966, + "Clarity": 1.7626, + "Naturalness": 1.8631 + }, + "0_55": { + "Coherence": 1.9595, + "Musicality": 1.8839, + "Memorability": 1.8605, + "Clarity": 1.8872, + "Naturalness": 1.8648 + }, + "1_79": { + "Coherence": 2.3852, + "Musicality": 2.3242, + "Memorability": 2.2843, + "Clarity": 2.1605, + "Naturalness": 2.2595 + }, + "3_19": { + "Coherence": 2.2715, + "Musicality": 2.2328, + "Memorability": 2.2698, + "Clarity": 2.2219, + "Naturalness": 2.3195 + }, + "2_8": { + "Coherence": 2.4697, + "Musicality": 2.2947, + "Memorability": 2.56, + "Clarity": 2.3589, + "Naturalness": 2.2362 + }, + "1_44": { + "Coherence": 2.3735, + "Musicality": 2.4384, + "Memorability": 2.2973, + "Clarity": 2.2148, + "Naturalness": 2.2393 + }, + "2_1": { + "Coherence": 2.4573, + "Musicality": 2.4089, + "Memorability": 2.4131, + "Clarity": 2.2625, + "Naturalness": 2.4246 + }, + "0_6": { + "Coherence": 2.3034, + "Musicality": 2.073, + "Memorability": 2.0064, + "Clarity": 1.9674, + "Naturalness": 2.126 + }, + "4_24": { + "Coherence": 2.065, + "Musicality": 2.1859, + "Memorability": 2.0134, + "Clarity": 1.9378, + "Naturalness": 2.0556 + }, + "2_61": { + "Coherence": 2.232, + "Musicality": 1.9937, + "Memorability": 2.0872, + "Clarity": 1.9703, + "Naturalness": 1.9535 + }, + "1_25": { + "Coherence": 2.2357, + "Musicality": 2.2056, + "Memorability": 2.2286, + "Clarity": 1.9696, + "Naturalness": 2.0042 + }, + "1_60": { + "Coherence": 1.9664, + "Musicality": 2.0033, + "Memorability": 1.9511, + "Clarity": 1.875, + "Naturalness": 1.9356 + }, + "1_36": { + "Coherence": 2.2049, + "Musicality": 2.385, + "Memorability": 2.263, + "Clarity": 2.164, + "Naturalness": 2.0728 + }, + "1_63": { + "Coherence": 1.8495, + "Musicality": 1.8794, + "Memorability": 1.9, + "Clarity": 1.8604, + "Naturalness": 1.8717 + }, + "2_21": { + "Coherence": 2.3975, + "Musicality": 2.256, + "Memorability": 2.204, + "Clarity": 2.1861, + "Naturalness": 2.32 + }, + "4_85": { + "Coherence": 2.5726, + "Musicality": 2.2728, + "Memorability": 2.3922, + "Clarity": 2.3106, + "Naturalness": 2.3212 + }, + "2_59": { + "Coherence": 2.2687, + "Musicality": 2.3734, + "Memorability": 2.1968, + "Clarity": 2.0332, + "Naturalness": 2.2547 + }, + "1_94": { + "Coherence": 2.1387, + "Musicality": 2.1505, + "Memorability": 2.3135, + "Clarity": 2.0918, + "Naturalness": 2.2138 + }, + "0_45": { + "Coherence": 2.349, + "Musicality": 2.1916, + "Memorability": 2.22, + "Clarity": 2.1934, + "Naturalness": 2.143 + }, + "3_85": { + "Coherence": 2.3207, + "Musicality": 2.3741, + "Memorability": 2.2619, + "Clarity": 2.2145, + "Naturalness": 2.1553 + }, + "1_40": { + "Coherence": 2.2562, + "Musicality": 2.3165, + "Memorability": 2.4133, + "Clarity": 2.2157, + "Naturalness": 2.2307 + }, + "1_47": { + "Coherence": 1.7885, + "Musicality": 1.7783, + "Memorability": 1.8075, + "Clarity": 1.6714, + "Naturalness": 1.8082 + }, + "average": { + "Coherence": 2.1767, + "Musicality": 2.1267, + "Memorability": 2.0738, + "Clarity": 2.0536, + "Naturalness": 2.0858 + } +} \ No newline at end of file diff --git a/assets/amadeus-framwork.drawio.png b/assets/amadeus-framwork.drawio.png new file mode 100644 index 
0000000..df6213c Binary files /dev/null and b/assets/amadeus-framwork.drawio.png differ diff --git a/assets/exp_amadeus.mp3 b/assets/exp_amadeus.mp3 new file mode 100644 index 0000000..a0fd3d9 Binary files /dev/null and b/assets/exp_amadeus.mp3 differ diff --git a/assets/inference.drawio.png b/assets/inference.drawio.png new file mode 100644 index 0000000..9ee7d16 Binary files /dev/null and b/assets/inference.drawio.png differ diff --git a/assets/merged_compare_clean.png b/assets/merged_compare_clean.png new file mode 100644 index 0000000..5501196 Binary files /dev/null and b/assets/merged_compare_clean.png differ diff --git a/data_representation/README.md b/data_representation/README.md new file mode 100644 index 0000000..ccfae12 --- /dev/null +++ b/data_representation/README.md @@ -0,0 +1,81 @@ +# Dataset Download + +Our model supports four different datasets: + +- **Symbolic Orchestral Database (SOD)**: [Link](https://qsdfo.github.io/LOP/database.html) +- **Lakh MIDI Dataset (Clean version)**: [Link](https://colinraffel.com/projects/lmd/) +- **Pop1k7**: [Link](https://github.com/YatingMusic/compound-word-transformer) +- **Pop909**: [Link](https://github.com/music-x-lab/POP909-Dataset) + +### Download Instructions + +You can download the datasets via the command line: + +```sh +# SOD +wget https://qsdfo.github.io/LOP/database/SOD.zip + +# LakhClean +wget http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz +``` + +For Pop1k7, the official repository link is currently unavailable. However, you can download it from this Google Drive link: +[Download Pop1k7](https://drive.google.com/file/d/1GnbELjE-kQ4WOkBmZ3XapFKIaltySRyV/view?usp=drive_link) + +For Pop909, the dataset is available in the official GitHub repository: [Repository link](https://github.com/music-x-lab/POP909-Dataset) + +### Using Your Own Dataset +If you plan to use your own dataset, you can modify the dataset class in the data_utils.py script under the symbolic_encoding folder inside the nested_music_transformer folder. Alternatively, for a simpler approach, rename your dataset to match one of the following options: + +- SOD: Use this for score-based MIDI datasets that require finer-grained quantization (supports up to 16th note triplet level quantization; 24 samples per quarter note). +- LakhClean: Suitable for score-based MIDI datasets requiring coarse-grained quantization (supports up to 16th note level quantization; 4 samples per quarter note). +- Pop1k7, Pop909: Ideal for expressive MIDI datasets requiring coarse-grained quantization (supports up to 16th note level quantization; 4 samples per quarter note). + +# Data Representation + 
+ +
+ + +This document outlines our standard data processing pipeline. By following the instructions and running the corresponding Python scripts, you can generate a data representation suited to your specific needs. + +We focus on symbolic music and limit the use of musical features to a select few. Each feature set size corresponds to specific musical attributes. Through various experiments, we decided to use **7 features** for the *Pop1k7* and *Pop909* datasets, which consist of pop piano music requiring velocity for expression, and **5 features** for the *Symbolic Orchestral Database (SOD)*, *Lakh MIDI*, and *SymphonyMIDI* datasets. + +- **4 features**: `["type", "beat", "pitch", "duration"]` +- **5 features**: `["type", "beat", "instrument", "pitch", "duration"]` +- **7 features**: `["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"]` +- **8 features**: `["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"]` + +## Parse Argument +- `-d`, `--dataset`: This required argument specifies the dataset to be used. It takes one of the following values: `"BachChorale"`, `"Pop1k7"`, `"Pop909"`, `"SOD"`, `"LakhClean"`, or `"SymphonyMIDI"`. + +- `-e`, `--encoding`: This required argument specifies the encoding scheme to use. It accepts one of the following: `"remi"`, `"cp"`, `"nb"`, or `"remi_pos"`. + +- `-f`, `--num_features`: This required argument specifies the number of features. It can take one of the following values: `4`, `5`, `7`, or `8`. + +- `-i`, `--in_dir`: This optional argument specifies the input data directory. It defaults to `../dataset/represented_data/corpus/` if not provided. + +- `-o`, `--out_dir`: This optional argument specifies the output data directory. It defaults to `../dataset/represented_data/events/`. + +- `--debug`: This flag enables debug mode when included. No additional value is needed. + +## 1. MIDI to Corpus +In this step, we convert MIDI files into a set of events containing various musical information. The MIDI files should be aligned with the beat and contain accurate time signature information. Place the MIDI files in `` and refer to the example files provided. Navigate to the `` folder and run the script. The converted data will be stored in ``. + +- Example usage: `python3 step1_midi2corpus.py --dataset SOD --num_features 5` + +## 2. Corpus to Event +We provide three types of representations: **REMI**, **Compound Word (CP)**, and **Note-based Encoding (NB)**. The converted data will be stored in ``. + +- Example usage: `python3 step2_corpus2event.py --dataset SOD --num_features 5 --encoding nb` + +## 3. Creating Vocabulary +This script creates a vocabulary in the `` folder. The vocabulary includes event-to-index pair information. + +- Example usage: `python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb` + +## 4. Event to Index +In this step, we convert events into indices for efficient model training. The converted data will be stored in ``. 
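The conversion itself is essentially a vocabulary lookup over the event stream produced in step 2, using the event-to-index pairs built in step 3. Below is a minimal, illustrative sketch of that idea; the `vocab` dictionary, event names, and values here are made up for demonstration and are not the project's actual classes or files.

```python
# Toy illustration of event-to-index conversion (names and values are hypothetical).
events = [
    {"name": "type", "value": "note"},
    {"name": "beat", "value": 0},
    {"name": "pitch", "value": 60},
    {"name": "duration", "value": 4},
]

# Step 3 produces event-to-index pairs; here we fake a tiny vocabulary.
vocab = {"type_note": 0, "beat_0": 1, "pitch_60": 2, "duration_4": 3}

# Converting a tune is then a straightforward lookup over its event sequence.
indices = [vocab[f"{e['name']}_{e['value']}"] for e in events]
print(indices)  # [0, 1, 2, 3]
```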
+ +- Example usage: `python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb` diff --git a/data_representation/__init__.py b/data_representation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_representation/__pycache__/__init__.cpython-310.pyc b/data_representation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..417fb88 Binary files /dev/null and b/data_representation/__pycache__/__init__.cpython-310.pyc differ diff --git a/data_representation/__pycache__/constants.cpython-310.pyc b/data_representation/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000..a939e7a Binary files /dev/null and b/data_representation/__pycache__/constants.cpython-310.pyc differ diff --git a/data_representation/__pycache__/vocab_utils.cpython-310.pyc b/data_representation/__pycache__/vocab_utils.cpython-310.pyc new file mode 100644 index 0000000..fdcdf55 Binary files /dev/null and b/data_representation/__pycache__/vocab_utils.cpython-310.pyc differ diff --git a/data_representation/constants.py b/data_representation/constants.py new file mode 100644 index 0000000..dea8b21 --- /dev/null +++ b/data_representation/constants.py @@ -0,0 +1,422 @@ +import numpy as np + +# for chord analysis +NUM2PITCH = { + 0: 'C', + 1: 'C#', + 2: 'D', + 3: 'D#', + 4: 'E', + 5: 'F', + 6: 'F#', + 7: 'G', + 8: 'G#', + 9: 'A', + 10: 'A#', + 11: 'B', +} + +# referred to mmt "https://github.com/salu133445/mmt" +PROGRAM_INSTRUMENT_MAP = { + # Pianos + 0: "piano", + 1: "piano", + 2: "piano", + 3: "piano", + 4: "electric-piano", + 5: "electric-piano", + 6: "harpsichord", + 7: "clavinet", + # Chromatic Percussion + 8: "celesta", + 9: "glockenspiel", + 10: "music-box", + 11: "vibraphone", + 12: "marimba", + 13: "xylophone", + 14: "tubular-bells", + 15: "dulcimer", + # Organs + 16: "organ", + 17: "organ", + 18: "organ", + 19: "church-organ", + 20: "organ", + 21: "accordion", + 22: "harmonica", + 23: "bandoneon", + # Guitars + 24: "nylon-string-guitar", + 25: "steel-string-guitar", + 26: "electric-guitar", + 27: "electric-guitar", + 28: "electric-guitar", + 29: "electric-guitar", + 30: "electric-guitar", + 31: "electric-guitar", + # Basses + 32: "bass", + 33: "electric-bass", + 34: "electric-bass", + 35: "electric-bass", + 36: "slap-bass", + 37: "slap-bass", + 38: "synth-bass", + 39: "synth-bass", + # Strings + 40: "violin", + 41: "viola", + 42: "cello", + 43: "contrabass", + 44: "strings", + 45: "strings", + 46: "harp", + 47: "timpani", + # Ensemble + 48: "strings", + 49: "strings", + 50: "synth-strings", + 51: "synth-strings", + 52: "voices", + 53: "voices", + 54: "voices", + 55: "orchestra-hit", + # Brass + 56: "trumpet", + 57: "trombone", + 58: "tuba", + 59: "trumpet", + 60: "horn", + 61: "brasses", + 62: "synth-brasses", + 63: "synth-brasses", + # Reed + 64: "soprano-saxophone", + 65: "alto-saxophone", + 66: "tenor-saxophone", + 67: "baritone-saxophone", + 68: "oboe", + 69: "english-horn", + 70: "bassoon", + 71: "clarinet", + # Pipe + 72: "piccolo", + 73: "flute", + 74: "recorder", + 75: "pan-flute", + 76: None, + 77: None, + 78: None, + 79: "ocarina", + # Synth Lead + 80: "lead", + 81: "lead", + 82: "lead", + 83: "lead", + 84: "lead", + 85: "lead", + 86: "lead", + 87: "lead", + # Synth Pad + 88: "pad", + 89: "pad", + 90: "pad", + 91: "pad", + 92: "pad", + 93: "pad", + 94: "pad", + 95: "pad", + # Synth Effects + 96: None, + 97: None, + 98: None, + 99: None, + 100: None, + 101: None, + 102: None, + 103: None, + # Ethnic + 104: "sitar", + 105: "banjo", + 
106: "shamisen", + 107: "koto", + 108: "kalimba", + 109: "bag-pipe", + 110: "violin", + 111: "shehnai", + # Percussive + 112: None, + 113: None, + 114: "steel-drums", + 115: None, + 116: None, + 117: "melodic-tom", + 118: "synth-drums", + 119: "synth-drums", + # Sound effects + 120: None, + 121: None, + 122: None, + 123: None, + 124: None, + 125: None, + 126: None, + 127: None, +} + +# referred to mmt "https://github.com/salu133445/mmt" +INSTRUMENT_PROGRAM_MAP = { + # Pianos + "piano": 0, + "electric-piano": 4, + "harpsichord": 6, + "clavinet": 7, + # Chromatic Percussion + "celesta": 8, + "glockenspiel": 9, + "music-box": 10, + "vibraphone": 11, + "marimba": 12, + "xylophone": 13, + "tubular-bells": 14, + "dulcimer": 15, + # Organs + "organ": 16, + "church-organ": 19, + "accordion": 21, + "harmonica": 22, + "bandoneon": 23, + # Guitars + "nylon-string-guitar": 24, + "steel-string-guitar": 25, + "electric-guitar": 26, + # Basses + "bass": 32, + "electric-bass": 33, + "slap-bass": 36, + "synth-bass": 38, + # Strings + "violin": 40, + "viola": 41, + "cello": 42, + "contrabass": 43, + "harp": 46, + "timpani": 47, + # Ensemble + "strings": 49, + "synth-strings": 50, + "voices": 52, + "orchestra-hit": 55, + # Brass + "trumpet": 56, + "trombone": 57, + "tuba": 58, + "horn": 60, + "brasses": 61, + "synth-brasses": 62, + # Reed + "soprano-saxophone": 64, + "alto-saxophone": 65, + "tenor-saxophone": 66, + "baritone-saxophone": 67, + "oboe": 68, + "english-horn": 69, + "bassoon": 70, + "clarinet": 71, + # Pipe + "piccolo": 72, + "flute": 73, + "recorder": 74, + "pan-flute": 75, + "ocarina": 79, + # Synth Lead + "lead": 80, + # Synth Pad + "pad": 88, + # Ethnic + "sitar": 104, + "banjo": 105, + "shamisen": 106, + "koto": 107, + "kalimba": 108, + "bag-pipe": 109, + "shehnai": 111, + # Percussive + "steel-drums": 114, + "melodic-tom": 117, + "synth-drums": 118, +} + +FINED_PROGRAM_INSTRUMENT_MAP ={ + # Pianos + 0: "Acoustic-Grand-Piano", + 1: "Bright-Acoustic-Piano", + 2: "Electric-Grand-Piano", + 3: "Honky-Tonk-Piano", + 4: "Electric-Piano-1", + 5: "Electric-Piano-2", + 6: "Harpsichord", + 7: "Clavinet", + + # Chromatic Percussion + 8: "Celesta", + 9: "Glockenspiel", + 10: "Music-Box", + 11: "Vibraphone", + 12: "Marimba", + 13: "Xylophone", + 14: "Tubular-Bells", + 15: "Dulcimer", + + # Organs + 16: "Drawbar-Organ", + 17: "Percussive-Organ", + 18: "Rock-Organ", + 19: "Church-Organ", + 20: "Reed-Organ", + 21: "Accordion", + 22: "Harmonica", + 23: "Tango-Accordion", + + # Guitars + 24: "Acoustic-Guitar-nylon", + 25: "Acoustic-Guitar-steel", + 26: "Electric-Guitar-jazz", + 27: "Electric-Guitar-clean", + 28: "Electric-Guitar-muted", + 29: "Overdriven-Guitar", + 30: "Distortion-Guitar", + 31: "Guitar-harmonics", + + # Basses + 32: "Acoustic-Bass", + 33: "Electric-Bass-finger", + 34: "Electric-Bass-pick", + 35: "Fretless-Bass", + 36: "Slap-Bass-1", + 37: "Slap-Bass-2", + 38: "Synth-Bass-1", + 39: "Synth-Bass-2", + + # Strings & Orchestral + 40: "Violin", + 41: "Viola", + 42: "Cello", + 43: "Contrabass", + 44: "Tremolo-Strings", + 45: "Pizzicato-Strings", + 46: "Orchestral-Harp", + 47: "Timpani", + + # Ensemble + 48: "String-Ensemble-1", + 49: "String-Ensemble-2", + 50: "Synth-Strings-1", + 51: "Synth-Strings-2", + 52: "Choir-Aahs", + 53: "Voice-Oohs", + 54: "Synth-Voice", + 55: "Orchestra-Hit", + + # Brass + 56: "Trumpet", + 57: "Trombone", + 58: "Tuba", + 59: "Muted-Trumpet", + 60: "French-Horn", + 61: "Brass-Section", + 62: "Synth-Brass-1", + 63: "Synth-Brass-2", + + # Reeds + 64: "Soprano-Sax", + 65: 
"Alto-Sax", + 66: "Tenor-Sax", + 67: "Baritone-Sax", + 68: "Oboe", + 69: "English-Horn", + 70: "Bassoon", + 71: "Clarinet", + + # Pipes + 72: "Piccolo", + 73: "Flute", + 74: "Recorder", + 75: "Pan-Flute", + 76: "Blown-Bottle", + 77: "Shakuhachi", + 78: "Whistle", + 79: "Ocarina", + + # Synth Lead + 80: "Lead-1-square", + 81: "Lead-2-sawtooth", + 82: "Lead-3-calliope", + 83: "Lead-4-chiff", + 84: "Lead-5-charang", + 85: "Lead-6-voice", + 86: "Lead-7-fifths", + 87: "Lead-8-bass+lead", + + # Synth Pad + 88: "Pad-1-new-age", + 89: "Pad-2-warm", + 90: "Pad-3-polysynth", + 91: "Pad-4-choir", + 92: "Pad-5-bowed", + 93: "Pad-6-metallic", + 94: "Pad-7-halo", + 95: "Pad-8-sweep", + + # Effects + 96: "FX-1-rain", + 97: "FX-2-soundtrack", + 98: "FX-3-crystal", + 99: "FX-4-atmosphere", + 100: "FX-5-brightness", + 101: "FX-6-goblins", + 102: "FX-7-echoes", + 103: "FX-8-sci-fi", + + # Ethnic & Percussion + 104: "Sitar", + 105: "Banjo", + 106: "Shamisen", + 107: "Koto", + 108: "Kalimba", + 109: "Bag-pipe", + 110: "Fiddle", + 111: "Shanai", + + # Percussive + 112: "Tinkle-Bell", + 113: "Agogo", + 114: "Steel-Drums", + 115: "Woodblock", + 116: "Taiko-Drum", + 117: "Melodic-Tom", + 118: "Synth-Drum", + 119: "Reverse-Cymbal", + + # Sound Effects + 120: "Guitar-Fret-Noise", + 121: "Breath-Noise", + 122: "Seashore", + 123: "Bird-Tweet", + 124: "Telephone-Ring", + 125: "Helicopter", + 126: "Applause", + 127: "Gunshot" +} + + +REGULAR_NUM_DENOM = [(1, 1), (1, 2), (2, 2), (3, 2), (4, 2), + (1, 4), (2, 4), (3, 4), (4, 4), (5, 4), (6, 4), (7, 4), (8, 4), + (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (11, 8), (12, 8)] +CORE_NUM_DENOM = [(1, 1), (1, 2), (2, 2), (4, 2), + (1, 4), (2, 4), (3, 4), (4, 4), (5, 4), + (1, 8), (2, 8), (3, 8), (6, 8), (9, 8), (12, 8)] +VALID_TIME_SIGNATURES = ['time_signature_' + str(x[0]) + '/' + str(x[1]) for x in REGULAR_NUM_DENOM] + +# cover possible time signatures +REGULAR_TICKS_PER_BEAT = [48, 96, 192, 384, 120, 240, 480, 960, 256, 512, 1024] diff --git a/data_representation/encoding_utils.py b/data_representation/encoding_utils.py new file mode 100644 index 0000000..a4695cf --- /dev/null +++ b/data_representation/encoding_utils.py @@ -0,0 +1,879 @@ +from typing import Any +from fractions import Fraction +from collections import defaultdict + +from miditoolkit import TimeSignature + +from constants import * + +''' +This script contains specific encoding functions for different encoding schemes. +''' + +def frange(start, stop, step): + while start < stop: + yield start + start += step + +################################# for REMI style encoding ################################# + +class Corpus2event_remi(): + def __init__(self, num_features:int): + self.num_features = num_features + + def _create_event(self, name, value): + event = dict() + event['name'] = name + event['value'] = value + return event + def _break_down_numerator(self, numerator, possible_time_signatures): + """Break down a numerator into smaller time signatures. + + Args: + numerator: Target numerator to decompose (must be > 0). + possible_time_signatures: List of (numerator, denominator) tuples, + sorted in descending order (e.g., [(4,4), (3,4)]). + + Returns: + List of decomposed time signatures (e.g., [(4,4), (3,4)]). + + Raises: + ValueError: If decomposition is impossible. 
+ """ + if numerator <= 0: + raise ValueError("Numerator must be positive.") + if not possible_time_signatures: + raise ValueError("No possible time signatures provided.") + + result = [] + original_numerator = numerator # For error message + + # Sort signatures in descending order to prioritize larger chunks + possible_time_signatures = sorted(possible_time_signatures, key=lambda x: -x[0]) + + while numerator > 0: + subtracted = False # Track if any subtraction occurred in this iteration + + for sig in possible_time_signatures: + sig_numerator, _ = sig + if sig_numerator <= 0: + continue # Skip invalid signatures + + while numerator >= sig_numerator: + result.append(sig) + numerator -= sig_numerator + subtracted = True + + # If no progress was made, decomposition failed + if not subtracted: + raise ValueError( + f"Cannot decompose numerator {original_numerator} " + f"with given time signatures {possible_time_signatures}. " + f"Remaining: {numerator}" + ) + + return result + def _normalize_time_signature(self, time_signature, ticks_per_beat, next_change_point): + """ + Normalize irregular time signatures to standard ones by breaking them down + into common time signatures, and adjusting their durations to fit the given + musical structure. + + Parameters: + - time_signature: TimeSignature object with numerator, denominator, and start time. + - ticks_per_beat: Number of ticks per beat, representing the resolution of the timing. + - next_change_point: Tick position where the next time signature change occurs. + + Returns: + - A list of TimeSignature objects, normalized to fit within regular time signatures. + + Procedure: + 1. If the time signature is already a standard one (in REGULAR_NUM_DENOM), return it. + 2. For non-standard signatures, break them down into simpler, well-known signatures. + - For unusual denominations (e.g., 16th, 32nd, or 64th notes), normalize to 4/4. + - For 6/4 signatures, break it into two 3/4 measures. + 3. If the time signature has a non-standard numerator and denominator, break it down + into the largest possible numerators that still fit within the denominator. + This ensures that the final measure fits within the regular time signature format. + 4. Calculate the resolution (duration in ticks) for each bar and ensure the bars + fit within the time until the next change point. + - Adjust the number of bars if they exceed the available space. + - If the total length is too short, repeat the first (largest) bar to fill the gap. + 5. Convert the breakdown into TimeSignature objects and return the normalized result. 
+ """ + + # Check if the time signature is a regular one, return it if so + if (time_signature.numerator, time_signature.denominator) in REGULAR_NUM_DENOM: + return [time_signature] + + # Extract time signature components + numerator, denominator, bar_start_tick = time_signature.numerator, time_signature.denominator, time_signature.time + + # Normalize time signatures with 16th, 32nd, or 64th note denominators to 4/4 + if denominator in [16, 32, 64]: + return [TimeSignature(4, 4, time_signature.time)] + + # Special case for 6/4, break it into two 3/4 bars + elif denominator == 6 and numerator == 4: + return [TimeSignature(3, 4, time_signature.time), TimeSignature(3, 4, time_signature.time)] + + # Determine possible regular signatures for the given denominator + possible_time_signatures = [sig for sig in CORE_NUM_DENOM if sig[1] == denominator] + + # Sort by numerator in descending order to prioritize larger numerators + possible_time_signatures.sort(key=lambda x: x[0], reverse=True) + + result = [] + + # Break down the numerator into smaller regular numerators + max_iterations = 100 # Prevent infinite loops + original_numerator = numerator # Store original for error message + + # Break down the numerator into smaller regular numerators + iteration_count = 0 + while numerator > 0: + iteration_count += 1 + if iteration_count > max_iterations: + raise ValueError( + f"Failed to normalize time signature {original_numerator}/{denominator}. " + f"Could not break down numerator {original_numerator} with available signatures: " + f"{possible_time_signatures}" + ) + + for sig in possible_time_signatures: + # Subtract numerators and add to the result + while numerator >= sig[0]: + result.append(sig) + numerator -= sig[0] + + + + # Calculate the resolution (length in ticks) of each bar + bar_resol_list = [int(ticks_per_beat * numerator * (4 / denominator)) for numerator, denominator in result] + + # Adjust bars to fit within the remaining ticks before the next change point + total_length = 0 + for idx, bar_resol in enumerate(bar_resol_list): + total_length += bar_resol + if total_length > next_change_point - bar_start_tick: + result = result[:idx+1] + break + + # If the total length is too short, repeat the first (largest) bar until the gap is filled + while total_length < next_change_point - bar_start_tick: + result.append(result[0]) + total_length += int(ticks_per_beat * result[0][0] * (4 / result[0][1])) + + # Recalculate bar resolutions for the final result + bar_resol_list = [int(ticks_per_beat * numerator * (4 / denominator)) for numerator, denominator in result] + + # Insert a starting resolution of 0 and calculate absolute tick positions for each TimeSignature + bar_resol_list.insert(0, 0) + total_length = bar_start_tick + normalized_result = [] + for sig, length in zip(result, bar_resol_list): + total_length += length + normalized_result.append(TimeSignature(sig[0], sig[1], total_length)) + + return normalized_result + + def _process_time_signature(self, time_signature_changes, ticks_per_beat, first_note_tick, global_end): + """ + Process and normalize time signature changes for a given musical piece. + + Parameters: + - time_signature_changes: A list of TimeSignature objects representing time signature changes in the music. + - ticks_per_beat: The resolution of timing in ticks per beat. + - first_note_tick: The tick position of the first note in the piece. + - global_end: The tick position where the piece ends. + + Returns: + - A list of processed and normalized time signature changes. 
If no valid time signature + changes are found, returns None. + + Procedure: + 1. Check the validity of the time signature changes: + - Ensure there is at least one time signature change. + - Ensure the first time signature change occurs at the beginning (before the first note). + 2. Remove duplicate consecutive time signatures: + - Only add time signatures that differ from the previous one (de-duplication). + 3. Normalize the time signatures: + - For each time signature, determine its duration by calculating the time until the + next change point or the end of the piece. + - Use the _normalize_time_signature method to break down non-standard signatures into + simpler, well-known signatures that fit within the musical structure. + 4. Return the processed and normalized time signature changes. + + """ + + # Check if there are any time signature changes + if len(time_signature_changes) == 0: + print("No time signature change in this tune, default to 4/4 time signature") + # default to 4/4 time signature if none are found + return [TimeSignature(4, 4, 0)] + + # Ensure the first time signature change is at the start of the piece (before the first note) + if time_signature_changes[0].time != 0 and time_signature_changes[0].time > first_note_tick: + print("The first time signature change is not at the beginning of the tune") + return None + + # Remove consecutive duplicate time signatures (de-duplication) + processed_time_signature_changes = [] + for idx, time_sig in enumerate(time_signature_changes): + if idx == 0: + processed_time_signature_changes.append(time_sig) + else: + prev_time_sig = time_signature_changes[idx-1] + # Only add time signature if it's different from the previous one + if not (prev_time_sig.numerator == time_sig.numerator and prev_time_sig.denominator == time_sig.denominator): + processed_time_signature_changes.append(time_sig) + + # Normalize the time signatures to standard formats + normalized_time_signature_changes = [] + for idx, time_signature in enumerate(processed_time_signature_changes): + if idx == len(time_signature_changes) - 1: + # If it's the last time signature change, set the next change point as the end of the piece + next_change_point = global_end + else: + # Otherwise, set the next change point as the next time signature's start time + next_change_point = time_signature_changes[idx+1].time + + # Normalize the current time signature and extend the result + normalized_time_signature_changes.extend(self._normalize_time_signature(time_signature, ticks_per_beat, next_change_point)) + + # Return the list of processed and normalized time signatures + time_signature_changes = normalized_time_signature_changes + return time_signature_changes + + def _half_step_interval_gap_check_across_instruments(self, instrument_note_dict): + ''' + This function checks for half-step interval gaps between notes across different instruments. + It will avoid half-step intervals by keeping one note from any pair of notes that are a half-step apart, + regardless of which instrument they belong to. 
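+
+        Illustrative example (not part of the original docstring): given
+        instrument_note_dict = {0: {60: [(4, 80)]}, 40: {61: [(8, 80)]}}
+        (program 0 holds pitch 60 for 4 steps, program 40 holds pitch 61 for 8 steps),
+        pitches 61 and 60 form a half-step pair; the longer pitch 61 is kept, so the
+        pruned result is {0: {}, 40: {61: [(8, 80)]}}.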
+ ''' + # order instrument_note_dict by pitch in descending order + instrument_note_dict = dict(sorted(instrument_note_dict.items())) + + # Create a dictionary to store all pitches across instruments + all_pitches = {} + + # Collect all pitches from each instrument and sort them in descending order + for instrument, notes in instrument_note_dict.items(): + for pitch, durations in notes.items(): + all_pitches[pitch] = all_pitches.get(pitch, []) + [(instrument, durations)] + + # Sort the pitches in descending order + sorted_pitches = sorted(all_pitches.keys(), reverse=True) + + # Create a new list to store the final pitches after comparison + final_pitch_list = [] + + # Use an index pointer to control the sliding window + idx = 0 + while idx < len(sorted_pitches) - 1: + current_pitch = sorted_pitches[idx] + next_pitch = sorted_pitches[idx + 1] + + if current_pitch - next_pitch == 1: # Check for a half-step interval gap + current_max_duration = max(duration for _, durations in all_pitches[current_pitch] for duration, _ in durations) + next_max_duration = max(duration for _, durations in all_pitches[next_pitch] for duration, _ in durations) + + if current_max_duration < next_max_duration: + # Keep the higher pitch (next_pitch) and skip the current_pitch + final_pitch_list.append(next_pitch) + else: + # Keep the lower pitch (current_pitch) and skip the next_pitch + final_pitch_list.append(current_pitch) + + # Skip the next pitch because we already handled it + idx += 2 + else: + # No half-step gap, keep the current pitch and move to the next one + final_pitch_list.append(current_pitch) + idx += 1 + + # Ensure the last pitch is added if it's not part of a half-step interval + if idx == len(sorted_pitches) - 1: + final_pitch_list.append(sorted_pitches[-1]) + + # Filter out notes not in the final pitch list and update the instrument_note_dict + for instrument in instrument_note_dict.keys(): + instrument_note_dict[instrument] = { + pitch: instrument_note_dict[instrument][pitch] + for pitch in sorted(instrument_note_dict[instrument].keys(), reverse=True) if pitch in final_pitch_list + } + + return instrument_note_dict + + def __call__(self, song_data, in_beat_resolution): + ''' + Process a song's data to generate a sequence of musical events, including bars, chords, tempo, + and notes, similar to the approach used in the CP paper (corpus2event_remi_v2). + + Parameters: + - song_data: A dictionary containing metadata, notes, chords, and tempos of the song. + - in_beat_resolution: The resolution of timing in beats (how many divisions per beat). + + Returns: + - A sequence of musical events including start (SOS), bars, chords, tempo, instruments, notes, + and an end (EOS) event. If the time signature is invalid, returns None. + + Procedure: + 1. **Global Setup**: + - Extract global metadata like first and last note ticks, time signature changes, and ticks + per beat. + - Compute `in_beat_tick_resol`, the ratio of ticks per beat to the input beat resolution, + to assist in dividing bars later. + - Get a sorted list of instruments in the song. + + 2. **Time Signature Processing**: + - Call `_process_time_signature` to clean up and normalize the time signatures in the song. + - If the time signatures are invalid (e.g., no time signature changes or missing at the + start), the function exits early with None. + + 3. **Sequence Generation**: + - Initialize the sequence with a start token (SOS) and prepare variables for tracking + previous chord, tempo, and instrument states. 
+ - Loop through each time signature change, dividing the song into measures based on the + current time signature's numerator and denominator. + - For each measure, append "Bar" tokens to mark measure boundaries, while ensuring that no + more than four consecutive empty bars are added. + - For each step within a measure, process the following: + - **Chords**: If there is a chord change, add a corresponding chord event. + - **Tempo**: If the tempo changes, add a tempo event. + - **Notes**: Iterate over each instrument, adding notes and checking for half-step + intervals, deduplicating notes, and choosing the longest duration for each pitch. + - Append a "Beat" event for each step with musical events. + + 4. **End Sequence**: + - Conclude the sequence by appending a final "Bar" token followed by an end token (EOS). + ''' + + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # Starting tick of the first note + global_end = song_data['metadata']['last_note'] # Ending tick of the last note + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat resolution + # Resolution for dividing beats within measures, expressed as a fraction + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Example: 1024/12 -> (256, 3) + instrument_list = sorted(list(song_data['notes'].keys())) # Get a sorted list of instruments in the song + + # --- process time signature --- # + # Normalize and process the time signatures in the song + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if time signature is invalid + + # --- create sequence --- # + prev_instr_idx = None # Track the previously processed instrument + final_sequence = [] + final_sequence.append(self._create_event('SOS', None)) # Add Start of Sequence (SOS) token + prev_chord = None # Track the previous chord + prev_tempo = None # Track the previous tempo + chord_value = None + tempo_value = None + + # Process each time signature change + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to indicate a time signature change + # Calculate bar resolution based on the current time signature + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format time signature name + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate bar resolution in ticks + bar_start_tick = time_signature_changes[idx].time # Start tick of the current bar + # Determine the next time signature change point or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx+1].time + + # Process each measure within the current time signature + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + empty_bar_token = self._create_event('Bar', None) # Token for empty bars + + # Ensure no more than 4 consecutive empty bars are added + if len(final_sequence) >= 4: + if not (final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and + final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token): + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) 
# Mark new bar with time signature + else: + final_sequence.append(self._create_event('Bar', None)) + else: + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) + else: + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) + else: + final_sequence.append(self._create_event('Bar', None)) + + time_sig_change_flag = False # Reset time signature change flag + + # Process events within each beat + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + events_list = [] + # Retrieve chords and tempos at the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = song_data['tempos'].get(beat_step) + + # Process chord and tempo if the number of features allows for it + if self.num_features in {8, 7}: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') # Extract chord info + chord_value = root + '_' + quality + if t_tempos is not None: + tempo_value = t_tempos[-1].tempo # Extract tempo value + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # Process notes for each instrument at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If there are notes at this beat step, process them. + if t_notes is not None: + # Track notes to avoid duplicates and check for half-step intervals + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if prev_chord != chord_value: + events_list.append(self._create_event('Chord', chord_value)) + prev_chord = chord_value + if prev_tempo != tempo_value: + events_list.append(self._create_event('Tempo', tempo_value)) + prev_tempo = tempo_value + + # add instrument and note + for instrument in pruned_instrument_note_dict: + if self.num_features in {5, 8}: + events_list.append(self._create_event('Instrument', instrument)) + + for pitch in pruned_instrument_note_dict[instrument]: + max_duration = max(pruned_instrument_note_dict[instrument][pitch], key=lambda x: x[0]) + note_event = [ + self._create_event('Note_Pitch', pitch), + self._create_event('Note_Duration', max_duration[0]) + ] + if self.num_features in {7, 8}: + note_event.append(self._create_event('Note_Velocity', max_duration[1])) + events_list.extend(note_event) + + # If there are events in this step, add a "Beat" event and the collected events + if len(events_list): + final_sequence.append(self._create_event('Beat', in_beat_off_idx)) + final_sequence.extend(events_list) + + # --- end with BAR & EOS --- # + final_sequence.append(self._create_event('Bar', None)) # Add final bar token + final_sequence.append(self._create_event('EOS', None)) # Add End of Sequence (EOS) token + return final_sequence + +################################# for CP style encoding ################################# + +class Corpus2event_cp(Corpus2event_remi): + def __init__(self, num_features): + 
super().__init__(num_features) + self.num_features = num_features + self._init_event_template() + + def _init_event_template(self): + ''' + The order of musical features is Type, Beat, Chord, Tempo, Instrument, Pitch, Duration, Velocity + ''' + self.event_template = {} + if self.num_features == 8: + feature_names = ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'] + elif self.num_features == 7: + feature_names = ['type', 'beat', 'chord', 'tempo', 'pitch', 'duration', 'velocity'] + elif self.num_features == 5: + feature_names = ['type', 'beat', 'instrument', 'pitch', 'duration'] + elif self.num_features == 4: + feature_names = ['type', 'beat', 'pitch', 'duration'] + for feature_name in feature_names: + self.event_template[feature_name] = 0 + + def create_cp_sos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'SOS' + return total_event + + def create_cp_eos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'EOS' + return total_event + + def create_cp_metrical_event(self, pos, chord, tempo): + ''' + when the compound token is related to metrical information + ''' + meter_event = self.event_template.copy() + meter_event['type'] = 'Metrical' + meter_event['beat'] = pos + if self.num_features == 7 or self.num_features == 8: + meter_event['chord'] = chord + meter_event['tempo'] = tempo + return meter_event + + def create_cp_note_event(self, instrument_name, pitch, duration, velocity): + ''' + when the compound token is related to note information + ''' + note_event = self.event_template.copy() + note_event['type'] = 'Note' + note_event['pitch'] = pitch + note_event['duration'] = duration + if self.num_features == 5 or self.num_features == 8: + note_event['instrument'] = instrument_name + if self.num_features == 7 or self.num_features == 8: + note_event['velocity'] = velocity + return note_event + + def create_cp_bar_event(self, time_sig_change_flag=False, time_sig_name=None): + meter_event = self.event_template.copy() + if time_sig_change_flag: + meter_event['type'] = 'Metrical' + meter_event['beat'] = f'Bar_{time_sig_name}' + else: + meter_event['type'] = 'Metrical' + meter_event['beat'] = 'Bar' + return meter_event + + def __call__(self, song_data, in_beat_resolution): + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # First note timestamp in ticks + global_end = song_data['metadata']['last_note'] # Last note timestamp in ticks + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes throughout the song + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat (resolution of the timing grid) + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Tick resolution for beats + instrument_list = sorted(list(song_data['notes'].keys())) # List of instruments in the song + + # --- process time signature --- # + # Process time signature changes and adjust them for the given song structure + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if no valid time signature changes found + + # --- create sequence --- # + final_sequence = [] # Initialize the final sequence to store the events + final_sequence.append(self.create_cp_sos_event()) # Add the Start-of-Sequence (SOS) event + chord_text = None # Placeholder for the current chord + tempo_text = None # Placeholder for the current tempo 
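+
+        # Illustrative note (added commentary, not original code): with num_features=8 the
+        # compound-token template holds the fields
+        #   ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'],
+        # so a metrical token produced below might look like
+        #   {'type': 'Metrical', 'beat': 'Beat_3', 'chord': 'Chord_C_M', 'tempo': 'Tempo_120',
+        #    'instrument': 0, 'pitch': 0, 'duration': 0, 'velocity': 0}
+        # and a note token like
+        #   {'type': 'Note', 'beat': 0, 'chord': 0, 'tempo': 0, 'instrument': 'Instrument_0',
+        #    'pitch': 'Note_Pitch_60', 'duration': 'Note_Duration_4', 'velocity': 'Note_Velocity_80'};
+        # unused slots keep the template default of 0.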
+ + # Loop through each time signature change and process the corresponding measures + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to track when time signature changes + # Calculate bar resolution (number of ticks per bar based on the time signature) + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format the time signature as a string + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate number of ticks per bar + bar_start_tick = time_signature_changes[idx].time # Starting tick for this time signature + + # Determine the point for the next time signature change or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx + 1].time + + # Iterate over each measure (bar) between the current and next time signature change + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + empty_bar_token = self.create_cp_bar_event() # Create an empty bar event + + # Check if the last four events in the sequence are consecutive empty bars + if len(final_sequence) >= 4: + if not (final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token): + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + else: + if time_sig_change_flag: + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + else: + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + + # Reset the time signature change flag after handling the bar event + time_sig_change_flag = False + + # Loop through beats in each measure based on the in-beat resolution + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + chord_tempo_flag = False # Flag to track if chord and tempo events are added + events_list = [] # List to hold events for the current beat + pos_text = 'Beat_' + str(in_beat_off_idx) # Create a beat event label + + # --- chord & tempo processing --- # + # Unpack chords and tempos for the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = song_data['tempos'].get(beat_step) + + # If a chord is present, extract its root, quality, and bass + if self.num_features in {7, 8}: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') + chord_text = 'Chord_' + root + '_' + quality + + # If a tempo is present, format it as a string + if t_tempos is not None: + tempo_text = 'Tempo_' + str(t_tempos[-1].tempo) + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # --- instrument & note processing --- # + # Loop through each instrument and process its notes at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If notes are present, process them + if t_notes != None: + # Track notes and their properties (duration and velocity) for the current instrument + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + 
instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if not chord_tempo_flag: + if chord_text == None: + chord_text = 'Chord_N_N' + if tempo_text == None: + tempo_text = 'Tempo_N_N' + chord_tempo_flag = True + + events_list.append(self.create_cp_metrical_event(pos_text, chord_text, tempo_text)) + + # add instrument and note + for instrument_idx in pruned_instrument_note_dict: + instrument_name = 'Instrument_' + str(instrument_idx) + for pitch in pruned_instrument_note_dict[instrument_idx]: + max_duration = max(pruned_instrument_note_dict[instrument_idx][pitch], key=lambda x: x[0]) + note_pitch_text = 'Note_Pitch_' + str(pitch) + note_duration_text = 'Note_Duration_' + str(max_duration[0]) + note_velocity_text = 'Note_Velocity_' + str(max_duration[1]) + events_list.append(self.create_cp_note_event(instrument_name, note_pitch_text, note_duration_text, note_velocity_text)) + + # If there are any events for this beat, add them to the final sequence + if len(events_list) > 0: + final_sequence.extend(events_list) + + # --- end with BAR & EOS --- # + final_sequence.append(self.create_cp_bar_event()) # Add the final bar event + final_sequence.append(self.create_cp_eos_event()) # Add the End-of-Sequence (EOS) event + return final_sequence # Return the final sequence of events + +################################# for NB style encoding ################################# + +class Corpus2event_nb(Corpus2event_cp): + def __init__(self, num_features): + ''' + For convenience in logging, we use "type" word for "metric" sub-token in the code to compare easily with other encoding schemes + ''' + super().__init__(num_features) + self.num_features = num_features + self._init_event_template() + + def _init_event_template(self): + self.event_template = {} + if self.num_features == 8: + feature_names = ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'] + elif self.num_features == 7: + feature_names = ['type', 'beat', 'chord', 'tempo', 'pitch', 'duration', 'velocity'] + elif self.num_features == 5: + feature_names = ['type', 'beat', 'instrument', 'pitch', 'duration'] + elif self.num_features == 4: + feature_names = ['type', 'beat', 'pitch', 'duration'] + for feature_name in feature_names: + self.event_template[feature_name] = 0 + + def create_nb_sos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'SOS' + return total_event + + def create_nb_eos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'EOS' + return total_event + + def create_nb_event(self, bar_beat_type, pos, chord, tempo, instrument_name, pitch, duration, velocity): + total_event = self.event_template.copy() + total_event['type'] = bar_beat_type + total_event['beat'] = pos + total_event['pitch'] = pitch + total_event['duration'] = duration + if self.num_features in {5, 8}: + total_event['instrument'] = instrument_name + if self.num_features in {7, 8}: + total_event['chord'] = chord + total_event['tempo'] = tempo + total_event['velocity'] = velocity + return total_event + + def create_nb_empty_bar_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'Empty_Bar' + return total_event + + def 
get_bar_beat_idx(self, bar_flag, beat_flag, time_sig_name, time_sig_change_flag): + ''' + This function is to get the metric information for the current bar and beat + There are four types of metric information: NNN, SNN, SSN, SSS + Each letter represents the change of time signature, bar, and beat (new or same) + ''' + if time_sig_change_flag: # new time signature + return "NNN_" + time_sig_name + else: + if bar_flag and beat_flag: # same time sig & new bar & new beat + return "SNN" + elif not bar_flag and beat_flag: # same time sig & same bar & new beat + return "SSN" + elif not bar_flag and not beat_flag: # same time sig & same bar & same beat + return "SSS" + + def __call__(self, song_data, in_beat_resolution:int): + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # First note timestamp in ticks + global_end = song_data['metadata']['last_note'] # Last note timestamp in ticks + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes throughout the song + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat (resolution of the timing grid) + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Tick resolution for beats + instrument_list = sorted(list(song_data['notes'].keys())) # List of instruments in the song + + # --- process time signature --- # + # Process time signature changes and adjust them for the given song structure + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if no valid time signature changes found + + # --- create sequence --- # + final_sequence = [] # Initialize the final sequence to store the events + final_sequence.append(self.create_nb_sos_event()) # Add the Start-of-Sequence (SOS) event + chord_text = None # Placeholder for the current chord + tempo_text = None # Placeholder for the current tempo + + # Loop through each time signature change and process the corresponding measures + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to track when time signature changes + # Calculate bar resolution (number of ticks per bar based on the time signature) + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format the time signature as a string + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate number of ticks per bar + bar_start_tick = time_signature_changes[idx].time # Starting tick for this time signature + + # Determine the point for the next time signature change or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx + 1].time + + # Iterate over each measure (bar) between the current and next time signature change + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + bar_flag = True + note_flag = False + + # Loop through beats in each measure based on the in-beat resolution + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + beat_flag = True + events_list = [] + pos_text = 'Beat_' + str(in_beat_off_idx) + + # --- chord & tempo processing --- # + # Unpack chords and tempos for the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = 
song_data['tempos'].get(beat_step) + + # If a chord is present, extract its root, quality, and bass + if self.num_features == 8 or self.num_features == 7: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') + chord_text = 'Chord_' + root + '_' + quality + + # If a tempo is present, format it as a string + if t_tempos is not None: + tempo_text = 'Tempo_' + str(t_tempos[-1].tempo) + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # --- instrument & note processing --- # + # Loop through each instrument and process its notes at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If notes are present, process them + if t_notes != None: + note_flag = True + + # Track notes and their properties (duration and velocity) for the current instrument + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + # # Check for half-step interval gaps and handle them accordingly + # self._half_step_interval_gap_check(instrument_note_dict, instrument_idx) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if chord_text == None: + chord_text = 'Chord_N_N' + if tempo_text == None: + tempo_text = 'Tempo_N_N' + + # add instrument and note + for instrument_idx in pruned_instrument_note_dict: + instrument_name = 'Instrument_' + str(instrument_idx) + for pitch in pruned_instrument_note_dict[instrument_idx]: + max_duration = max(pruned_instrument_note_dict[instrument_idx][pitch], key=lambda x: x[0]) + note_pitch_text = 'Note_Pitch_' + str(pitch) + note_duration_text = 'Note_Duration_' + str(max_duration[0]) + note_velocity_text = 'Note_Velocity_' + str(max_duration[1]) + bar_beat_type = self.get_bar_beat_idx(bar_flag, beat_flag, time_sig_name, time_sig_change_flag) + events_list.append(self.create_nb_event(bar_beat_type, pos_text, chord_text, tempo_text, instrument_name, note_pitch_text, note_duration_text, note_velocity_text)) + bar_flag = False + beat_flag = False + time_sig_change_flag = False + + # If there are any events for this beat, add them to the final sequence + if events_list != None and len(events_list): + final_sequence.extend(events_list) + + # when there is no note in this bar + if not note_flag: + # avoid consecutive empty bars (more than 4 is not allowed) + empty_bar_token = self.create_nb_empty_bar_event() + if len(final_sequence) >= 4: + if final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token: + continue + final_sequence.append(empty_bar_token) + + # --- end with BAR & EOS --- # + final_sequence.append(self.create_nb_eos_event()) + return final_sequence \ No newline at end of file diff --git a/data_representation/step1_midi2corpus.py b/data_representation/step1_midi2corpus.py new file mode 100644 index 0000000..815fc8f --- /dev/null +++ b/data_representation/step1_midi2corpus.py @@ -0,0 +1,650 @@ +import argparse +import time +import itertools 
+import copy +from copy import deepcopy +from pathlib import Path +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction +from typing import List +import os +from muspy import sort +import numpy as np +import pickle +from tqdm import tqdm + +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument +from chorder import Dechorder + +from constants import NUM2PITCH, PROGRAM_INSTRUMENT_MAP, INSTRUMENT_PROGRAM_MAP + +''' +This script is designed to preprocess MIDI files and convert them into a structured corpus suitable for symbolic music analysis or model training. +It handles various tasks, including setting beat resolution, calculating duration, velocity, and tempo bins, and processing MIDI data into quantized musical events. +''' + +def get_tempo_bin(max_tempo:int, ratio:float=1.1): + bpm = 30 + regular_tempo_bins = [bpm] + while bpm < max_tempo: + bpm *= ratio + bpm = round(bpm) + if bpm > max_tempo: + break + regular_tempo_bins.append(bpm) + return np.array(regular_tempo_bins) + +def split_markers(markers:List[miditoolkit.midi.containers.Marker]): + ''' + split markers into chord, tempo, label + ''' + chords = [] + for marker in markers: + splitted_text = marker.text.split('_') + if splitted_text[0] != 'global' and 'Boundary' not in splitted_text[0]: + chords.append(marker) + return chords + +class CorpusMaker(): + def __init__( + self, + dataset_name:str, + num_features:int, + in_dir:Path, + out_dir:Path, + debug:bool + ): + ''' + Initialize the CorpusMaker with dataset information and directory paths. + It sets up MIDI paths, output directories, and debug mode, then + retrieves the beat resolution, duration bins, velocity/tempo bins, and prepares the MIDI file list. + ''' + self.dataset_name = dataset_name + self.num_features = num_features + self.midi_path = in_dir / f"{dataset_name}" + self.out_dir = out_dir + self.debug = debug + self._get_in_beat_resolution() + self._get_duration_bins() + self._get_velocity_tempo_bins() + self._get_min_max_last_time() + self._prepare_midi_list() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def _get_duration_bins(self): + # Set up regular duration bins for quantizing note lengths, based on the beat resolution. + base_duration = {4:[1,2,3,4,5,6,8,10,12,16,20,24,28,32], + 8:[1,2,3,4,6,8,10,12,14,16,20,24,28,32,36,40,48,56,64], + 12:[1,2,3,4,6,9,12,15,18,24,30,36,42,48,54,60,72,84,96]} + base_duration_list = base_duration[self.in_beat_resolution] + self.regular_duration_bins = np.array(base_duration_list) + + def _get_velocity_tempo_bins(self): + # Define velocity and tempo bins based on whether the dataset is a performance or score type. + midi_type_dict = {'BachChorale': 'score', 'Pop1k7': 'perform', 'Pop909': 'score', 'SOD': 'score', 'LakhClean': 'score', 'Symphony': 'score'} + try: + midi_type = midi_type_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. 
use the setting of LakhClean") + midi_type = midi_type_dict['LakhClean'] + # For performance-type datasets, set finer granularity of velocity and tempo bins. + if midi_type == 'perform': + self.regular_velocity_bins = np.array(list(range(40, 128, 8)) + [127]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=240, ratio=1.04) + # For score-type datasets, use coarser velocity and tempo bins. + elif midi_type == 'score': + self.regular_velocity_bins = np.array([40, 60, 80, 100, 120]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=390, ratio=1.04) + + def _get_min_max_last_time(self): + ''' + Set the minimum and maximum allowed length of a MIDI track, depending on the dataset. + 0 to 2000 means no limitation + ''' + # last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (60, 600), 'Symphony': (60, 1500)} + last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (0, 2000), 'Symphony': (60, 1500)} + try: + self.min_last_time, self.max_last_time = last_time_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.min_last_time, self.max_last_time = last_time_dict['LakhClean'] + + def _prepare_midi_list(self): + midi_path = Path(self.midi_path) + # detect subdirectories and get all midi files + if not midi_path.exists(): + raise ValueError(f"midi_path {midi_path} does not exist") + # go though all subdirectories and get all midi files + midi_files = [] + for root, _, files in os.walk(midi_path): + for file in files: + if file.endswith('.mid'): + # print(Path(root) / file) + midi_files.append(Path(root) / file) + self.midi_list = midi_files + print(f"Found {len(self.midi_list)} MIDI files in {midi_path}") + + def make_corpus(self) -> None: + ''' + Main method to process the MIDI files and create the corpus data. + It supports both single-processing (debug mode) and multi-processing for large datasets. + ''' + print("preprocessing midi data to corpus data") + # check the corpus folder is already exist and make it if not + Path(self.out_dir).mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"corpus_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"midi_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + start_time = time.time() + if self.debug: + # single processing for debugging + broken_counter = 0 + success_counter = 0 + for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + message = self._mp_midi2corpus(file_path) + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + else: + # Multi-threaded processing for faster corpus generation. 
+ broken_counter = 0 + success_counter = 0 + # filter out processed files + print(self.out_dir) + processed_files = list(Path(self.out_dir).glob(f"midi_{self.dataset_name}/*.mid")) + processed_files = [x.name for x in processed_files] + print(f"processed files: {len(processed_files)}") + print("length of midi list: ", len(self.midi_list)) + # Use set for faster lookup (O(1) per check) + processed_files_set = set(processed_files) + self.midi_list = [x for x in self.midi_list if x.name not in processed_files_set] + # reverse the list to process the latest files first + self.midi_list.reverse() + print(f"length of midi list after filtering: ", len(self.midi_list)) + with Pool(16) as p: + for message in tqdm(p.imap(self._mp_midi2corpus, self.midi_list, 1000), total=len(self.midi_list)): + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + # for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + # message = self._mp_midi2corpus(file_path) + # if message == "error": + # broken_counter += 1 + # elif message == "success": + # success_counter += 1 + print(f"Making corpus takes: {time.time() - start_time}s, success: {success_counter}, broken: {broken_counter}") + + def _mp_midi2corpus(self, file_path: Path): + """Convert MIDI to corpus format and save both corpus (.pkl) and MIDI (.mid).""" + try: + midi_obj = self._analyze(file_path) + corpus, midi_obj = self._midi2corpus(midi_obj) + # --- 1. Save corpus (.pkl) --- + relative_path = file_path.relative_to(self.midi_path) # Get relative path from input dir + safe_name = str(relative_path).replace("/", "_").replace("\\", "_").replace(".mid", ".pkl") + save_path = Path(self.out_dir) / f"corpus_{self.dataset_name}" / safe_name + save_path.parent.mkdir(parents=True, exist_ok=True) # Ensure dir exists + with save_path.open("wb") as f: + pickle.dump(corpus, f) + + # --- 2. Save MIDI (.mid) --- + midi_save_dir = Path("../dataset/represented_data/corpus") / f"midi_{self.dataset_name}" + midi_save_dir.mkdir(parents=True, exist_ok=True) + midi_save_path = midi_save_dir / file_path.name # Keep original MIDI filename + midi_obj.dump(midi_save_path) + + del midi_obj, corpus + return "success" + + except (OSError, EOFError, ValueError, KeyError, AssertionError) as e: + print(f"Error processing {file_path.name}: {e}") + return "error" + except Exception as e: + print(f"Unexpected error in {file_path.name}: {e}") + return "error" + def _check_length(self, last_time:float): + if last_time < self.min_last_time: + raise ValueError(f"last time {last_time} is out of range") + + def _analyze(self, midi_path:Path): + # Loads and analyzes a MIDI file, performing various checks and extracting chords. 
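+        # Illustrative note (added commentary, not original code): the chords extracted
+        # further down are stored as miditoolkit Marker objects whose text has the form
+        # 'root_quality_bass', e.g. Marker(time=0, text='C_M_C') for a C major chord over a
+        # C bass, or 'N_N_N' when Dechorder finds no complete chord for that beat.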
+ midi_obj = miditoolkit.midi.parser.MidiFile(midi_path) + + # check length + mapping = midi_obj.get_tick_to_time_mapping() + last_time = mapping[midi_obj.max_tick] + self._check_length(last_time) + + for ins in midi_obj.instruments: + # delete instrument with no notes + if len(ins.notes) == 0: + midi_obj.instruments.remove(ins) + continue + notes = ins.notes + notes = sorted(notes, key=lambda x: (x.start, x.pitch)) + + # three steps to merge instruments + self._merge_percussion(midi_obj) + self._pruning_instrument(midi_obj) + self._limit_max_track(midi_obj) + + if self.num_features == 7 or self.num_features == 8: + # in case of 7 or 8 features, we need to extract chords + new_midi_obj = self._pruning_notes_for_chord_extraction(midi_obj) + chords = Dechorder.dechord(new_midi_obj) + markers = [] + for cidx, chord in enumerate(chords): + if chord.is_complete(): + chord_text = NUM2PITCH[chord.root_pc] + '_' + chord.quality + '_' + NUM2PITCH[chord.bass_pc] + else: + chord_text = 'N_N_N' + markers.append(Marker(time=int(cidx*new_midi_obj.ticks_per_beat), text=chord_text)) + + # de-duplication + prev_chord = None + dedup_chords = [] + for m in markers: + if m.text != prev_chord: + prev_chord = m.text + dedup_chords.append(m) + + # return midi + midi_obj.markers = dedup_chords + return midi_obj + + def _pruning_grouped_notes_from_quantization(self, instr_grid:dict): + ''' + In case where notes are grouped in the same quant_time but with different start time, unintentional chords are created + rule1: if notes have half step interval, delete the shorter one + rule2: if notes do not share 50% of duration of the shorter note, delete the shorter one + ''' + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + for time in time_list: + notes = instr_grid[instr][time] + if len(notes) == 1: + continue + else: + new_notes = [] + # sort in pitch with ascending order + notes.sort(key=lambda x: x.pitch) + for i in range(len(notes)-1): + # if start time is same add to new_notes + if notes[i].start == notes[i+1].start: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + continue + if notes[i].pitch == notes[i+1].pitch or notes[i].pitch + 1 == notes[i+1].pitch: + # select longer note + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + # check how much duration they share + shared_duration = min(notes[i].end, notes[i+1].end) - max(notes[i].start, notes[i+1].start) + shorter_duration = min(notes[i].end - notes[i].start, notes[i+1].end - notes[i+1].start) + # unless they share more than 80% of duration, select longer note (pruning shorter note) + if shared_duration / shorter_duration < 0.8: + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + if len(new_notes) == 0: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + else: + new_notes.append(notes[i+1]) + instr_grid[instr][time] = new_notes + + def _midi2corpus(self, midi_obj:miditoolkit.midi.parser.MidiFile): + # Checks if the ticks per beat in the MIDI file is lower than the expected resolution. + # If it is, raise an error. + if midi_obj.ticks_per_beat < self.in_beat_resolution: + raise ValueError(f'[x] Irregular ticks_per_beat. {midi_obj.ticks_per_beat}') + + # Ensure there is at least one time signature change in the MIDI file. 
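+
+        # Illustrative note (added commentary, not original code): the quantisation grid used
+        # later in this method works in fractions of a beat. For example, with
+        # ticks_per_beat=480 and in_beat_resolution=12, in_beat_tick_resol = Fraction(480, 12)
+        # = 40 ticks, so a note starting at tick 95 snaps to int(round(95 / 40)) * 40 = 80,
+        # and a 130-tick note gets relative_duration = round(130 / 40) = 3, which is then
+        # mapped onto the nearest value in regular_duration_bins.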
+ # if len(midi_obj.time_signature_changes) == 0: + # raise ValueError('[x] No time_signature_changes') + + # Ensure there are no duplicated time signature changes. + # time_list = [ts.time for ts in midi_obj.time_signature_changes] + # if len(time_list) != len(set(time_list)): + # raise ValueError('[x] Duplicated time_signature_changes') + + # If the dataset is 'LakhClean' or 'SymphonyMIDI', verify there are at least 4 tracks. + # if self.dataset_name == 'LakhClean' or self.dataset_name == 'SymphonyMIDI': + # if len(midi_obj.instruments) < 4: + # raise ValueError('[x] We will use more than 4 tracks in Lakh Clean dataset.') + + # Calculate the resolution of ticks per beat as a fraction. + in_beat_tick_resol = Fraction(midi_obj.ticks_per_beat, self.in_beat_resolution) + + # Extract the initial time signature (numerator and denominator) and calculate the number of ticks for the first bar. + if len(midi_obj.time_signature_changes) != 0: + initial_numerator = midi_obj.time_signature_changes[0].numerator + initial_denominator = midi_obj.time_signature_changes[0].denominator + else: + # If no time signature changes, set default values + initial_numerator = 4 + initial_denominator = 4 + first_bar_resol = int(midi_obj.ticks_per_beat * initial_numerator * (4 / initial_denominator)) + + # --- load notes --- # + instr_notes = self._make_instr_notes(midi_obj) + # --- load information --- # + # load chords, labels + chords = split_markers(midi_obj.markers) + chords.sort(key=lambda x: x.time) + + + # load tempos + tempos = midi_obj.tempo_changes if len(midi_obj.tempo_changes) > 0 else [] + if len(tempos) == 0: + # if no tempo changes, set the default tempo to 120 BPM + tempos = [miditoolkit.midi.containers.TempoChange(time=0, tempo=120)] + tempos.sort(key=lambda x: x.time) + + # --- process items to grid --- # + # compute empty bar offset at head + first_note_time = min([instr_notes[k][0].start for k in instr_notes.keys()]) + last_note_time = max([instr_notes[k][-1].start for k in instr_notes.keys()]) + + quant_time_first = int(round(first_note_time / in_beat_tick_resol)) * in_beat_tick_resol + offset = quant_time_first // first_bar_resol # empty bar + offset_by_resol = offset * first_bar_resol + # --- process notes --- # + instr_grid = dict() + for key in instr_notes.keys(): + notes = instr_notes[key] + note_grid = defaultdict(list) + for note in notes: + # skip notes out of range, below C-1 and above C8 + if note.pitch < 12 or note.pitch >= 120: + continue + + # in case when the first note starts at slightly before the first bar + note.start = note.start - offset_by_resol if note.start - offset_by_resol > 0 else 0 + note.end = note.end - offset_by_resol if note.end - offset_by_resol > 0 else 0 + + # relative duration + # skip note with 0 duration + note_duration = note.end - note.start + relative_duration = round(note_duration / in_beat_tick_resol) + if relative_duration == 0: + continue + if relative_duration > self.in_beat_resolution * 8: # 8 beats + relative_duration = self.in_beat_resolution * 8 + + # use regular duration bins + note.quantized_duration = self.regular_duration_bins[np.argmin(abs(self.regular_duration_bins-relative_duration))] + + # quantize start time + quant_time = int(round(note.start / in_beat_tick_resol)) * in_beat_tick_resol + + # velocity + note.velocity = self.regular_velocity_bins[ + np.argmin(abs(self.regular_velocity_bins-note.velocity))] + + # append + note_grid[quant_time].append(note) + + # set to track + instr_grid[key] = note_grid + + # --- pruning grouped notes --- 
# + self._pruning_grouped_notes_from_quantization(instr_grid) + + # --- process chords --- # + chord_grid = defaultdict(list) + for chord in chords: + # quantize + chord.time = chord.time - offset_by_resol + chord.time = 0 if chord.time < 0 else chord.time + quant_time = int(round(chord.time / in_beat_tick_resol)) * in_beat_tick_resol + chord_grid[quant_time].append(chord) + + # --- process tempos --- # + + first_notes_list = [] + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + if len(time_list) == 0: # 跳过空轨道 + continue + first_quant_time = time_list[0] + first_notes_list.append(first_quant_time) + + # 处理全空情况 + if not first_notes_list: + raise ValueError("[x] No valid notes found in any instrument track.") + quant_first_note_time = min(first_notes_list) + tempo_grid = defaultdict(list) + for tempo in tempos: + # quantize + tempo.time = tempo.time - offset_by_resol if tempo.time - offset_by_resol > 0 else 0 + quant_time = int(round(tempo.time / in_beat_tick_resol)) * in_beat_tick_resol + tempo.tempo = self.regular_tempo_bins[ + np.argmin(abs(self.regular_tempo_bins-tempo.tempo))] + if quant_time < quant_first_note_time: + tempo_grid[quant_first_note_time].append(tempo) + else: + tempo_grid[quant_time].append(tempo) + if len(tempo_grid[quant_first_note_time]) > 1: + tempo_grid[quant_first_note_time] = [tempo_grid[quant_first_note_time][-1]] + # --- process time signature --- # + quant_time_signature = deepcopy(midi_obj.time_signature_changes) + quant_time_signature.sort(key=lambda x: x.time) + for ts in quant_time_signature: + ts.time = ts.time - offset_by_resol if ts.time - offset_by_resol > 0 else 0 + ts.time = int(round(ts.time / in_beat_tick_resol)) * in_beat_tick_resol + + # --- make new midi object to check processed values --- # + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + for instr_idx in instr_grid.keys(): + new_instrument = Instrument(program=instr_idx) + new_instrument.notes = [y for x in instr_grid[instr_idx].values() for y in x] + new_midi_obj.instruments.append(new_instrument) + new_midi_obj.markers = [y for x in chord_grid.values() for y in x] + new_midi_obj.tempo_changes = [y for x in tempo_grid.values() for y in x] + new_midi_obj.time_signature_changes = midi_obj.time_signature_changes + + # make corpus + song_data = { + 'notes': instr_grid, + 'chords': chord_grid, + 'tempos': tempo_grid, + 'metadata': { + 'first_note': first_note_time, + 'last_note': last_note_time, + 'time_signature': quant_time_signature, + 'ticks_per_beat': midi_obj.ticks_per_beat, + } + } + return song_data, new_midi_obj + + def _make_instr_notes(self, midi_obj): + ''' + This part is important, we can use three different ways to merge instruments + 1st option: compare the number of notes and choose tracks with more notes + 2nd option: merge all instruments with the same tracks + 3rd option: leave all instruments as they are. 
differentiate tracks with different track number + + In this version we choose to use the 2nd option as it helps to reduce the number of tracks and sequence length + ''' + instr_notes = defaultdict(list) + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name is None: + continue + new_instr_idx = INSTRUMENT_PROGRAM_MAP[instr_name] + instr_notes[new_instr_idx].extend(instr.notes) + instr_notes[new_instr_idx].sort(key=lambda x: (x.start, -x.pitch)) + return instr_notes + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _merge_percussion(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge drum track to one track + ''' + drum_0_lst = [] + new_instruments = [] + for instrument in midi_obj.instruments: + if len(instrument.notes) == 0: + continue + if instrument.is_drum: + drum_0_lst.extend(instrument.notes) + else: + new_instruments.append(instrument) + if len(drum_0_lst) > 0: + drum_0_lst.sort(key=lambda x: x.start) + # remove duplicate + drum_0_lst = list(k for k, _ in itertools.groupby(drum_0_lst)) + drum_0_instrument = Instrument(program=114, is_drum=True, name="percussion") + drum_0_instrument.notes = drum_0_lst + new_instruments.append(drum_0_instrument) + midi_obj.instruments = new_instruments + + # referred to mmt "https://github.com/salu133445/mmt" + def _pruning_instrument(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge instrument number with similar intrument category + ex. 0: Acoustic Grand Piano, 1: Bright Acoustic Piano, 2: Electric Grand Piano into 0: Acoustic Grand Piano + ''' + new_instruments = [] + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name != None: + new_instruments.append(instr) + midi_obj.instruments = new_instruments + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _limit_max_track(self, midi_obj:miditoolkit.midi.parser.MidiFile, MAX_TRACK:int=16): + ''' + merge track with least notes to other track with same program + and limit the maximum amount of track to 16 + ''' + if len(midi_obj.instruments) == 1: + if midi_obj.instruments[0].is_drum: + midi_obj.instruments[0].program = 114 + midi_obj.instruments[0].is_drum = False + return midi_obj + good_instruments = midi_obj.instruments + good_instruments.sort( + key=lambda x: (not x.is_drum, -len(x.notes))) # place drum track or the most note track at first + assert good_instruments[0].is_drum == True or len(good_instruments[0].notes) >= len( + good_instruments[1].notes), tuple(len(x.notes) for x in good_instruments[:3]) + # assert good_instruments[0].is_drum == False, (, len(good_instruments[2])) + track_idx_lst = list(range(len(good_instruments))) + if len(good_instruments) > MAX_TRACK: + new_good_instruments = copy.deepcopy(good_instruments[:MAX_TRACK]) + # print(midi_file_path) + for id in track_idx_lst[MAX_TRACK:]: + cur_ins = good_instruments[id] + merged = False + new_good_instruments.sort(key=lambda x: len(x.notes)) + for nid, ins in enumerate(new_good_instruments): + if cur_ins.program == ins.program and cur_ins.is_drum == ins.is_drum: + new_good_instruments[nid].notes.extend(cur_ins.notes) + merged = True + break + if not merged: + pass + good_instruments = new_good_instruments + + assert len(good_instruments) <= MAX_TRACK, len(good_instruments) + for idx, good_instrument in enumerate(good_instruments): + if 
good_instrument.is_drum: + good_instruments[idx].program = 114 + good_instruments[idx].is_drum = False + midi_obj.instruments = good_instruments + + def _pruning_notes_for_chord_extraction(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + extract notes for chord extraction + ''' + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + new_instrument = Instrument(program=0, is_drum=False, name="for_chord") + new_instruments = [] + new_notes = [] + for instrument in midi_obj.instruments: + if instrument.program == 114 or instrument.is_drum: # pass drum track + continue + valid_notes = [note for note in instrument.notes if note.pitch >= 21 and note.pitch <= 108] + new_notes.extend(valid_notes) + new_notes.sort(key=lambda x: x.start) + new_instrument.notes = new_notes + new_instruments.append(new_instrument) + new_midi_obj.instruments = new_instruments + return new_midi_obj + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + corpus_maker = CorpusMaker(args.dataset, args.num_features, args.in_dir, args.out_dir, args.debug) + corpus_maker.make_corpus() + +if __name__ == "__main__": + main() +# python3 step1_midi2corpus.py --dataset SOD --num_features 5 +# python3 step2_corpus2event.py --dataset LakhClean --num_features 5 --encoding nb +# python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb +# python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb \ No newline at end of file diff --git a/data_representation/step1_midi2corpus_fined.py b/data_representation/step1_midi2corpus_fined.py new file mode 100644 index 0000000..e42cbfa --- /dev/null +++ b/data_representation/step1_midi2corpus_fined.py @@ -0,0 +1,654 @@ +import argparse +import time +import itertools +import copy +from copy import deepcopy +from pathlib import Path +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction +from typing import List +import os +from muspy import sort +import numpy as np +import pickle +from tqdm import tqdm + +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument +from chorder import Dechorder + +from constants import NUM2PITCH,FINED_PROGRAM_INSTRUMENT_MAP, INSTRUMENT_PROGRAM_MAP + +''' +This script is designed to preprocess MIDI files and convert them into a structured corpus suitable for symbolic music analysis or model training. +It handles various tasks, including setting beat resolution, calculating duration, velocity, and tempo bins, and processing MIDI data into quantized musical events. +We dont do instrument merging here. 
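+ Compared with step1_midi2corpus.py, _make_instr_notes keeps each note's original program number (FINED_PROGRAM_INSTRUMENT_MAP is only used to filter out unsupported programs), so related programs are not remapped onto a single coarse instrument.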
+''' + +def get_tempo_bin(max_tempo:int, ratio:float=1.1): + bpm = 30 + regular_tempo_bins = [bpm] + while bpm < max_tempo: + bpm *= ratio + bpm = round(bpm) + if bpm > max_tempo: + break + regular_tempo_bins.append(bpm) + return np.array(regular_tempo_bins) + +def split_markers(markers:List[miditoolkit.midi.containers.Marker]): + ''' + split markers into chord, tempo, label + ''' + chords = [] + for marker in markers: + splitted_text = marker.text.split('_') + if splitted_text[0] != 'global' and 'Boundary' not in splitted_text[0]: + chords.append(marker) + return chords + +class CorpusMaker(): + def __init__( + self, + dataset_name:str, + num_features:int, + in_dir:Path, + out_dir:Path, + debug:bool + ): + ''' + Initialize the CorpusMaker with dataset information and directory paths. + It sets up MIDI paths, output directories, and debug mode, then + retrieves the beat resolution, duration bins, velocity/tempo bins, and prepares the MIDI file list. + ''' + self.dataset_name = dataset_name + self.num_features = num_features + self.midi_path = in_dir / f"{dataset_name}" + self.out_dir = out_dir + self.debug = debug + self._get_in_beat_resolution() + self._get_duration_bins() + self._get_velocity_tempo_bins() + self._get_min_max_last_time() + self._prepare_midi_list() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def _get_duration_bins(self): + # Set up regular duration bins for quantizing note lengths, based on the beat resolution. + base_duration = {4:[1,2,3,4,5,6,8,10,12,16,20,24,28,32], + 8:[1,2,3,4,6,8,10,12,14,16,20,24,28,32,36,40,48,56,64], + 12:[1,2,3,4,6,9,12,15,18,24,30,36,42,48,54,60,72,84,96]} + base_duration_list = base_duration[self.in_beat_resolution] + self.regular_duration_bins = np.array(base_duration_list) + + def _get_velocity_tempo_bins(self): + # Define velocity and tempo bins based on whether the dataset is a performance or score type. + midi_type_dict = {'BachChorale': 'score', 'Pop1k7': 'perform', 'Pop909': 'score', 'SOD': 'score', 'LakhClean': 'score', 'Symphony': 'score'} + try: + midi_type = midi_type_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + midi_type = midi_type_dict['LakhClean'] + # For performance-type datasets, set finer granularity of velocity and tempo bins. + if midi_type == 'perform': + self.regular_velocity_bins = np.array(list(range(40, 128, 8)) + [127]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=240, ratio=1.04) + # For score-type datasets, use coarser velocity and tempo bins. + elif midi_type == 'score': + self.regular_velocity_bins = np.array([40, 60, 80, 100, 120]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=390, ratio=1.04) + + def _get_min_max_last_time(self): + ''' + Set the minimum and maximum allowed length of a MIDI track, depending on the dataset. 
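+ Values are in seconds: _analyze derives the track length from get_tick_to_time_mapping(), and _check_length currently rejects only files shorter than min_last_time.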
+ 0 to 2000 means no limitation + ''' + # last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (60, 600), 'Symphony': (60, 1500)} + last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (0, 2000), 'Symphony': (60, 1500)} + try: + self.min_last_time, self.max_last_time = last_time_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.min_last_time, self.max_last_time = last_time_dict['LakhClean'] + + def _prepare_midi_list(self): + midi_path = Path(self.midi_path) + # detect subdirectories and get all midi files + if not midi_path.exists(): + raise ValueError(f"midi_path {midi_path} does not exist") + # go though all subdirectories and get all midi files + midi_files = [] + for root, _, files in os.walk(midi_path): + for file in files: + if file.endswith('.mid') or file.endswith('.midi') or file.endswith('.MID'): + # print(Path(root) / file) + midi_files.append(Path(root) / file) + self.midi_list = midi_files + print(f"Found {len(self.midi_list)} MIDI files in {midi_path}") + + def make_corpus(self) -> None: + ''' + Main method to process the MIDI files and create the corpus data. + It supports both single-processing (debug mode) and multi-processing for large datasets. + ''' + print("preprocessing midi data to corpus data") + # check the corpus folder is already exist and make it if not + Path(self.out_dir).mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"corpus_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"midi_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + start_time = time.time() + if self.debug: + # single processing for debugging + broken_counter = 0 + success_counter = 0 + for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + message = self._mp_midi2corpus(file_path) + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + else: + # Multi-threaded processing for faster corpus generation. 
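+ # Each worker process runs self._mp_midi2corpus on one MIDI path and returns
+ # "success" or "error"; Pool.imap streams results back in input order, so the
+ # tqdm bar can advance while later files are still being converted. The third
+ # argument to imap (500 below) is the chunksize: paths are handed to workers in
+ # batches of 500 to reduce inter-process communication overhead.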
+ broken_counter = 0 + success_counter = 0 + # filter out processed files + print(self.out_dir) + processed_files = list(Path(self.out_dir).glob(f"midi_{self.dataset_name}/*.mid")) + processed_files = [x.name for x in processed_files] + print(f"processed files: {len(processed_files)}") + print("length of midi list: ", len(self.midi_list)) + # Use set for faster lookup (O(1) per check) + processed_files_set = set(processed_files) + # self.midi_list = [x for x in self.midi_list if x.name not in processed_files_set] + # reverse the list to process the latest files first + self.midi_list.reverse() + print(f"length of midi list after filtering: ", len(self.midi_list)) + with Pool(16) as p: + for message in tqdm(p.imap(self._mp_midi2corpus, self.midi_list, 500), total=len(self.midi_list)): + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + # for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + # message = self._mp_midi2corpus(file_path) + # if message == "error": + # broken_counter += 1 + # elif message == "success": + # success_counter += 1 + print(f"Making corpus takes: {time.time() - start_time}s, success: {success_counter}, broken: {broken_counter}") + + def _mp_midi2corpus(self, file_path: Path): + """Convert MIDI to corpus format and save both corpus (.pkl) and MIDI (.mid).""" + try: + midi_obj = self._analyze(file_path) + corpus, midi_obj = self._midi2corpus(midi_obj) + # --- 1. Save corpus (.pkl) --- + relative_path = file_path.relative_to(self.midi_path) # Get relative path from input dir + safe_name = str(relative_path).replace("/", "_").replace("\\", "_").replace(".mid", ".pkl") + save_path = Path(self.out_dir) / f"corpus_{self.dataset_name}" / safe_name + save_path.parent.mkdir(parents=True, exist_ok=True) # Ensure dir exists + with save_path.open("wb") as f: + pickle.dump(corpus, f) + + # --- 2. Save MIDI (.mid) --- + midi_save_dir = Path("../dataset/represented_data/corpus") / f"midi_{self.dataset_name}" + midi_save_dir.mkdir(parents=True, exist_ok=True) + midi_save_path = midi_save_dir / file_path.name # Keep original MIDI filename + midi_obj.dump(midi_save_path) + + del midi_obj, corpus + return "success" + + except (OSError, EOFError, ValueError, KeyError, AssertionError) as e: + print(f"Error processing {file_path.name}: {e}") + return "error" + except Exception as e: + print(f"Unexpected error in {file_path.name}: {e}") + return "error" + def _check_length(self, last_time:float): + if last_time < self.min_last_time: + raise ValueError(f"last time {last_time} is out of range") + + def _analyze(self, midi_path:Path): + # Loads and analyzes a MIDI file, performing various checks and extracting chords. 
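+ # Pipeline: (1) reject files shorter than min_last_time, (2) drop instruments with
+ # no notes, (3) merge all drum tracks into a single program-114 "percussion" track,
+ # (4) cap the number of tracks at 16, and, when 7 or 8 features are requested,
+ # (5) run chorder's Dechorder over a pruned copy of the notes and store the
+ # de-duplicated chord sequence as markers of the form '<root>_<quality>_<bass>'
+ # (e.g. 'C_M_C', assuming NUM2PITCH maps pitch classes to note names), with
+ # 'N_N_N' marking incomplete chords.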
+ midi_obj = miditoolkit.midi.parser.MidiFile(midi_path) + + # check length + mapping = midi_obj.get_tick_to_time_mapping() + last_time = mapping[midi_obj.max_tick] + self._check_length(last_time) + + for ins in midi_obj.instruments: + # delete instrument with no notes + if len(ins.notes) == 0: + midi_obj.instruments.remove(ins) + continue + notes = ins.notes + notes = sorted(notes, key=lambda x: (x.start, x.pitch)) + + # three steps to merge instruments + self._merge_percussion(midi_obj) + # self._pruning_instrument(midi_obj) + self._limit_max_track(midi_obj) + + if self.num_features == 7 or self.num_features == 8: + # in case of 7 or 8 features, we need to extract chords + new_midi_obj = self._pruning_notes_for_chord_extraction(midi_obj) + chords = Dechorder.dechord(new_midi_obj) + markers = [] + for cidx, chord in enumerate(chords): + if chord.is_complete(): + chord_text = NUM2PITCH[chord.root_pc] + '_' + chord.quality + '_' + NUM2PITCH[chord.bass_pc] + else: + chord_text = 'N_N_N' + markers.append(Marker(time=int(cidx*new_midi_obj.ticks_per_beat), text=chord_text)) + + # de-duplication + prev_chord = None + dedup_chords = [] + for m in markers: + if m.text != prev_chord: + prev_chord = m.text + dedup_chords.append(m) + + # return midi + midi_obj.markers = dedup_chords + return midi_obj + + def _pruning_grouped_notes_from_quantization(self, instr_grid:dict): + ''' + In case where notes are grouped in the same quant_time but with different start time, unintentional chords are created + rule1: if notes have half step interval, delete the shorter one + rule2: if notes do not share 50% of duration of the shorter note, delete the shorter one + ''' + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + for time in time_list: + notes = instr_grid[instr][time] + if len(notes) == 1: + continue + else: + new_notes = [] + # sort in pitch with ascending order + notes.sort(key=lambda x: x.pitch) + for i in range(len(notes)-1): + # if start time is same add to new_notes + if notes[i].start == notes[i+1].start: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + continue + if notes[i].pitch == notes[i+1].pitch or notes[i].pitch + 1 == notes[i+1].pitch: + # select longer note + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + # check how much duration they share + shared_duration = min(notes[i].end, notes[i+1].end) - max(notes[i].start, notes[i+1].start) + shorter_duration = min(notes[i].end - notes[i].start, notes[i+1].end - notes[i+1].start) + # unless they share more than 80% of duration, select longer note (pruning shorter note) + if shared_duration / shorter_duration < 0.8: + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + if len(new_notes) == 0: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + else: + new_notes.append(notes[i+1]) + instr_grid[instr][time] = new_notes + + def _midi2corpus(self, midi_obj:miditoolkit.midi.parser.MidiFile): + # Checks if the ticks per beat in the MIDI file is lower than the expected resolution. + # If it is, raise an error. + if midi_obj.ticks_per_beat < self.in_beat_resolution: + raise ValueError(f'[x] Irregular ticks_per_beat. {midi_obj.ticks_per_beat}') + + # Ensure there is at least one time signature change in the MIDI file. 
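+ # The validation checks below are commented out in this variant; when a file has no
+ # time-signature or tempo events, 4/4 and 120 BPM defaults are substituted further
+ # down instead of rejecting the file.
+ # Worked example of the quantization grid below, assuming ticks_per_beat = 480 and
+ # in_beat_resolution = 4: in_beat_tick_resol = Fraction(480, 4) = 120 ticks per grid
+ # step, first_bar_resol = 480 * 4 * (4/4) = 1920 ticks for a 4/4 bar, and a note
+ # starting at tick 250 is snapped to round(250 / 120) * 120 = 240.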
+ # if len(midi_obj.time_signature_changes) == 0: + # raise ValueError('[x] No time_signature_changes') + + # Ensure there are no duplicated time signature changes. + # time_list = [ts.time for ts in midi_obj.time_signature_changes] + # if len(time_list) != len(set(time_list)): + # raise ValueError('[x] Duplicated time_signature_changes') + + # If the dataset is 'LakhClean' or 'SymphonyMIDI', verify there are at least 4 tracks. + # if self.dataset_name == 'LakhClean' or self.dataset_name == 'SymphonyMIDI': + # if len(midi_obj.instruments) < 4: + # raise ValueError('[x] We will use more than 4 tracks in Lakh Clean dataset.') + + # Calculate the resolution of ticks per beat as a fraction. + in_beat_tick_resol = Fraction(midi_obj.ticks_per_beat, self.in_beat_resolution) + + # Extract the initial time signature (numerator and denominator) and calculate the number of ticks for the first bar. + if len(midi_obj.time_signature_changes) != 0: + initial_numerator = midi_obj.time_signature_changes[0].numerator + initial_denominator = midi_obj.time_signature_changes[0].denominator + else: + # If no time signature changes, set default values + initial_numerator = 4 + initial_denominator = 4 + first_bar_resol = int(midi_obj.ticks_per_beat * initial_numerator * (4 / initial_denominator)) + + # --- load notes --- # + instr_notes = self._make_instr_notes(midi_obj) + # --- load information --- # + # load chords, labels + chords = split_markers(midi_obj.markers) + chords.sort(key=lambda x: x.time) + + + # load tempos + tempos = midi_obj.tempo_changes if len(midi_obj.tempo_changes) > 0 else [] + if len(tempos) == 0: + # if no tempo changes, set the default tempo to 120 BPM + tempos = [miditoolkit.midi.containers.TempoChange(time=0, tempo=120)] + tempos.sort(key=lambda x: x.time) + + # --- process items to grid --- # + # compute empty bar offset at head + first_note_time = min([instr_notes[k][0].start for k in instr_notes.keys()]) + last_note_time = max([instr_notes[k][-1].start for k in instr_notes.keys()]) + + quant_time_first = int(round(first_note_time / in_beat_tick_resol)) * in_beat_tick_resol + offset = quant_time_first // first_bar_resol # empty bar + offset_by_resol = offset * first_bar_resol + # --- process notes --- # + instr_grid = dict() + for key in instr_notes.keys(): + notes = instr_notes[key] + note_grid = defaultdict(list) + for note in notes: + # skip notes out of range, below C-1 and above C8 + if note.pitch < 12 or note.pitch >= 120: + continue + + # in case when the first note starts at slightly before the first bar + note.start = note.start - offset_by_resol if note.start - offset_by_resol > 0 else 0 + note.end = note.end - offset_by_resol if note.end - offset_by_resol > 0 else 0 + + # relative duration + # skip note with 0 duration + note_duration = note.end - note.start + relative_duration = round(note_duration / in_beat_tick_resol) + if relative_duration == 0: + continue + if relative_duration > self.in_beat_resolution * 8: # 8 beats + relative_duration = self.in_beat_resolution * 8 + + # use regular duration bins + note.quantized_duration = self.regular_duration_bins[np.argmin(abs(self.regular_duration_bins-relative_duration))] + + # quantize start time + quant_time = int(round(note.start / in_beat_tick_resol)) * in_beat_tick_resol + + # velocity + note.velocity = self.regular_velocity_bins[ + np.argmin(abs(self.regular_velocity_bins-note.velocity))] + + # append + note_grid[quant_time].append(note) + + # set to track + instr_grid[key] = note_grid + + # --- pruning grouped notes --- 
# + self._pruning_grouped_notes_from_quantization(instr_grid) + + # --- process chords --- # + chord_grid = defaultdict(list) + for chord in chords: + # quantize + chord.time = chord.time - offset_by_resol + chord.time = 0 if chord.time < 0 else chord.time + quant_time = int(round(chord.time / in_beat_tick_resol)) * in_beat_tick_resol + chord_grid[quant_time].append(chord) + + # --- process tempos --- # + + first_notes_list = [] + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + if len(time_list) == 0: # 跳过空轨道 + continue + first_quant_time = time_list[0] + first_notes_list.append(first_quant_time) + + # 处理全空情况 + if not first_notes_list: + raise ValueError("[x] No valid notes found in any instrument track.") + quant_first_note_time = min(first_notes_list) + tempo_grid = defaultdict(list) + for tempo in tempos: + # quantize + tempo.time = tempo.time - offset_by_resol if tempo.time - offset_by_resol > 0 else 0 + quant_time = int(round(tempo.time / in_beat_tick_resol)) * in_beat_tick_resol + tempo.tempo = self.regular_tempo_bins[ + np.argmin(abs(self.regular_tempo_bins-tempo.tempo))] + if quant_time < quant_first_note_time: + tempo_grid[quant_first_note_time].append(tempo) + else: + tempo_grid[quant_time].append(tempo) + if len(tempo_grid[quant_first_note_time]) > 1: + tempo_grid[quant_first_note_time] = [tempo_grid[quant_first_note_time][-1]] + # --- process time signature --- # + quant_time_signature = deepcopy(midi_obj.time_signature_changes) + quant_time_signature.sort(key=lambda x: x.time) + for ts in quant_time_signature: + ts.time = ts.time - offset_by_resol if ts.time - offset_by_resol > 0 else 0 + ts.time = int(round(ts.time / in_beat_tick_resol)) * in_beat_tick_resol + + # --- make new midi object to check processed values --- # + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + for instr_idx in instr_grid.keys(): + new_instrument = Instrument(program=instr_idx) + new_instrument.notes = [y for x in instr_grid[instr_idx].values() for y in x] + new_midi_obj.instruments.append(new_instrument) + new_midi_obj.markers = [y for x in chord_grid.values() for y in x] + new_midi_obj.tempo_changes = [y for x in tempo_grid.values() for y in x] + new_midi_obj.time_signature_changes = midi_obj.time_signature_changes + + # make corpus + song_data = { + 'notes': instr_grid, + 'chords': chord_grid, + 'tempos': tempo_grid, + 'metadata': { + 'first_note': first_note_time, + 'last_note': last_note_time, + 'time_signature': quant_time_signature, + 'ticks_per_beat': midi_obj.ticks_per_beat, + } + } + return song_data, new_midi_obj + + def _make_instr_notes(self, midi_obj): + ''' + This part is important, we can use three different ways to merge instruments + 1st option: compare the number of notes and choose tracks with more notes + 2nd option: merge all instruments with the same tracks + 3rd option: leave all instruments as they are. 
differentiate tracks with different track number + + In this version we choose to use the 2nd option as it helps to reduce the number of tracks and sequence length + ''' + instr_notes = defaultdict(list) + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = FINED_PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name is None: + continue + # new_instr_idx = INSTRUMENT_PROGRAM_MAP[instr_name] + new_instr_idx = instr_idx + if new_instr_idx not in instr_notes: + instr_notes[new_instr_idx] = [] + instr_notes[new_instr_idx].extend(instr.notes) + instr_notes[new_instr_idx].sort(key=lambda x: (x.start, -x.pitch)) + return instr_notes + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _merge_percussion(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge drum track to one track + ''' + drum_0_lst = [] + new_instruments = [] + for instrument in midi_obj.instruments: + if len(instrument.notes) == 0: + continue + if instrument.is_drum: + drum_0_lst.extend(instrument.notes) + else: + new_instruments.append(instrument) + if len(drum_0_lst) > 0: + drum_0_lst.sort(key=lambda x: x.start) + # remove duplicate + drum_0_lst = list(k for k, _ in itertools.groupby(drum_0_lst)) + drum_0_instrument = Instrument(program=114, is_drum=True, name="percussion") + drum_0_instrument.notes = drum_0_lst + new_instruments.append(drum_0_instrument) + midi_obj.instruments = new_instruments + + # referred to mmt "https://github.com/salu133445/mmt" + def _pruning_instrument(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge instrument number with similar intrument category + ex. 0: Acoustic Grand Piano, 1: Bright Acoustic Piano, 2: Electric Grand Piano into 0: Acoustic Grand Piano + ''' + new_instruments = [] + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name != None: + new_instruments.append(instr) + midi_obj.instruments = new_instruments + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _limit_max_track(self, midi_obj:miditoolkit.midi.parser.MidiFile, MAX_TRACK:int=16): + ''' + merge track with least notes to other track with same program + and limit the maximum amount of track to 16 + ''' + if len(midi_obj.instruments) == 1: + if midi_obj.instruments[0].is_drum: + midi_obj.instruments[0].program = 114 + midi_obj.instruments[0].is_drum = False + return midi_obj + good_instruments = midi_obj.instruments + good_instruments.sort( + key=lambda x: (not x.is_drum, -len(x.notes))) # place drum track or the most note track at first + assert good_instruments[0].is_drum == True or len(good_instruments[0].notes) >= len( + good_instruments[1].notes), tuple(len(x.notes) for x in good_instruments[:3]) + # assert good_instruments[0].is_drum == False, (, len(good_instruments[2])) + track_idx_lst = list(range(len(good_instruments))) + if len(good_instruments) > MAX_TRACK: + new_good_instruments = copy.deepcopy(good_instruments[:MAX_TRACK]) + # print(midi_file_path) + for id in track_idx_lst[MAX_TRACK:]: + cur_ins = good_instruments[id] + merged = False + new_good_instruments.sort(key=lambda x: len(x.notes)) + for nid, ins in enumerate(new_good_instruments): + if cur_ins.program == ins.program and cur_ins.is_drum == ins.is_drum: + new_good_instruments[nid].notes.extend(cur_ins.notes) + merged = True + break + if not merged: + pass + good_instruments = new_good_instruments + + assert len(good_instruments) 
<= MAX_TRACK, len(good_instruments) + for idx, good_instrument in enumerate(good_instruments): + if good_instrument.is_drum: + good_instruments[idx].program = 114 + good_instruments[idx].is_drum = False + midi_obj.instruments = good_instruments + + def _pruning_notes_for_chord_extraction(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + extract notes for chord extraction + ''' + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + new_instrument = Instrument(program=0, is_drum=False, name="for_chord") + new_instruments = [] + new_notes = [] + for instrument in midi_obj.instruments: + if instrument.program == 114 or instrument.is_drum: # pass drum track + continue + valid_notes = [note for note in instrument.notes if note.pitch >= 21 and note.pitch <= 108] + new_notes.extend(valid_notes) + new_notes.sort(key=lambda x: x.start) + new_instrument.notes = new_notes + new_instruments.append(new_instrument) + new_midi_obj.instruments = new_instruments + return new_midi_obj + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + corpus_maker = CorpusMaker(args.dataset, args.num_features, args.in_dir, args.out_dir, args.debug) + corpus_maker.make_corpus() + +if __name__ == "__main__": + main() +# python3 step1_midi2corpus.py --dataset SOD --num_features 5 +# python3 step2_corpus2event.py --dataset LakhClean --num_features 5 --encoding nb +# python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb +# python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb \ No newline at end of file diff --git a/data_representation/step2_corpus2event.py b/data_representation/step2_corpus2event.py new file mode 100644 index 0000000..74b2b55 --- /dev/null +++ b/data_representation/step2_corpus2event.py @@ -0,0 +1,147 @@ +import argparse +import time +from pathlib import Path + +import pickle +from tqdm import tqdm +from multiprocessing import Pool + +import encoding_utils + +''' +This script is for converting corpus data to event data. 
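+ For each corpus .pkl it applies Corpus2event_<scheme> from encoding_utils (scheme is one of remi / cp / nb / remi_pos) and writes the resulting event list, under the same file name, to events_<dataset>/<scheme><num_features>/.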
+''' + +class Corpus2Event(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool, + cache: bool, + ): + self.dataset = dataset + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"corpus_{self.dataset}" + self.out_dir = out_dir / f"events_{self.dataset}" / self.encoding_name + self.debug = debug + self.cache = cache + self.encoding_function = getattr(encoding_utils, f'Corpus2event_{encoding_scheme}')(num_features) + self._get_in_beat_resolution() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset] + except KeyError: + print(f"Dataset {self.dataset} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def make_events(self): + ''' + Preprocess corpus data to events data. + The process in each encoding scheme is different. + Please refer to encoding_utils.py for more details. + ''' + print("preprocessing corpus data to events data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + # single-processing + broken_count = 0 + success_count = 0 + corpus_list = sorted(list(self.in_dir.rglob("*.pkl"))) + if corpus_list == []: + print(f"No corpus files found in {self.in_dir}. Please check the directory.") + corpus_list = sorted(list(self.in_dir.glob("*.pkli"))) + # remove the corpus files that are already in the out_dir + # Use set for faster existence checks + existing_files = set(f.name for f in self.out_dir.glob("*.pkl")) + # corpus_list = [corpus for corpus in corpus_list if corpus.name not in existing_files] + for filepath_name, event in tqdm(map(self._load_single_corpus_and_make_event, corpus_list), total=len(corpus_list)): + if event is None: + broken_count += 1 + continue + # if using cache, check if the event file already exists + if self.cache and (self.out_dir / filepath_name).exists(): + # print(f"event file {filepath_name} already exists, skipping") + continue + with open(self.out_dir / filepath_name, 'wb') as f: + pickle.dump(event, f) + success_count += 1 + del event + print(f"taken time for making events is {time.time()-start_time}s, success: {success_count}, broken: {broken_count}") + + def _load_single_corpus_and_make_event(self, file_path): + try: + with open(file_path, 'rb') as f: + corpus = pickle.load(f) + event = self.encoding_function(corpus, self.in_beat_resolution) + except Exception as e: + print(f"error in encoding {file_path}: {e}") + event = None + return file_path.name, event + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb", "remi_pos"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="input data 
directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/events/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + parser.add_argument( + "--cache", + action="store_true", + help="enable cache mode", + ) + return parser + +def main(): + args = get_argument_parser().parse_args() + corpus2event = Corpus2Event(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug, args.cache) + corpus2event.make_events() + +if __name__ == "__main__": + main() diff --git a/data_representation/step3_creating_vocab.py b/data_representation/step3_creating_vocab.py new file mode 100644 index 0000000..fa8cc33 --- /dev/null +++ b/data_representation/step3_creating_vocab.py @@ -0,0 +1,84 @@ +import argparse +from pathlib import Path + +import vocab_utils + +''' +This script is for creating vocab file. +''' + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../vocab/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + args = get_argument_parser().parse_args() + encoding_scheme = args.encoding + num_features = args.num_features + dataset = args.dataset + + out_vocab_path = args.out_dir / f"vocab_{dataset}" + out_vocab_path.mkdir(parents=True, exist_ok=True) + out_vocab_file_path = out_vocab_path / f"vocab_{dataset}_{encoding_scheme}{num_features}.json" + + events_path = Path(args.in_dir / f"events_{dataset}" / f"{encoding_scheme}{num_features}") + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + event_data = sorted(list(events_path.rglob("*.pkl"))) + if event_data == []: + print(f"No event files found in {events_path}. 
Please check the directory.") + event_data = sorted(list(events_path.glob("*.pkli"))) + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=None, + event_data=event_data, + encoding_scheme=encoding_scheme, + num_features=num_features + ) + vocab.save_vocab(out_vocab_file_path) + print(f"Vocab file saved at {out_vocab_file_path}") + +if __name__ == "__main__": + main() diff --git a/data_representation/step4_event2tuneidx.py b/data_representation/step4_event2tuneidx.py new file mode 100644 index 0000000..e429265 --- /dev/null +++ b/data_representation/step4_event2tuneidx.py @@ -0,0 +1,127 @@ +import argparse +import time +from pathlib import Path + +import numpy as np +import pickle +from tqdm import tqdm + +import vocab_utils + +class Event2tuneidx(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool + ): + self.dataset = dataset + self.encoding_scheme = encoding_scheme + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"events_{self.dataset}" / self.encoding_name + self.out_dir = out_dir / f"tuneidx_{self.dataset}" / self.encoding_name + self.debug = debug + + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + in_vocab_file_path = Path(f"../vocab/vocab_{dataset}/vocab_{dataset}_{encoding_scheme}{num_features}.json") + self.vocab = getattr(vocab_utils, selected_vocab_name)(in_vocab_file_path=in_vocab_file_path, event_data=None, + encoding_scheme=encoding_scheme, num_features=num_features) + + def _convert_event_to_tune_in_idx(self, tune_in_event): + tune_in_idx = [] + for event in tune_in_event: + event_in_idx = self.vocab(event) + if event_in_idx != None: + tune_in_idx.append(event_in_idx) + return tune_in_idx + + def _load_single_event_and_make_tune_in_idx(self, file_path): + with open(file_path, 'rb') as f: + tune_in_event = pickle.load(f) + tune_in_idx = self._convert_event_to_tune_in_idx(tune_in_event) + return file_path.name, tune_in_idx + + def make_tune_in_idx(self): + print("preprocessing events data to tune_in_idx data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + event_list = sorted(list(self.in_dir.rglob("*.pkl"))) + if event_list == []: + event_list = sorted(list(self.in_dir.glob("*.pkli"))) + for filepath_name, tune_in_idx in tqdm(map(self._load_single_event_and_make_tune_in_idx, event_list), total=len(event_list)): + # save tune_in_idx as npz file with uint16 dtype for remi because it has more than 256 tokens + if self.encoding_scheme == 'remi': + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + else: + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + if np.max(tune_in_idx) < 256: + tune_in_idx = np.array(tune_in_idx, dtype=np.uint8) + if filepath_name.endswith('.pkli'): + file_name = filepath_name.replace('.pkli', '.npz') + else: + file_name = filepath_name.replace('.pkl', '.npz') + np.savez_compressed(self.out_dir / file_name, tune_in_idx) + del tune_in_idx + print(f"taken time for making tune_in_idx is {time.time()-start_time}") + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", 
"nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/tuneidx/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + + event2tuneidx = Event2tuneidx(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug) + event2tuneidx.make_tune_in_idx() + +if __name__ == "__main__": + main() diff --git a/data_representation/step4_event2tuneidx_addprompt.py b/data_representation/step4_event2tuneidx_addprompt.py new file mode 100644 index 0000000..4418f0f --- /dev/null +++ b/data_representation/step4_event2tuneidx_addprompt.py @@ -0,0 +1,122 @@ +import argparse +import time +from pathlib import Path + +import numpy as np +import pickle +from tqdm import tqdm + +import vocab_utils + +class Event2tuneidx(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool + ): + self.dataset = dataset + self.encoding_scheme = encoding_scheme + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"events_{self.dataset}" / self.encoding_name + self.out_dir = out_dir / f"tuneidx_{self.dataset}" / self.encoding_name + self.debug = debug + + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + in_vocab_file_path = Path(f"../vocab/vocab_{dataset}/vocab_{dataset}_{encoding_scheme}{num_features}.json") + self.vocab = getattr(vocab_utils, selected_vocab_name)(in_vocab_file_path=in_vocab_file_path, event_data=None, + encoding_scheme=encoding_scheme, num_features=num_features) + + def _convert_event_to_tune_in_idx(self, tune_in_event): + tune_in_idx = [] + for event in tune_in_event: + event_in_idx = self.vocab(event) + if event_in_idx != None: + tune_in_idx.append(event_in_idx) + return tune_in_idx + + def _load_single_event_and_make_tune_in_idx(self, file_path): + with open(file_path, 'rb') as f: + tune_in_event = pickle.load(f) + tune_in_idx = self._convert_event_to_tune_in_idx(tune_in_event) + return file_path.name, tune_in_idx + + def make_tune_in_idx(self): + print("preprocessing events data to tune_in_idx data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + event_list = sorted(list(self.in_dir.rglob("*.pkl"))) + for filepath_name, tune_in_idx in tqdm(map(self._load_single_event_and_make_tune_in_idx, event_list), total=len(event_list)): + # save tune_in_idx as npz file with uint16 dtype for remi because it has more than 256 tokens + if self.encoding_scheme == 'remi': + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + else: + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + if np.max(tune_in_idx) < 256: + tune_in_idx = np.array(tune_in_idx, dtype=np.uint8) + file_name = filepath_name.replace('.pkl', '.npz') + np.savez_compressed(self.out_dir / file_name, tune_in_idx) + del tune_in_idx + print(f"taken time for making tune_in_idx is {time.time()-start_time}") + +def get_argument_parser(): + 
parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/tuneidx_withcaption/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + + event2tuneidx = Event2tuneidx(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug) + event2tuneidx.make_tune_in_idx() + +if __name__ == "__main__": + main() diff --git a/data_representation/vocab_utils.py b/data_representation/vocab_utils.py new file mode 100644 index 0000000..999385b --- /dev/null +++ b/data_representation/vocab_utils.py @@ -0,0 +1,395 @@ +import pickle +from pathlib import Path +from typing import Union +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction + +import torch + +import json +from tqdm import tqdm + +def sort_key(s): + fraction_part = s.split('_')[-1] + numerator, denominator = map(int, fraction_part.split('/')) + # Return a tuple with denominator first, then numerator, both in negative for descending order + return (-denominator, -numerator) + +class LangTokenVocab: + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + ''' + Initializes the LangTokenVocab class. + + Args: + in_vocab_file_path (Union[Path, None]): Path to the pre-made vocabulary file (optional). + event_data (list): List of event data to create a vocabulary if no pre-made vocab is provided. + encoding_scheme (str): Encoding scheme to be used (e.g., 'remi', 'cp', 'nb'). + num_features (int): Number of features to be used (e.g., 4, 5, 7, 8). + + Summary: + This class is responsible for handling vocabularies used in language models, especially for REMI encoding. + It supports multiple encoding schemes, creates vocabularies based on event data, handles special tokens (e.g., + start/end of sequence), and manages feature-specific masks. It provides methods for saving, loading, and decoding + vocabularies. It also supports vocabulary augmentation for pitch, instrument, beat, and chord features, ensuring + that these are arranged and ordered appropriately. + + For all encoding schemes, the metric or special tokens are named as 'type', + so that we can easily handle and compare among different encoding schemes. 
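+ For example, with the 'remi' scheme every token is a single string such as 'Bar_None', 'Beat_4', or 'Note_Pitch_60' mapped to one integer id, whereas 'cp' and 'nb' keep a separate idx-to-token dictionary per feature (type, beat, pitch, duration, ...).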
+ ''' + + self.encoding_scheme = encoding_scheme + self.num_features = num_features + self._prepare_in_vocab(in_vocab_file_path, event_data) # Prepares initial vocab based on the input file or event data + self._get_features() # Extracts relevant features based on the num_features + self.idx2event, self.event2idx = self._get_vocab(event_data, unique_vocabs=self.idx2event) # Creates vocab or loads premade vocab + if self.encoding_scheme == 'remi': + self._make_mask() # Generates masks for 'remi' encoding scheme + self._get_sos_eos_token() # Retrieves special tokens (Start of Sequence, End of Sequence) + + # Prepares vocabulary if a pre-made vocab file exists or handles cases with no input file. + def _prepare_in_vocab(self, in_vocab_file_path, event_data): + if in_vocab_file_path is not None: + with open(in_vocab_file_path, 'r') as f: + idx2event_temp = json.load(f) + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + for key in idx2event_temp.keys(): + idx2event_temp[key] = {int(idx):tok for idx, tok in idx2event_temp[key].items()} + elif self.encoding_scheme == 'remi': + idx2event_temp = {int(idx):tok for idx, tok in idx2event_temp.items()} + self.idx2event = idx2event_temp + elif in_vocab_file_path is None and event_data is None: + raise NotImplementedError('either premade vocab or event_data should be given') + else: + self.idx2event = None + + # Extracts features depending on the number of features chosen (4, 5, 7, 8). + def _get_features(self): + feature_args = { + 4: ["type", "beat", "pitch", "duration"], + 5: ["type", "beat", "instrument", "pitch", "duration"], + 7: ["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"], + 8: ["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"]} + self.feature_list = feature_args[self.num_features] + + # Saves the current vocabulary to a specified JSON path. + def save_vocab(self, json_path): + with open(json_path, 'w') as f: + json.dump(self.idx2event, f, indent=2, ensure_ascii=False) + + # Returns the size of the current vocabulary. + def get_vocab_size(self): + return len(self.idx2event) + + # Handles Start of Sequence (SOS) and End of Sequence (EOS) tokens based on the encoding scheme. + def _get_sos_eos_token(self): + if self.encoding_scheme == 'remi': + self.sos_token = [self.event2idx['SOS_None']] + self.eos_token = [[self.event2idx['EOS_None']]] + else: + self.sos_token = [[self.event2idx['type']['SOS']] + [0] * (self.num_features - 1)] + self.eos_token = [[self.event2idx['type']['EOS']] + [0] * (self.num_features - 1)] + + # Generates vocabularies by either loading from a file or creating them based on the event data. 
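+ # When building from event_data, every event is rendered as '<name>_<value>', the
+ # special tokens are pinned to the first slots, and the remaining vocabulary is
+ # re-ordered (pitch range widened by a few semitones, instruments/chords/beats
+ # sorted). An illustrative result, assuming a small REMI vocabulary:
+ #   idx2event = {0: 'SOS_None', 1: 'EOS_None', 2: 'Bar_None', 3: 'Beat_0', ...}
+ #   event2idx = {'SOS_None': 0, 'EOS_None': 1, 'Bar_None': 2, 'Beat_0': 3, ...}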
+ def _get_vocab(self, event_data, unique_vocabs=None): + # make new vocab from given event_data + if event_data is not None: + unique_char_list = list(set([f'{event["name"]}_{event["value"]}' for tune_path in event_data for event in pickle.load(open(tune_path, 'rb'))])) + unique_vocabs = sorted(unique_char_list) + unique_vocabs.remove('SOS_None') + unique_vocabs.remove('EOS_None') + unique_vocabs.remove('Bar_None') + new_unique_vocab = self._augment_pitch_vocab(unique_vocabs) + if self.num_features == 5 or self.num_features == 8: + new_unique_vocab = self._arange_instrument_vocab(new_unique_vocab) + if self.num_features == 7 or self.num_features == 8: + new_unique_vocab = self._arange_chord_vocab(new_unique_vocab) + new_unique_vocab = self._arange_beat_vocab(new_unique_vocab) + new_unique_vocab.insert(0, 'SOS_None') + new_unique_vocab.insert(1, 'EOS_None') + new_unique_vocab.insert(2, 'Bar_None') + idx2event = {int(idx) : tok for idx, tok in enumerate(new_unique_vocab)} + event2idx = {tok : int(idx) for idx, tok in idx2event.items()} + # load premade vocab + else: + idx2event = unique_vocabs + event2idx = {tok : int(idx) for idx, tok in unique_vocabs.items()} + return idx2event, event2idx + + # Augments the pitch vocabulary by expanding the range of pitch values. + def _augment_pitch_vocab(self, unique_vocabs): + pitch_vocab = [x for x in unique_vocabs if 'Note_Pitch_' in x] + pitch_int = [int(x.replace('Note_Pitch_', '')) for x in pitch_vocab if x.replace('Note_Pitch_', '').isdigit()] + min_pitch = min(pitch_int) + max_pitch = max(pitch_int) + min_pitch_margin = max(min_pitch-6, 0) + max_pitch_margin = min(max_pitch+7, 127) + new_pitch_vocab = sorted([f'Note_Pitch_{x}' for x in range(min_pitch_margin, max_pitch_margin+1)], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + new_unique_vocab = [x for x in unique_vocabs if x not in new_pitch_vocab] + new_pitch_vocab + return new_unique_vocab + + # Orders and arranges the instrument vocabulary. + def _arange_instrument_vocab(self, unique_vocabs): + instrument_vocab = [x for x in unique_vocabs if 'Instrument_' in x] + new_instrument_vocab = sorted(instrument_vocab, key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + new_unique_vocab = [x for x in unique_vocabs if x not in new_instrument_vocab] + new_instrument_vocab + return new_unique_vocab + + # Orders and arranges the chord vocabulary, ensuring 'Chord_N_N' is the last token. + def _arange_chord_vocab(self, unique_vocabs): + ''' + for chord augmentation + Chord_N_N should be the last token in the list for an easy implementation of chord augmentation + ''' + chord_vocab = [x for x in unique_vocabs if 'Chord_' in x] + chord_vocab.remove('Chord_N_N') + new_chord_vocab = sorted(chord_vocab, key=lambda x: (not isinstance(x, int), x.split('_')[-1] if isinstance(x, str) else x, x.split('_')[1] if isinstance(x, str) else x)) + new_chord_vocab.append('Chord_N_N') + new_unique_vocab = [x for x in unique_vocabs if x not in new_chord_vocab] + new_chord_vocab + return new_unique_vocab + + # Orders and arranges the beat vocabulary. 
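+ # Plain lexicographic sorting would place 'Beat_10' before 'Beat_2'; the method
+ # below sorts the beat tokens by their integer suffix (Beat_0, Beat_1, Beat_2, ...,
+ # Beat_10, ...) and writes them back into the slots the beat tokens already occupy,
+ # so every other token keeps its position in the vocabulary.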
+ def _arange_beat_vocab(self, unique_vocabs): + beat_vocab = [x for x in unique_vocabs if 'Beat_' in x] + new_beat_vocab = sorted(beat_vocab, key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + count = 0 + for idx, token in enumerate(unique_vocabs): + if 'Beat_' in token: + unique_vocabs[idx] = new_beat_vocab[count] + count += 1 + return unique_vocabs + + # Generates masks for the 'remi' encoding scheme. + def _make_mask(self): + ''' + This function is used to extract the target musical features for validation. + ''' + idx2feature = {} + for idx, feature in self.idx2event.items(): + if feature.startswith('SOS') or feature.startswith('EOS') or feature.startswith('Bar'): + idx2feature[idx] = 'type' + elif feature.startswith('Beat'): + idx2feature[idx] = 'beat' + elif feature.startswith('Chord'): + idx2feature[idx] = 'chord' + elif feature.startswith('Tempo'): + idx2feature[idx] = 'tempo' + elif feature.startswith('Note_Pitch'): + idx2feature[idx] = 'pitch' + elif feature.startswith('Note_Duration'): + idx2feature[idx] = 'duration' + elif feature.startswith('Note_Velocity'): + idx2feature[idx] = 'velocity' + elif feature.startswith('Instrument'): + idx2feature[idx] = 'instrument' + + self.total_mask = {} + self.remi_vocab_boundaries_by_key = {} + for target in self.feature_list: + mask = [0] * len(idx2feature) # Initialize all-zero list of length equal to dictionary + for key, value in idx2feature.items(): + if value == target: + mask[int(key)] = 1 # If value equals target, set corresponding position in mask to 1 + mask = torch.LongTensor(mask) + self.total_mask[target] = mask + start_idx, end_idx = torch.argwhere(mask == 1).flatten().tolist()[0], torch.argwhere(mask == 1).flatten().tolist()[-1] + self.remi_vocab_boundaries_by_key[target] = (start_idx, end_idx+1) + + def decode(self, events:torch.Tensor): + ''' + Used for checking events in the evaluation + events: 1d tensor + ''' + decoded_list = [] + for event in events: + decoded_list.append(self.idx2event[event.item()]) + return decoded_list + + def __call__(self, word): + ''' + for remi style encoding + ''' + return self.event2idx[f"{word['name']}_{word['value']}"] + +class MusicTokenVocabCP(LangTokenVocab): + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + # Initialize the vocabulary class with vocab file path, event data, encoding scheme, and feature count + super().__init__(in_vocab_file_path, event_data, encoding_scheme, num_features) + + def _augment_pitch_vocab(self, unique_vocabs): + # Extract pitch-related vocabularies and adjust pitch range + pitch_total_vocab = unique_vocabs['pitch'] + pitch_vocab = [x for x in pitch_total_vocab if 'Note_Pitch_' in str(x)] + pitch_int = [int(x.replace('Note_Pitch_', '')) for x in pitch_vocab if x.replace('Note_Pitch_', '').isdigit()] + # Determine the min and max pitch values and extend the pitch range slightly + min_pitch = min(pitch_int) + max_pitch = max(pitch_int) + min_pitch_margin = max(min_pitch - 6, 0) + max_pitch_margin = min(max_pitch + 7, 127) + # Create new pitch vocab and ensure new entries do not overlap with existing ones + new_pitch_vocab = [f'Note_Pitch_{x}' for x in range(min_pitch_margin, max_pitch_margin + 1)] + new_pitch_vocab = [x for x in pitch_total_vocab if str(x) not in new_pitch_vocab] + new_pitch_vocab + unique_vocabs['pitch'] = new_pitch_vocab + return unique_vocabs + + def _mp_get_unique_vocab(self, tune, features): + # Read event data 
from a file and collect unique vocabularies for specified features + with open(tune, 'rb') as f: + events_list = pickle.load(f) + unique_vocabs = defaultdict(set) + for event in events_list: + for key in features: + unique_vocabs[key].add(event[key]) + return unique_vocabs + + def _get_chord_vocab(self): + ''' + Manually define the chord vocabulary by combining roots and qualities + from a predefined list. This is used for chord augmentation. + ''' + root_list = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#'] + quality_list = ['+', '/o7', '7', 'M', 'M7', 'm', 'm7', 'o', 'o7', 'sus2', 'sus4'] + chord_vocab = [f'Chord_{root}_{quality}' for root in root_list for quality in quality_list] + # Sort the chord vocabulary based on the root and quality + chord_vocab = sorted(chord_vocab, key=lambda x: (not isinstance(x, int), x.split('_')[-1] if isinstance(x, str) else x, x.split('_')[0] if isinstance(x, str) else x)) + return chord_vocab + + def _cp_sort_type(self, unique_vocabs): + # Similar to _nb_sort_type but used for the 'cp' encoding scheme, sorting vocabularies in a different order + unique_vocabs.remove('SOS') + unique_vocabs.remove('EOS') + unique_vocabs.remove('Metrical') + unique_vocabs.remove('Note') + vocab_list = list(unique_vocabs) + unique_vocabs = sorted(vocab_list, key=sort_key) + unique_vocabs.insert(0, 'SOS') + unique_vocabs.insert(1, 'EOS') + unique_vocabs.insert(2, 'Metrical') + unique_vocabs.insert(3, 'Note') + return unique_vocabs + + # Define custom sorting function + def sort_type_cp(self, item): + if item == 0: + return (0, 0) # Move 0 to the beginning + elif isinstance(item, str): + if item.startswith("Bar"): + return (1, item) # "Bar" items come next, sorted lexicographically + elif item.startswith("Beat"): + # Extract numeric part of "Beat_x" to sort numerically + beat_number = int(item.split('_')[1]) + return (2, beat_number) # "Beat" items come last, sorted by number + return (3, item) # Catch-all for anything unexpected (shouldn't be necessary here) + + def _get_vocab(self, event_data, unique_vocabs=None): + if event_data is not None: + # Create vocab mappings (event2idx, idx2event) from the provided event data + print('start to get unique vocab') + event2idx = {} + idx2event = {} + unique_vocabs = defaultdict(set) + # Use multiprocessing to extract unique vocabularies for each event + with Pool(16) as p: + results = p.starmap(self._mp_get_unique_vocab, tqdm([(tune, self.feature_list) for tune in event_data])) + # Combine results from different processes + for result in results: + for key in self.feature_list: + if key == 'chord': # Chords are handled separately + continue + unique_vocabs[key].update(result[key]) + # Augment pitch vocab and add manually defined chord vocab + unique_vocabs = self._augment_pitch_vocab(unique_vocabs) + unique_vocabs['chord'] = self._get_chord_vocab() + # Process each feature type, handling special cases like 'tempo' and 'chord' + for key in self.feature_list: + if key == 'tempo': + remove_nn_flag = False + if 'Tempo_N_N' in unique_vocabs[key]: + unique_vocabs[key].remove('Tempo_N_N') + remove_nn_flag = True + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + if remove_nn_flag: + unique_vocabs[key].insert(1, 'Tempo_N_N') + elif key == 'chord': + unique_vocabs[key].insert(0, 0) + unique_vocabs[key].insert(1, 'Chord_N_N') + elif key == 'type': # Sort 'type' vocab depending on the encoding scheme + if self.encoding_scheme == 'cp': 
+ unique_vocabs[key] = self._cp_sort_type(unique_vocabs[key]) + else: # NB encoding scheme + unique_vocabs[key] = self._nb_sort_type(unique_vocabs[key]) + elif key == 'beat' and self.encoding_scheme == 'cp': # Handle 'beat' vocab with 'cp' scheme + # unique_vocabs[key].remove('Bar') + # unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), Fraction(x.split('_')[-1] if isinstance(x, str) else x))) + # unique_vocabs[key].insert(1, 'Bar') + unique_vocabs[key] = sorted(unique_vocabs[key], key = self.sort_type_cp) + elif key == 'beat' and self.encoding_scheme == 'nb': # Handle 'beat' vocab with 'nb' scheme + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + elif key == 'instrument': # Sort 'instrument' vocab by integer values + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + else: # Default case: sort by integer values for other keys + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + # Create event2idx and idx2event mappings for each feature + event2idx[key] = {tok: int(idx) for idx, tok in enumerate(unique_vocabs[key])} + idx2event[key] = {int(idx): tok for idx, tok in enumerate(unique_vocabs[key])} + return idx2event, event2idx + else: + # If no event data, simply map unique vocab to indexes + event2idx = {} + for key in self.feature_list: + event2idx[key] = {tok: int(idx) for idx, tok in unique_vocabs[key].items()} + return unique_vocabs, event2idx + + def get_vocab_size(self): + # Return the size of the vocabulary for each feature + return {key: len(self.idx2event[key]) for key in self.feature_list} + + def __call__(self, event): + # Convert an event to its corresponding indices + return [self.event2idx[key][event[key]] for key in self.feature_list] + + def decode(self, events:torch.Tensor): + decoded_list = [] + for event in events: + decoded_list.append([self.idx2event[key][event[idx].item()] for idx, key in enumerate(self.feature_list)]) + return decoded_list + +class MusicTokenVocabNB(MusicTokenVocabCP): + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + super().__init__(in_vocab_file_path, event_data, encoding_scheme, num_features) + + def _nb_sort_type(self, unique_vocabs): + # Remove special tokens and sort the remaining vocab list, then re-insert the special tokens in order + unique_vocabs.remove('SOS') + unique_vocabs.remove('EOS') + unique_vocabs.remove('Empty_Bar') + unique_vocabs.remove('SSS') + unique_vocabs.remove('SSN') + unique_vocabs.remove('SNN') + vocab_list = list(unique_vocabs) + unique_vocabs = sorted(vocab_list, key=sort_key) + unique_vocabs.insert(0, 'SOS') + unique_vocabs.insert(1, 'EOS') + unique_vocabs.insert(2, 'Empty_Bar') + unique_vocabs.insert(3, 'SSS') + unique_vocabs.insert(4, 'SSN') + unique_vocabs.insert(5, 'SNN') + return unique_vocabs \ No newline at end of file diff --git a/demo/Amadeus_app_CN.py b/demo/Amadeus_app_CN.py new file mode 100644 index 0000000..099d8aa --- /dev/null +++ b/demo/Amadeus_app_CN.py @@ -0,0 +1,223 @@ +from email.mime import audio +import torch +from pathlib import Path +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +import gradio as gr 
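+# The next two lines put the repository root on sys.path so that the `Amadeus`
+# and `data_representation` packages resolve when this demo is launched directly
+# from the demo/ folder (e.g. `python demo/Amadeus_app_CN.py`).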
+import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from Amadeus.train_utils import adjust_prediction_order +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +# === 保持原来的工具函数 === +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + vocab_path = next(ckpt_dir.glob('vocab*')) + + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path: str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + vocab_name = {'remi': 'LangTokenVocab', 'cp': 'MusicTokenVocabCP', 'nb': 'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + return AmadeusModel, vocab + + +def load_resources(wandb_exp_dir, device): + wandb_exp_dir = Path(wandb_exp_dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + wandb_exp_dir + ) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + + +import time + +def generate_with_text_prompt(config, vocab, model, device, prompt, text_encoder_model, + sampling_method='top_p', threshold=0.99, + temperature=1.15, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(text_encoder_model) + encoder = T5EncoderModel.from_pretrained(text_encoder_model).to(device) + context = tokenizer(prompt, 
return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, config.data_params.first_pred_feature) + + # === 生成带时间戳的文件名 === + timestamp = time.strftime("%Y%m%d_%H%M%S") + Path("outputs").mkdir(exist_ok=True) + output_file = Path("outputs") / f"generated_{timestamp}.mid" + + decoder(generated_sample, output_path=str(output_file)) + return str(output_file) + +# === Gradio Demo === +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +model_id = "models/Amadeus-S" # 模型路径,可以是 Amadeus-S, Amadeus-M, Amadeus-L +# check if model exists +if not Path(model_id).exists(): + # download from huggingface + import os + from huggingface_hub import snapshot_download + + os.makedirs("models", exist_ok=True) + + local_dir = snapshot_download( + repo_id="longyu1315/Amadeus-S", + repo_type="model", + local_dir="models" + ) + + print("模型已下载到:", local_dir) + +config, model, vocab = load_resources(model_id, device) + +# 示例 prompts +examples = { + "prompt1": "A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.", + "prompt2": "A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.", + "prompt3": " A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.", + "prompt4": "A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. 
The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music.", +} + +def gradio_generate(prompt, threshold, temperature, length): + if "Amadeus-M" in model_id or "Amadeus-L" in model_id: + encoder_choice ="large" + else: + encoder_choice = "base" + text_encoder_model = 'google/flan-t5-base' if encoder_choice == 'base' else 'google/flan-t5-large' + midi_path = generate_with_text_prompt( + config, + vocab, + model, + device, + prompt, + text_encoder_model, + threshold=threshold, + temperature=temperature, + generation_length=length, + ) + # === 根据 MIDI 文件名生成对应的 WAV 文件名 === + audio_path = midi_path.replace('.mid', '.wav').replace('generated', 'music/generated') + return midi_path, audio_path + +with gr.Blocks() as demo: + gr.Markdown("# 🎵 Amadeus MIDI Generation Demo") + gr.Markdown( + "### 🎵 Prompt 输入指南\n" + "请尽量包含以下要素:\n" + "- 曲风(如 pop, electronic, ambient...)\n" + "- 乐器(如 piano, guitar, drums, strings...)\n" + "- 调式(如 C major, F# minor...)\n" + "- 拍号(如 4/4, 3/4...)\n" + "- 速度(如 120 BPM, Andante, Allegro...)\n" + "- 和弦走向(如 C, G, Am, F...)\n" + "- 情绪(如 happy, relaxing, motivational...)" + "推荐从示例中选择初始 Prompt 进行修改。" + ) + with gr.Row(): + prompt = gr.Textbox(label="输入文本描述 (Prompt)", placeholder="A lively rock and electronic fusion, this song radiates happiness and energy. Distorted guitars, a rock organ, and driving drums propel the melody forward in a fast-paced 4/4 time signature. Set in the key of A major, it features a chord progression of E, D, A/G, E, and D, creating a dynamic and engaging sound that would be right at home in a video game soundtrack.") + with gr.Row(): + threshold = gr.Slider(0.5, 1.0, 0.99, step=0.01, label="阈值") + temperature = gr.Slider(0.5, 3.0, 1.25, step=0.05, label="温度") + length = gr.Slider(256, 3072, 1024, step=128, label="生成长度") + generate_btn = gr.Button("生成 MIDI 🎼") + midi_file = gr.File(label="下载生成的 MIDI 文件") + audio_output = gr.Audio(label="生成的音频预览", type="filepath") + generate_btn.click(fn=gradio_generate, + inputs=[prompt, threshold, temperature, length], + outputs=[midi_file, audio_output]) + gr.Markdown("### 示例 Prompt\n" + "prompt1: A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.\n\n" + "prompt2: A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.\n\n" + "prompt3: A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.\n\n" + "prompt4: A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. 
The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music." + ) + + with gr.Row(): + for name, text in examples.items(): + # show text on button click + btn = gr.Button(name) + btn.click(lambda t=text: t, None, prompt) + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860, share=True) \ No newline at end of file diff --git a/demo/Amadeus_app_EN.py b/demo/Amadeus_app_EN.py new file mode 100644 index 0000000..e460289 --- /dev/null +++ b/demo/Amadeus_app_EN.py @@ -0,0 +1,222 @@ +from email.mime import audio +import torch +from pathlib import Path +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +import gradio as gr +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from Amadeus.train_utils import adjust_prediction_order +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +# === Keep original utility functions === +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + vocab_path = next(ckpt_dir.glob('vocab*')) + + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path: str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + vocab_name = {'remi': 'LangTokenVocab', 'cp': 'MusicTokenVocabCP', 'nb': 'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + return AmadeusModel, vocab + + +def load_resources(wandb_exp_dir, device): + wandb_exp_dir = Path(wandb_exp_dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + wandb_exp_dir + ) + 
config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + + +import time + +def generate_with_text_prompt(config, vocab, model, device, prompt, text_encoder_model, + sampling_method='top_p', threshold=0.99, + temperature=1.15, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(text_encoder_model) + encoder = T5EncoderModel.from_pretrained(text_encoder_model).to(device) + context = tokenizer(prompt, return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, config.data_params.first_pred_feature) + + # === Generate filename with timestamp === + timestamp = time.strftime("%Y%m%d_%H%M%S") + Path("outputs").mkdir(exist_ok=True) + output_file = Path("outputs") / f"generated_{timestamp}.mid" + + decoder(generated_sample, output_path=str(output_file)) + return str(output_file) + +# === Gradio Demo === +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +model_id = "models/Amadeus-S" # 模型路径,可以是 Amadeus-S, Amadeus-M, Amadeus-L +# check if model exists +if not Path(model_id).exists(): + # download from huggingface + import os + from huggingface_hub import snapshot_download + + os.makedirs("models", exist_ok=True) + + local_dir = snapshot_download( + repo_id="longyu1315/Amadeus-S", + repo_type="model", + local_dir="models" + ) + + print("模型已下载到:", local_dir) +config, model, vocab = load_resources(model_id, device) + +# Example prompts +examples = { + "prompt1": "A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.", + "prompt2": "A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.", + "prompt3": " A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. 
Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.", + "prompt4": "A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music.", +} + +def gradio_generate(prompt, threshold, temperature, length): + if "Amadeus-M" in model_id or "Amadeus-L" in model_id: + encoder_choice ="large" + else: + encoder_choice = "base" + text_encoder_model = 'google/flan-t5-base' if encoder_choice == 'base' else 'google/flan-t5-large' + midi_path = generate_with_text_prompt( + config, + vocab, + model, + device, + prompt, + text_encoder_model, + threshold=threshold, + temperature=temperature, + generation_length=length, + ) + # === Generate corresponding WAV filename === + audio_path = midi_path.replace('.mid', '.wav').replace('generated', 'music/generated') + return midi_path, audio_path + +with gr.Blocks() as demo: + gr.Markdown("# 🎵 Amadeus MIDI Generation Demo") + gr.Markdown( + "### 🎵 Prompt Input Guide\n" + "Please try to include the following elements:\n" + "- Genre (e.g. pop, electronic, ambient...)\n" + "- Instruments (e.g. piano, guitar, drums, strings...)\n" + "- Key (e.g. C major, F# minor...)\n" + "- Time signature (e.g. 4/4, 3/4...)\n" + "- Tempo (e.g. 120 BPM, Andante, Allegro...)\n" + "- Chord progression (e.g. C, G, Am, F...)\n" + "- Mood (e.g. happy, relaxing, motivational...)\n" + "We recommend starting from an example prompt and then modifying it." + ) + with gr.Row(): + prompt = gr.Textbox(label="Text Description (Prompt)", placeholder="A lively rock and electronic fusion, this song radiates happiness and energy. Distorted guitars, a rock organ, and driving drums propel the melody forward in a fast-paced 4/4 time signature. Set in the key of A major, it features a chord progression of E, D, A/G, E, and D, creating a dynamic and engaging sound that would be right at home in a video game soundtrack.") + with gr.Row(): + threshold = gr.Slider(0.5, 1.0, 0.99, step=0.01, label="Threshold") + temperature = gr.Slider(0.5, 3.0, 1.25, step=0.05, label="Temperature") + length = gr.Slider(256, 3072, 1024, step=128, label="Generation Length") + generate_btn = gr.Button("Generate MIDI 🎼") + midi_file = gr.File(label="Download Generated MIDI File") + audio_output = gr.Audio(label="Generated Audio Preview", type="filepath") + generate_btn.click(fn=gradio_generate, + inputs=[prompt, threshold, temperature, length], + outputs=[midi_file, audio_output]) + gr.Markdown("### Example Prompts\n" + "prompt1: A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.\n\n" + "prompt2: A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. 
With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.\n\n" + "prompt3: A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.\n\n" + "prompt4: A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music." + ) + + with gr.Row(): + for name, text in examples.items(): + # show text on button click + btn = gr.Button(name) + btn.click(lambda t=text: t, None, prompt) + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/demo/app.py b/demo/app.py new file mode 100644 index 0000000..e69de29 diff --git a/demo/requirements.txt b/demo/requirements.txt new file mode 100644 index 0000000..b2efddd --- /dev/null +++ b/demo/requirements.txt @@ -0,0 +1,9 @@ +transformers +torch +gradio +omegaconf +x_transformers +matplotlib +music21 +muspy +SentencePiece \ No newline at end of file diff --git a/demo/text2midi_app.py b/demo/text2midi_app.py new file mode 100644 index 0000000..e69de29 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6c2cd12 --- /dev/null +++ b/environment.yml @@ -0,0 +1,228 @@ +name: Amadeus +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - ca-certificates=2025.1.31 + - ld_impl_linux-64=2.43 + - libffi=3.4.6 + - libgcc=14.2.0 + - libgcc-ng=14.2.0 + - libgomp=14.2.0 + - liblzma=5.6.4 + - libnsl=2.0.1 + - libsqlite=3.49.1 + - libuuid=2.38.1 + - libxcrypt=4.4.36 + - libzlib=1.3.1 + - ncurses=6.5 + - openssl=3.4.1 + - pip=25.0.1 + - python=3.10.16 + - readline=8.2 + - setuptools=75.8.0 + - tk=8.6.13 + - pip: + - accelerate==1.5.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.10 + - aiosignal==1.3.1 + - annotated-types==0.7.0 + - anthropic==0.59.0 + - antlr4-python3-runtime==4.9.3 + - anyio==4.9.0 + - async-timeout==5.0.1 + - attrs==24.2.0 + - audioread==3.0.1 + - beartype==0.19.0 + - bidict==0.23.1 + # - blis==1.0.1 + - braceexpand==0.1.7 + - catalogue==2.0.10 + - certifi==2024.8.30 + - cffi==1.17.1 + - chardet==5.2.0 + - charset-normalizer==3.4.0 + - chorder==0.1.4 + - click==8.1.7 + - cloudpathlib==0.20.0 + - coloredlogs==15.0.1 + - colt5-attention==0.11.1 + - confection==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - cymem==2.0.10 + - datasets==3.1.0 + - decorator==5.2.1 + - deepspeed==0.16.7 + - dill==0.3.8 + - distro==1.9.0 + - docker-pycreds==0.4.0 + - einops==0.8.0 + - einx==0.3.0 + - encodec==0.1.1 + - evaluate==0.4.3 + - exceptiongroup==1.3.0 + - filelock==3.16.1 + - fire==0.7.0 + - fonttools==4.56.0 + - frozendict==2.4.6 + - frozenlist==1.5.0 + - fsspec==2024.9.0 + - ftfy==6.3.1 + - gitdb==4.0.11 + - gitpython==3.1.43 + - h11==0.16.0 + - h5py==3.13.0 + - hf-xet==1.1.4 + - hjson==3.1.0 + - httpcore==1.0.9 + - httpx==0.28.1 + - huggingface-hub==0.33.0 + - humanfriendly==10.0 + - hydra-core==1.3.2 + - hypy-utils==1.0.29 + - idna==3.10 + - 
iniconfig==2.1.0 + - jinja2==3.1.4 + - jiter==0.10.0 + - joblib==1.4.2 + - jsonlines==4.0.0 + - jsonpickle==4.0.5 + - kiwisolver==1.4.8 + # - laion-clap==1.1.7 + - langcodes==3.5.0 + - langdetect==1.0.9 + - language-data==1.3.0 + - lazy-loader==0.4 + - librosa==0.10.2.post1 + - llvmlite==0.41.1 + - local-attention==1.9.15 + - loguru==0.7.3 + - marisa-trie==1.2.1 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.10.1 + - mdurl==0.1.2 + - megabyte-pytorch==0.3.6 + - midi2audio==0.1.1 + - miditok==3.0.3 + - miditoolkit==1.0.1 + - mido==1.3.3 + - more-itertools==10.7.0 + - mpmath==1.3.0 + # - msclap==1.3.3 + - msgpack==1.1.0 + - multidict==6.1.0 + - multiprocess==0.70.16 + - murmurhash==1.0.11 + - music21==9.5.0 + - muspy==0.5.0 + - networkx==3.4.2 + - ninja==1.11.1.3 + - nnaudio==0.3.3 + # - numba==0.58.1 + # - numpy==1.26.4 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-ml-py==12.570.86 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - omegaconf==2.3.0 + - packaging==24.2 + - pandas==2.2.3 + - peft==0.14.0 + - pillow==11.1.0 + - platformdirs==4.3.6 + - pluggy==1.6.0 + - pooch==1.8.2 + - preshed==3.0.9 + - pretty-midi==0.2.10 + - progressbar==2.5 + - propcache==0.2.1 + - protobuf==5.29.1 + - psutil==6.1.0 + - py-cpuinfo==9.0.0 + - pyarrow==18.1.0 + - pycparser==2.22 + - pydantic==2.10.3 + - pydantic-core==2.27.1 + - pydub==0.25.1 + - pygments==2.18.0 + - pyparsing==3.2.1 + - pypianoroll==1.0.4 + - pysmartdl==1.3.4 + - pytest==8.4.0 + - python-dateutil==2.9.0.post0 + - pytz==2024.2 + - pyyaml==6.0.2 + - regex==2024.11.6 + - requests==2.32.3 + - resampy==0.4.3 + - rich==13.9.4 + - safetensors==0.4.5 + - scikit-learn==1.6.1 + - scipy==1.15.2 + - seaborn==0.13.2 + - sentencepiece==0.2.0 + - sentry-sdk==2.19.2 + - setproctitle==1.3.4 + - sf2utils==1.0.0 + - shellingham==1.5.4 + - six==1.17.0 + - smart-open==7.0.5 + - smmap==5.0.1 + - sniffio==1.3.1 + - soundfile==0.12.1 + - soxr==0.5.0.post1 + - spacy==3.8.2 + - spacy-legacy==3.0.12 + - spacy-loggers==1.0.5 + - srsly==2.4.8 + - st-moe-pytorch==0.1.8 + - sudachidict-core==20250129 + - sudachipy==0.6.10 + - sympy==1.14.0 + - symusic==0.5.5 + - termcolor==2.5.0 + # - thinc==8.3.2 + - threadpoolctl==3.5.0 + - tokenizers==0.21.0 + - tomli==2.2.1 + - torch==2.7.0 + - torchaudio==2.7.0 + - torchlibrosa==0.1.0 + # - torchvision==0.16.2 + - tqdm==4.67.1 + - transformers==4.52.4 + - triton==3.3.0 + - typer==0.15.1 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - urllib3==2.2.3 + - wandb==0.19.0 + - wasabi==1.1.3 + - wcwidth==0.2.13 + - weasel==0.4.1 + - webcolors==24.11.1 + - webdataset==0.2.111 + - wget==3.2 + - wheel==0.41.3 + - wrapt==1.17.0 + - x-transformers==2.3.1 + - xxhash==3.5.0 + - yarl==1.18.3 +prefix: ~/.conda/envs/Amadeus diff --git a/generate-batch.py b/generate-batch.py new file mode 100644 index 0000000..f43f1c0 --- /dev/null +++ b/generate-batch.py @@ -0,0 +1,336 @@ +import sys +import os +from pathlib import Path +from multiprocessing import Process,set_start_method +import torch +import argparse +from omegaconf import OmegaConf +import json + +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, + 
prepare_model_and_dataset_from_config, + get_best_ckpt_path_and_config, + Evaluator +) + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-generation_type", + type=str, + choices=('conditioned', 'unconditioned', 'text-conditioned'), + default='unconditioned', + help="generation type", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-num_samples", + type=int, + default=30, + help="number of samples to generate", + ) + parser.add_argument( + "-num_target_measure", + type=int, + default=4, + help="number of target measures for conditioned generation", + ) + parser.add_argument( + "-choose_selected_tunes", + action='store_true', + help="generate samples from selected tunes, only for SOD dataset", + ) + parser.add_argument( + "-generate_length", + type=int, + default=1024, + help="length of the generated sequence", + ) + parser.add_argument( + "-num_processes", + type=int, + default=4, + help="number of processes to use", + ) + parser.add_argument( + "-gpu_ids", + type=str, + default="1,2,3,5", + help="comma-separated list of GPU IDs to use (e.g., '0,1,2,3')", + ) + parser.add_argument( + "-prompt", + type=str, + default="With a rhythm of 100 BPM, this classical piece in 1/4 time signature in the key of Eb major creates a classical mood using String Ensemble, Pizzicato Strings, Tremolo Strings, Trumpet, Timpani.", + help="prompt for generation, only used for conditioned generation", + ) + parser.add_argument( + "-prompt_file", + type=str, + default="dataset/midicaps/train.json", + help="file containing prompts for text-conditioned generation", + ) + return parser + +def load_resources(wandb_exp_dir, device): + """Load model and dataset resources for a process""" + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, wandb_exp_dir) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + # Load checkpoint to specified device + ckpt = torch.load(ckpt_path, map_location=device) + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + # Prepare dataset for prompts + condition_list = [x[1] for x in test_set.data_list] + dataset_for_prompt = [] + for i in range(len(condition_list)): + condition = test_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + dataset_for_prompt.append((condition, condition_list[i])) + + return config, model, dataset_for_prompt, vocab + +def conditioned_worker(process_idx, gpu_id, args, data_slice): + """Worker process for conditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + 
f"cond_{args.num_target_measure}m_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Process assigned data slice + for idx, (tune_in_idx, tune_name) in enumerate(data_slice): + batch_dir = base_path / f"process_{process_idx}_batch_{idx}" + batch_dir.mkdir(parents=True, exist_ok=True) + evaluator.generate_samples_with_prompt( + batch_dir, + args.num_target_measure, + tune_in_idx, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length + ) + +def unconditioned_worker(process_idx, gpu_id, args, num_samples): + """Worker process for unconditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"uncond_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Generate assigned number of samples + batch_dir = base_path + evaluator.generate_samples_unconditioned( + batch_dir, + num_samples, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}" + ) +def text_conditioned_worker(process_idx, gpu_id, args, num_samples, data_slice): + """Worker process for unconditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"text_condi_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Generate assigned number of samples + batch_dir = base_path + for idx, tune_name in enumerate(data_slice): + print(f"Process {process_idx} generating samples for tune: {tune_name}") + evaluator.generate_samples_with_text_prompt( + batch_dir, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}" + ) +def main(): + # use spawn method for multiprocessing + set_start_method('spawn', force=True) + args = get_argument_parser().parse_args() + gpu_ids = list(map(int, args.gpu_ids.split(','))) + + # Validate GPU availability + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if len(gpu_ids) == 0: + raise ValueError("At least one GPU must be specified") + + # Validate process count + if args.num_processes < 1: + raise ValueError("Number of processes must be at least 1") + if len(gpu_ids) < args.num_processes: + print(f"Warning: More processes ({args.num_processes}) than GPUs ({len(gpu_ids)}), some GPUs will be shared") + + # Prepare data slices for processes + processes = [] + try: + if args.generation_type == 'conditioned': + # Prepare selected tunes + wandb_dir = Path('wandb') / args.wandb_exp_dir 
+ if not wandb_dir.exists(): + raise FileNotFoundError(f"Experiment {args.wandb_exp_dir} not found") + + # Load test set to get selected tunes (dummy load to get dataset info) + dummy_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + _, test_set, _ = prepare_model_and_dataset_from_config( + wandb_dir / "files" / "config.yaml", + wandb_dir / "files" / "metadata.json", + wandb_dir / "files" / "vocab.json" + ) + + if args.choose_selected_tunes and test_set.dataset == 'SOD': + selected_tunes = ['Requiem_orch', 'magnificat_bwv-243_8_orch', + "Clarinet Concert in A Major: 2nd Movement, Adagio_orch"] + else: + selected_tunes = [name for _, name in test_set.data_list][:args.num_samples] + + # Split selected data across processes + selected_data = [d for d in test_set.data_list if d[1] in selected_tunes] + chunk_size = (len(selected_data) + args.num_processes - 1) // args.num_processes + + for i in range(args.num_processes): + start_idx = i * chunk_size + end_idx = min((i+1)*chunk_size, len(selected_data)) + data_slice = selected_data[start_idx:end_idx] + + if not data_slice: + continue + + gpu_id = gpu_ids[i % len(gpu_ids)] + p = Process( + target=conditioned_worker, + args=(i, gpu_id, args, data_slice) + ) + processes.append(p) + p.start() + + elif args.generation_type == 'unconditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + p = Process( + target=unconditioned_worker, + args=(i, gpu_id, args, samples) + ) + processes.append(p) + p.start() + elif args.generation_type == 'text-conditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + # Load prompts from file + prompt_name_list = [] + with open(args.prompt_file, 'r') as f: + for line in f: + if not line.strip(): + continue + prompt_data = json.loads(line.strip()) + prompt_text = prompt_data['caption'] + if prompt_data['test_set'] is True: + prompt_name_list.append(prompt_text) + print("length of prompt_name_list:", len(prompt_name_list)) + if len(prompt_name_list) >= args.num_samples: + print(f"Reached the limit of {args.num_samples} prompts.") + break + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + # Split prompt names across processes + start_idx = i * (len(prompt_name_list) // args.num_processes) + end_idx = (i + 1) * (len(prompt_name_list) // args.num_processes) + data_slice = prompt_name_list[start_idx:end_idx] + + p = Process( + target=text_conditioned_worker, + args=(i, gpu_id, args, samples, data_slice) + ) + processes.append(p) + p.start() + # Wait for all processes to complete + for p in processes: + p.join() + + except Exception as e: + print(f"Error in main process: {str(e)}") + for p in processes: + p.terminate() + raise + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/generate.py b/generate.py new file mode 100644 index 0000000..4922099 --- /dev/null +++ b/generate.py @@ -0,0 +1,210 @@ +import torch +from pathlib import Path +import argparse +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +from Amadeus.train_utils import adjust_prediction_order + +from 
Amadeus.evaluation_utils import ( + get_dir_from_wandb_by_code, + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils, data_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-prompt", + required=True, + type=str, + help="text prompt for generation", + ) + parser.add_argument( + "-output_dir", + type=str, + default="outputs", + help="directory to save results", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-generate_length", + type=int, + default=2048, + help="length of the generated sequence", + ) + parser.add_argument( + "-text_encoder_model", + type=str, + default='google/flan-t5-large', + help="pretrained text encoder model", + ) + return parser + +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + # print all files in ckpt_dir + vocab_path = next(ckpt_dir.glob('vocab*')) + + # if there is pt file ending with 'last', return it + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path:str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + # print(config) + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + # Create the Transformer model based on configuration parameters + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + 
depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + return AmadeusModel, [], vocab + +def load_resources(dir, device): + """Load model and dataset resources""" + dir = Path(dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + dir + ) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, _, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + +def generate_with_text_prompt(config, vocab, model, device, prompt, save_dir, + first_pred_feature, sampling_method, threshold, + temperature, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(config.text_encoder_model) + encoder = T5EncoderModel.from_pretrained(config.text_encoder_model).to(device) + print(f"Using T5EncoderModel for text prompt:\n{prompt}") + context = tokenizer(prompt, return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + + save_dir.mkdir(parents=True, exist_ok=True) + + output_file = save_dir / f"generated.mid" + decoder(generated_sample, output_path=str(output_file)) + print(f"Generated file saved at: {output_file}") + + +def main(): + args = get_argument_parser().parse_args() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + config, model, vocab = load_resources(args.wandb_exp_dir, device) + + save_dir = Path(args.output_dir) + config.text_encoder_model = args.text_encoder_model + generate_with_text_prompt( + config, + vocab, + model, + device, + args.prompt, + save_dir, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..356930b --- /dev/null +++ b/readme.md @@ -0,0 +1,163 @@ +# 🎵 Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music + +

+ HuggingFace | arXiv +

+ +**Amadeus** is a novel **symbolic music (MIDI) generation framework**. We use **autoregressive modeling** for note sequences, **discrete diffusion models** for intra-note attributes, and **representation optimization** to enhance model performance. Compared to current mainstream autoregressive or hierarchical autoregressive models, Amadeus achieves significant improvements in **generation quality, speed, and controllability**. While significantly improving generation quality, we have achieved a speedup of at least **4x** compared to pure autoregressive models. We also support a training-free **fine-grained attribute control** mechanism, which endows Amadeus with maximum flexibility. We will continuously update the **code, models, and datasets**. + + + +*** + +## 🏗️ Model Architecture +

+ Amadeus architecture +

+ + + +*** + +## 📅 Changelog + + + +* 2025-08-28: Released inference code and the **Amadeus-S** model + + + +*** + +## ⚙️ Installation and Usage + +Set up the environment (inference only): + + + +```bash +conda create -n amadeus_slim python=3.10 + +conda activate amadeus_slim + +pip install -r demo/requirements.txt +``` + +First run: + + + +```bash +# Chinese interface + +python demo/Amadeus_app_CN.py + +# English interface + +python demo/Amadeus_app_EN.py +``` + +> Note: +> +> `Amadeus_app_CN.py` +> +> is for the Chinese interface, and +> +> `Amadeus_app_EN.py` +> +> is for the English interface. + +👉 The model will be automatically downloaded to the `models/` folder, which includes a usable **soundfont**. Please modify the path of `DEFAULT_SOUND_FONT` in `Amadeus/symbolic_encoding/``midi2audio.py`. + +Example of command-line generation: + + + +``` +python generate.py -wandb\_exp\_dir models/Amadeus-S -text\_encoder\_model google/flan-t5-base -temperature 2 -prompt "A lively and melodic pop rock song featuring piano, overdriven guitar, electric drum and electric bass, set in a fast 4/4 tempo and the key of C# minor, with a frequently recurring chord progression of D, A, C#m, and F# that evokes a mix of emotion and love." +``` + + + +*** + +## 📂 Repository Structure + + + +``` +Amadeus/ + +├── demo/ # Example scripts and interfaces (CN/EN) + +├── Amadeus/ # Core model and symbolic encoding + +├── assets/ # Architecture diagrams and sample audio files + +├── data\_representation # Data processing + +├── models/ # Downloaded or cached pre-trained models + +└── generate.py # Command-line generation entry point +``` + + + +*** + +## 📊 Evaluation Results + +We evaluated **generation speed, text alignment, and note attribute control accuracy** on the **MidiCaps** dataset. The results are as follows: + + + +| Model | Speed (notes/s) | CLAP ↑ | TBT ↑ | CK ↑ | CTS ↑ | CI ↑ | CMtop3 ↑ | +| -------------- | --------------- | -------- | --------- | --------- | --------- | --------- | --------- | +| Text2Midi | 4.02 | 0.19 | 31.76 | 22.22 | 84.15 | 19.92 | 60.57 | +| MuseCoco | 1.67 | 0.19 | 34.21 | 14.66 | 94.24 | 22.42 | 38.18 | +| T2M-inferalign | 4.02 | 0.20 | 39.32 | 29.80 | 84.32 | 20.13 | 47.74 | +| **Amadeus** | **16.23** | 0.20 | 73.93 | 39.31 | 96.98 | 26.01 | 65.52 | +| **Amadeus-M** | 10.51 | **0.21** | **76.31** | **43.07** | **97.02** | **27.11** | **66.39** | + + + +*** + +## 🤝 Acknowledgements and Contributions + +The development of Amadeus is inspired by the music and AI communities, with the goal of **serving music creators, not replacing them**. + +We welcome developers and researchers to contribute code or provide suggestions — please reach out to us via **Issues** or **Pull Requests**. + +Part of the design of this project references [JudeJiwoo/nmt](https://github.com/JudeJiwoo/nmt), and we would like to express our gratitude here 🙏. + +*** + +## ⚠️ Notes + +The current model is relatively small and may not always generate MIDI that fully matches the description. +You can try **slightly adjusting parameters such as temperature or top-p** to improve the results. + +We will continue to improve the model to provide more stable and higher-quality generation. 
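+
+If you would rather script this parameter search than re-run the command line, the helpers in `generate.py` can be imported directly. The snippet below is only a minimal sketch: it assumes the **Amadeus-S** checkpoint already sits in `models/Amadeus-S`, that it is run from the repository root, and the prompt and temperature values are purely illustrative.
+
+```python
+from pathlib import Path
+
+import torch
+
+from generate import load_resources, generate_with_text_prompt
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+config, model, vocab = load_resources('models/Amadeus-S', device)
+config.text_encoder_model = 'google/flan-t5-base'  # encoder paired with Amadeus-S in the example above
+
+prompt = ("A soothing pop song featuring piano, flute, violin, and acoustic guitar, "
+          "set in C major with a 4/4 time signature at an Andante tempo.")
+
+# Sweep a few temperatures; each run writes outputs/temp_<T>/generated.mid
+for temperature in (1.0, 1.15, 1.5, 2.0):
+    generate_with_text_prompt(
+        config, vocab, model, device, prompt,
+        save_dir=Path(f'outputs/temp_{temperature}'),
+        first_pred_feature=config.data_params.first_pred_feature,
+        sampling_method='top_p',
+        threshold=0.99,          # top-p mass; lower it for more conservative sampling
+        temperature=temperature,
+        generation_length=1024,
+    )
+```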
+ +*** + +## 📚 Citation + +If you find Amadeus helpful for your research or creative work, please cite our paper: + + + +```bibtex +@article{su2025amadeus, + title = {Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music}, + author = {Su, Hongju and Li, Ke and Yang, Lan and Zhang, Honggang and Song, Yi-Zhe}, + journal = {arXiv preprint arXiv:2508.20665}, + year = {2025} +} +``` + diff --git a/readme_CN.md b/readme_CN.md new file mode 100644 index 0000000..b47fd54 --- /dev/null +++ b/readme_CN.md @@ -0,0 +1,108 @@ +# 🎵 Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music +

+HuggingFace · arXiv
+

+ +**Amadeus** 是一种新型的 **符号音乐 (MIDI) 生成框架**,我们使用 **自回归** 建模音符序列,**离散扩散模型** 建模音符内部属性,并通过 **表征优化** 提升模型性能。相较于当前主流的自回归或分层自回归模型,Amadeus 在 **生成质量、速度与可控性** 上均取得了显著进步。在生成质量显著提升的同时,我们实现了至少 **4x** 于纯自自回归模型的速度提升。我们同时还支持一种免训练的 **细粒度属性控制** ,这赋予了Amadeus最大程度的灵活性。我们会持续更新 **代码,模型和数据集** 。 + + +--- +## 🏗️ 模型架构 +

+ Amadeus architecture +

+ +--- + +## 📅 更新日志 +- 2025-08-28:公布推理代码和 **Amadeus-S** 模型 + +--- + +## ⚙️ 安装与使用 +搭建环境(仅推理): +```bash +conda create -n amadeus_slim python=3.10 +conda activate amadeus_slim +pip install -r demo/requirements.txt +``` + +首次运行: +```bash +# 中文界面 +python demo/Amadeus_app_CN.py + +# 英文界面 +python demo/Amadeus_app_EN.py +``` +> 说明:`Amadeus_app_CN.py` 用于中文界面,`Amadeus_app_EN.py` 用于英文界面。 +👉 模型会自动下载到 `models/` 文件夹,包含一个可用的 **soundfont**。请修改 `Amadeus/symbolic_encoding/midi2audio.py` 中的 `DEFAULT_SOUND_FONT` 路径。 + +命令行生成示例: +```bash +python generate.py -wandb_exp_dir models/Amadeus-S -text_encoder_model google/flan-t5-base -temperature 2 -prompt "A lively and melodic pop rock song featuring piano, overdriven guitar, electric drum and electric bass, set in a fast 4/4 tempo and the key of C# minor, with a frequently recurring chord progression of D, A, C#m, and F# that evokes a mix of emotion and love." +``` + +--- + +## 📂 仓库结构 +``` +Amadeus/ +├── demo/ # 示例脚本与界面 (CN/EN) +├── Amadeus/ # 核心模型与符号编码 +├── assets/ # 架构图与示例音频文件 +├── data_representation # 数据处理 +├── models/ # 下载或缓存的预训练模型 +└── generate.py # 命令行生成入口 +``` + +--- + +## 📊 评测结果 +我们在 **MidiCaps** 数据集上评测了 **生成速度、文本对齐度以及音符属性控制精度**。结果如下: + +| Model | Speed (notes/s) | CLAP ↑ | TBT ↑ | CK ↑ | CTS ↑ | CI ↑ | CMtop3 ↑ | +|--------------|-----------------|--------|-------|------|-------|------|---------------------| +| Text2Midi | 4.02 | 0.19 | 31.76 | 22.22 | 84.15 | 19.92 | 60.57 | +| MuseCoco | 1.67 | 0.19 | 34.21 | 14.66 | 94.24 | 22.42 | 38.18 | +| T2M-inferalign | 4.02 | 0.20 | 39.32 | 29.80 | 84.32 | 20.13 | 47.74 | +| **Amadeus** | **16.23** | 0.20 | 73.93 | 39.31 | 96.98 | 26.01 | 65.52 | +| **Amadeus-M**| 10.51 | **0.21** | **76.31** | **43.07** | **97.02** | **27.11** | **66.39** | + + + + + +--- +## 🤝 致谢与贡献 +Amadeus 的研发受到音乐与 AI 社区的启发,旨在 **服务音乐创作者,而非替代他们**。 +我们欢迎开发者和研究人员贡献代码或提出建议 —— 请通过 **Issues** 或 **Pull Requests** 与我们交流。 + +本项目部分设计参考了 [JudeJiwoo/nmt](https://github.com/JudeJiwoo/nmt),在此表示感谢 🙏。 + + +## ⚠️ 注意事项 + +当前模型规模较小,并不总是能生成完全符合描述的 MIDI。 +您可以尝试 **适当调整温度(temperature)、阈值(top-p 等参数)** 来改善结果。 + +我们会持续改进模型,以提供更稳定和高质量的生成体验。--- + +--- + +## 📚 引用 +如果您觉得 Amadeus 对您的研究或创作有帮助,请引用我们的论文: + +```bibtex +@article{su2025amadeus, + title = {Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music}, + author = {Su, Hongju and Li, Ke and Yang, Lan and Zhang, Honggang and Song, Yi-Zhe}, + journal = {arXiv preprint arXiv:2508.20665}, + year = {2025} +} + diff --git a/run_evaluation.py b/run_evaluation.py new file mode 100644 index 0000000..b7cac2d --- /dev/null +++ b/run_evaluation.py @@ -0,0 +1,46 @@ +import sys +import torch +from pathlib import Path + +from omegaconf import OmegaConf + +from Amadeus.evaluation_utils import Evaluator, wandb_style_config_to_omega_config, prepare_model_and_dataset_from_config, get_best_ckpt_path_and_config + +def main(exp_code): + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, exp_code) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + print(ckpt_path) + + ckpt = torch.load(ckpt_path, map_location='cpu') + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path=metadata_path, vocab_path=vocab_path) + model.load_state_dict(ckpt['model']) + model = model.eval() + + evaluator = Evaluator(config, model, test_set, vocab, device='cuda', batch_size=21) + + evaluator.get_perplexity() + evaluator.save_results(wandb_dir / exp_code / 
f'micro_evaluated_perplexity_conti_fixed.pt') + mean_by_class = {} + + for key in evaluator.vocab.feature_list: + # skip type for calculating mean as type or metric token have different meanings across encoding schemes + if key == 'type': + continue + mean_nll = sum(evaluator.loss_by_class[key]) / evaluator.count_by_class[key] + mean_by_class[key] = mean_nll + + # calculate micro average + total_mean_nll = 0 + for key in mean_by_class.keys(): + total_mean_nll += mean_by_class[key] * evaluator.count_by_class[key] + denominator = 0 + for key in mean_by_class.keys(): + denominator += evaluator.count_by_class[key] + total_mean_nll /= denominator + return total_mean_nll + +if __name__ == '__main__': + exp_code = sys.argv[1] + print(main(exp_code)) \ No newline at end of file diff --git a/train_accelerate.py b/train_accelerate.py new file mode 100644 index 0000000..0531d61 --- /dev/null +++ b/train_accelerate.py @@ -0,0 +1,376 @@ +from calendar import c +import os +import copy +from pathlib import Path +from datetime import datetime + +import torch +import torch.multiprocessing as mp +from torch.distributed import init_process_group, destroy_process_group + +from accelerate import Accelerator +from accelerate.utils import set_seed + +import wandb +import hydra +from hydra.core.hydra_config import HydraConfig +from omegaconf import DictConfig, OmegaConf + +# import accelerate +from accelerate import Accelerator +from accelerate.utils import set_seed + +from Amadeus.symbolic_encoding import data_utils, decoding_utils +from Amadeus.symbolic_encoding.data_utils import get_emb_total_size +from Amadeus import model_zoo, trainer_accelerate as trainer +from Amadeus.train_utils import NLLLoss4REMI, NLLLoss4CompoundToken, CosineAnnealingWarmUpRestarts, EncodecFlattenLoss, EncodecMultiClassLoss, CosineLRScheduler, adjust_prediction_order, DiffusionLoss4CompoundToken +from Amadeus.encodec.data_utils import EncodecDataset +from data_representation import vocab_utils +from run_evaluation import main as run_evaluation + +def ddp_setup(rank, world_size, backend='nccl'): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + init_process_group(backend, rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + +def generate_experiment_name(config): + # add base hyperparameters to the experiment name + dataset_name = config.dataset + encoding_name = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + input_embedder_name = config.nn_params.input_embedder_name + sub_decoder_name = config.nn_params.sub_decoder_name + batch_size = config.train_params.batch_size + num_layers = config.nn_params.main_decoder.num_layer + input_length = config.train_params.input_length + first_pred_feature = config.data_params.first_pred_feature + + # Add target hyperparameters to the experiment name + # dropout + main_dropout = config.nn_params.model_dropout + # learning rate + lr_decay_rate = config.train_params.decay_step_rate + + time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # Combine the information into a single string for the experiment name + # experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{input_embedder_name}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}\ + # _dropout{main_dropout}_lrdecay{lr_decay_rate}" + experiment_name = 
f"{time}_{dataset_name}_{encoding_name}{num_features}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}" + return experiment_name + +def setup_log(config): + if config.general.make_log and config.use_ddp == False: + experiment_name = generate_experiment_name(config) + wandb.init( + project="Amadeus", + name=experiment_name, + config=OmegaConf.to_container(config) + ) + # 保存配置到 WANDB 根目录 + config_path = Path(wandb.run.dir) / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + save_dir = Path(wandb.run.dir) / "checkpoints" + save_dir.mkdir(exist_ok=True, parents=True) + else: + now = datetime.now() + save_dir = Path('wandb/debug/checkpoints') / now.strftime('%y-%m-%d_%H-%M-%S') + save_dir.mkdir(exist_ok=True, parents=True) + # 保存配置到调试目录 + config_path = save_dir / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + return str(save_dir) + +# Prepare symbolic dataset and model for training +def preapre_sybmolic(config: DictConfig, save_dir: str, rank: int) -> trainer.LanguageModelTrainer: + # Extract neural network parameters, dataset name, encoding scheme, and number of features from the configuration + nn_params = config.nn_params + dataset_name = config.dataset + encoding_scheme = nn_params.encoding_scheme + num_features = nn_params.num_features + + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + # Prepare paths for input and output vocabulary files + vocab_dir = Path(f'vocab/vocab_{dataset_name}') + in_vocab_file_path = vocab_dir / f'vocab_{dataset_name}_{encoding_scheme}{num_features}.json' + out_vocab_path = Path(save_dir) / f'vocab_{dataset_name}_{encoding_scheme}{num_features}.json' + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=in_vocab_file_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + if out_vocab_path is not None: + vocab.save_vocab(out_vocab_path) + + # Initialize symbolic dataset based on dataset name and configuration parameters + symbolic_dataset = getattr(data_utils, dataset_name)( + vocab=vocab, + encoding_scheme=encoding_scheme, + num_features=num_features, + debug=config.general.debug, + aug_type=config.data_params.aug_type, + input_length=config.train_params.input_length, + first_pred_feature=config.data_params.first_pred_feature, + caption_path=config.captions_path, + ) + + # Split dataset into training, validation, and test sets + split_ratio = config.data_params.split_ratio + trainset, validset, testset = symbolic_dataset.split_train_valid_test_set( + dataset_name=config.dataset, ratio=split_ratio, seed=42, save_dir=save_dir) + + # Create the Transformer model based on configuration parameters + nested_music_transformer = getattr(model_zoo, nn_params.model_name)( + vocab=symbolic_dataset.vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + 
sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + # Log the total number of parameters requires grad in the model + total_params = sum(p.numel() for p in nested_music_transformer.parameters() if p.requires_grad) + print(f"Total number of parameters is: {total_params}") + + # # Optionally log the total parameters in Wandb + # if config.general.make_log: + # wandb.log({'model_total_params': total_params}, step=0) + + # Select loss function based on encoding scheme + # You can use focal loss by setting focal_alpha and focal_gamma in the config file + focal_alpha = config.train_params.focal_alpha + focal_gamma = config.train_params.focal_gamma + if encoding_scheme == 'remi': + loss_fn = NLLLoss4REMI(focal_alpha=focal_alpha, focal_gamma=focal_gamma) + elif encoding_scheme in ['cp', 'nb']: + if config.use_diff is False: + loss_fn = NLLLoss4CompoundToken(feature_list=symbolic_dataset.vocab.feature_list, focal_alpha=focal_alpha, focal_gamma=focal_gamma) + else: + loss_fn = DiffusionLoss4CompoundToken(feature_list=symbolic_dataset.vocab.feature_list, focal_alpha=focal_alpha, focal_gamma=focal_gamma) + + # Set optimizer and learning rate scheduler based on the configuration + optimizer = torch.optim.AdamW(nested_music_transformer.parameters(), lr=config.train_params.initial_lr, betas=(0.9, 0.95), eps=1e-08, weight_decay=0.01) + scheduler_dict = {'not-using': None, 'cosineannealingwarmuprestarts': CosineAnnealingWarmUpRestarts, 'cosinelr': CosineLRScheduler} + if scheduler_dict[config.train_params.scheduler] == CosineAnnealingWarmUpRestarts: + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, T_0=config.train_params.num_steps_per_cycle, T_mult=2, eta_min=0, eta_max=config.train_params.max_lr, T_up=config.train_params.warmup_steps, gamma=config.train_params.gamma) + elif scheduler_dict[config.train_params.scheduler] == CosineLRScheduler: + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, total_steps=config.train_params.num_iter * config.train_params.decay_step_rate, warmup_steps=config.train_params.warmup_steps, lr_min_ratio=0.1, cycle_length=1.0) + else: + scheduler = None + + # Define beat resolution and MIDI decoder based on the dataset and encoding scheme + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + in_beat_resolution = in_beat_resolution_dict[dataset_name] + except KeyError: + in_beat_resolution = 4 + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + midi_decoder = getattr(decoding_utils, midi_decoder_dict[encoding_scheme])(vocab=symbolic_dataset.vocab, in_beat_resolution=in_beat_resolution, dataset_name=dataset_name) + + # Select trainer class based on encoding scheme + trainer_option_dict = {'remi': 'LanguageModelTrainer4REMI', 'cp': 'LanguageModelTrainer4CompoundToken', 'nb':'LanguageModelTrainer4CompoundToken'} + trainer_option = trainer_option_dict[encoding_scheme] + sampling_method = None + sampling_threshold = 0.99 + sampling_temperature = 1.0 + + # Initialize and return the training module + training_module = getattr(trainer, trainer_option)( + model=nested_music_transformer, + optimizer=optimizer, + scheduler=scheduler, + 
loss_fn=loss_fn, + midi_decoder=midi_decoder, + train_set=trainset, + valid_set=validset, + save_dir=save_dir, + vocab=symbolic_dataset.vocab, + use_ddp=config.use_ddp, + use_fp16=config.use_fp16, + world_size=config.train_params.world_size, + batch_size=config.train_params.batch_size, + infer_target_len=symbolic_dataset.mean_len_tunes, + gpu_id=rank, + sampling_method=sampling_method, + sampling_threshold=sampling_threshold, + sampling_temperature=sampling_temperature, + config=config + ) + + return training_module + +# Prepare Encodec dataset and model for training +def prepare_encodec(config, save_dir, rank): + # Setup logging and determine where logs will be saved + save_dir = setup_log(config) + + # Extract neural network (NN) parameters and encoding scheme from config + nn_params = config.nn_params + encoding_scheme = config.data_params.encoding_scheme + + # no change in prediction order for encodec + prediction_order = ['k1', 'k2', 'k3', 'k4'] + + # Create directory for storing vocabulary files, if it doesn't already exist + vocab_dir = Path(f'vocab/vocab_MaestroEncodec') + Path(vocab_dir).mkdir(exist_ok=True, parents=True) + + # Define paths for input and output vocabulary files + in_vocab_file_path = vocab_dir / f'maestro-v3.0.0-in_vocab.json' + out_vocab_path = Path(save_dir) / f'maestro-v3.0.0-in_vocab.json' + + # Define path for tokenized dataset using the Encodec scheme + token_path = Path(f"dataset/encodec_dataset/maestro-v3.0.0-encodec_{config.data_type}") + + # Initialize the EncodecDataset object with necessary file paths and parameters + encodec_dataset = EncodecDataset( + in_vocab_file_path=in_vocab_file_path, + out_vocab_path=out_vocab_path, + encoding_scheme=encoding_scheme, + input_length=config.train_params.input_length, + token_path=token_path + ) + + # Split the dataset into training, validation, and test sets + trainset, validset, testset = encodec_dataset.split_train_valid_test_set() + + # Load the model from the model zoo based on the configuration and neural network parameters + nested_music_transformer = getattr(model_zoo, nn_params.model_name)( + vocab=encodec_dataset.vocab, # Vocab used by the dataset + input_length=config.train_params.input_length, # Length of input sequences + prediction_order=prediction_order, # Order in which predictions are made + input_embedder_name=nn_params.input_embedder_name, # Name of the embedding layer + main_decoder_name=nn_params.main_decoder_name, # Main decoder name + sub_decoder_name=nn_params.sub_decoder_name, # Sub-decoder name if applicable + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, # Sub-decoder depth if defined + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, # Use feature enricher in sub-decoder if defined + dim=nn_params.main_decoder.dim_model, # Model dimension + heads=nn_params.main_decoder.num_head, # Number of attention heads + depth=nn_params.main_decoder.num_layer, # Number of layers in the main decoder + dropout=nn_params.main_decoder.dropout, # Dropout rate + ) + + # Calculate and print the total number of model parameters + total_params = sum(p.numel() for p in nested_music_transformer.parameters()) + print(f"Total number of parameters is: {total_params}") + + # If logging is enabled, log the total parameter count to Weights and Biases + if config.general.make_log: + wandb.log({'model_total_params': total_params}) + + # 
Select the appropriate loss function based on the encoding scheme + # In discrete audio token, remi encoding means flatten encoding and nb encoding means compound encoding + # nb_delay encoding is the tokenization manipulation technique proposed in MusicGen "https://arxiv.org/abs/2306.05284" + if encoding_scheme == 'remi': + loss_fn = EncodecFlattenLoss(feature_list=encodec_dataset.vocab.feature_list) # Loss for REMI encoding scheme + elif encoding_scheme == 'nb' or encoding_scheme == 'nb_delay': + loss_fn = EncodecMultiClassLoss(feature_list=encodec_dataset.vocab.feature_list) # Loss for NB or NB-Delay encoding scheme + + # Initialize the AdamW optimizer with the model's parameters and specified hyperparameters + optimizer = torch.optim.AdamW(nested_music_transformer.parameters(), lr=config.train_params.initial_lr, betas=(0.9, 0.95), eps=1e-08, weight_decay=0.01) + + # Define scheduler options and initialize based on config + scheduler_dict = {'not-using': None, 'cosineannealingwarmuprestarts': CosineAnnealingWarmUpRestarts, 'cosinelr': CosineLRScheduler} + if scheduler_dict[config.train_params.scheduler] == CosineAnnealingWarmUpRestarts: + # Cosine Annealing with warm restarts + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, T_0=config.train_params.num_steps_per_cycle, T_mult=2, eta_min=0, eta_max=config.train_params.max_lr, T_up=config.train_params.warmup_steps, gamma=config.train_params.gamma) + elif scheduler_dict[config.train_params.scheduler] == CosineLRScheduler: + # Cosine LR Scheduler + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, total_steps=config.train_params.num_iter * config.train_params.decay_step_rate, warmup_steps=config.train_params.warmup_steps, lr_min_ratio=0.1, cycle_length=1.0) + else: + scheduler = None # No scheduler if 'not-using' is selected + + # Define trainer options based on the encoding scheme + trainer_option_dict = {'remi': 'EncodecFlattenTrainer', 'nb':'EncodecMultiClassTrainer', 'nb_delay':'EncodecMultiClassTrainer'} + trainer_option = trainer_option_dict[encoding_scheme] + + # Define the target inference length for different encoding schemes + infer_target_len_dict = {'remi': 6000, 'nb': 1500, 'nb_delay': 1500} + infer_target_len = infer_target_len_dict[encoding_scheme] + + # sampling method and parameters + sampling_method = None + sampling_threshold = 1.0 + sampling_temperature = 1.0 + + # Initialize the appropriate trainer class with the model, optimizer, datasets, and other training parameters + training_module = getattr(trainer, trainer_option)( + model=nested_music_transformer, + optimizer=optimizer, + scheduler=scheduler, + loss_fn=loss_fn, + midi_decoder=None, + train_set=trainset, + valid_set=validset, + save_dir=save_dir, + vocab=encodec_dataset.vocab, + use_ddp=config.use_ddp, + use_fp16=config.use_fp16, + world_size=config.train_params.world_size, + batch_size=config.train_params.batch_size, + infer_target_len=infer_target_len, + gpu_id=rank, + sampling_method=sampling_method, + sampling_threshold=sampling_threshold, + sampling_temperature=sampling_temperature, + config=config + ) + + # Return the initialized training module to be used for training + return training_module + +def run_train_exp(rank, config, world_size:int=1): + # if config.use_ddp: ddp_setup(rank, world_size) + # config = copy.deepcopy(config) + # config.train_params.world_size = world_size + # if rank != 0: + # config.general.make_log = False + # config.general.infer_and_log = False + + save_dir = setup_log(config) + 
print(f"save_dir: {save_dir}") + if 'encodec' in config.dataset.lower(): + training_module = prepare_encodec(config, save_dir, rank) + else: + training_module = preapre_sybmolic(config, save_dir, rank) + training_module.accelerate_train_by_num_iter(int(config.train_params.num_iter)) + + if not 'encodec' in config.dataset.lower(): + try: + exp_code = [x for x in save_dir.split('/') if 'run-' in x][0] + mean_nll = run_evaluation(exp_code) + wandb.log({'evaluated_mean_nll': mean_nll}) + except Exception as e: + exp_code = "latest-run" + + +@hydra.main(version_base=None, config_path="./Amadeus/symbolic_yamls/", config_name="config-accelerate") +def main(config: DictConfig): + if config.use_ddp: + world_size = torch.cuda.device_count() + run_train_exp(0, config, world_size) + else: + run_train_exp(0, config) # single gpu + +if __name__ == "__main__": + main() +# CUDA_VISIBLE_DEVICES=2,3,4,5 accelerate launch --num_processes 4 --num_machines 1 train_accelerate.py \ No newline at end of file diff --git a/vocab/.DS_Store b/vocab/.DS_Store new file mode 100644 index 0000000..83dbc6a Binary files /dev/null and b/vocab/.DS_Store differ diff --git a/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json b/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + 
"27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + 
"55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": 
"Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + 
"velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json b/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + 
"73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": 
"Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + 
"103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json b/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + 
"29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + 
"57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + 
"pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": 
"Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json b/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json new file mode 100644 index 0000000..9534353 --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json @@ -0,0 +1,278 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": 
"Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + } +} \ No newline at end of file diff 
--git a/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json b/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json new file mode 100644 index 0000000..68b07cc --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json @@ -0,0 +1,488 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + 
"83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_54", + "18": "Tempo_56", + "19": "Tempo_58", + "20": "Tempo_60", + "21": "Tempo_62", + "22": "Tempo_64", + "23": "Tempo_67", + "24": "Tempo_70", + "25": "Tempo_73", + "26": "Tempo_76", + "27": "Tempo_79", + "28": "Tempo_82", + "29": "Tempo_85", + "30": "Tempo_88", + "31": "Tempo_92", + "32": "Tempo_96", + "33": "Tempo_100", + "34": "Tempo_104", + "35": "Tempo_108", + "36": "Tempo_112", + "37": "Tempo_116", + "38": "Tempo_121", + "39": "Tempo_126", + "40": "Tempo_131", + "41": "Tempo_136", + "42": "Tempo_141", + "43": "Tempo_147", + "44": "Tempo_153", + "45": "Tempo_159", + "46": "Tempo_165", + "47": "Tempo_172", + "48": "Tempo_179", + "49": "Tempo_186", + "50": "Tempo_193", + "51": "Tempo_201", + "52": "Tempo_209", + "53": "Tempo_217", + "54": "Tempo_226", + "55": "Tempo_235", + "56": "Tempo_244", + "57": "Tempo_254", + "58": "Tempo_264", + "59": "Tempo_275", + "60": "Tempo_286", + "61": "Tempo_297", + "62": "Tempo_309", + "63": "Tempo_347", + "64": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": 
"Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + 
"115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json b/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json new file mode 100644 index 0000000..2c51382 --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json @@ -0,0 +1,462 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/2", + "4": "Bar_time_signature_1/4", + "5": "Bar_time_signature_1/8", + "6": "Bar_time_signature_11/8", + "7": "Bar_time_signature_12/8", + "8": "Bar_time_signature_2/2", + "9": "Bar_time_signature_2/4", + "10": "Bar_time_signature_2/8", + "11": "Bar_time_signature_3/2", + "12": "Bar_time_signature_3/4", + "13": "Bar_time_signature_3/8", + "14": "Bar_time_signature_4/2", + "15": "Bar_time_signature_4/4", + "16": "Bar_time_signature_4/8", + "17": "Bar_time_signature_5/4", + "18": "Bar_time_signature_5/8", + "19": "Bar_time_signature_6/4", + "20": "Bar_time_signature_6/8", + "21": "Bar_time_signature_7/4", + "22": "Bar_time_signature_7/8", + "23": "Bar_time_signature_8/4", + "24": "Bar_time_signature_8/8", + "25": "Bar_time_signature_9/8", + "26": "Beat_0", + "27": "Beat_1", + "28": "Beat_2", + "29": "Beat_3", + "30": "Beat_4", + "31": "Beat_5", + "32": "Beat_6", + "33": "Beat_7", + "34": "Beat_8", + "35": "Beat_9", + "36": "Beat_10", + "37": "Beat_11", + "38": "Beat_12", + "39": "Beat_13", + "40": "Beat_14", + "41": "Beat_15", + "42": "Beat_16", + "43": "Beat_17", + "44": "Beat_18", + "45": "Beat_19", + "46": "Beat_20", + "47": "Beat_21", + "48": "Beat_22", + "49": "Beat_23", + "50": "Beat_24", + "51": "Beat_25", + "52": "Beat_26", + "53": "Beat_27", + "54": "Beat_28", + "55": "Beat_29", + "56": "Beat_30", + "57": "Beat_31", + "58": "Note_Duration_1", + "59": "Note_Duration_10", + "60": "Note_Duration_12", + "61": "Note_Duration_16", + "62": "Note_Duration_2", + "63": "Note_Duration_20", + "64": "Note_Duration_24", + "65": "Note_Duration_28", + "66": "Note_Duration_3", + "67": "Note_Duration_32", + "68": "Note_Duration_4", + "69": "Note_Duration_5", + "70": "Note_Duration_6", + "71": "Note_Duration_8", + "72": "Note_Velocity_100", + "73": "Note_Velocity_120", + "74": "Note_Velocity_40", + "75": "Note_Velocity_60", + "76": "Note_Velocity_80", + "77": "Tempo_100", + "78": "Tempo_104", + "79": "Tempo_108", + "80": "Tempo_112", + "81": "Tempo_116", + "82": "Tempo_121", + "83": "Tempo_126", + "84": "Tempo_131", + "85": "Tempo_136", + "86": "Tempo_141", + "87": "Tempo_147", + "88": "Tempo_153", + "89": "Tempo_159", + "90": "Tempo_165", + "91": "Tempo_172", + "92": "Tempo_179", + "93": "Tempo_186", + "94": "Tempo_193", + "95": "Tempo_201", + "96": "Tempo_209", + "97": "Tempo_217", + "98": "Tempo_226", + "99": "Tempo_235", + "100": "Tempo_244", + "101": "Tempo_254", + "102": "Tempo_264", + 
"103": "Tempo_275", + "104": "Tempo_286", + "105": "Tempo_297", + "106": "Tempo_30", + "107": "Tempo_309", + "108": "Tempo_31", + "109": "Tempo_32", + "110": "Tempo_33", + "111": "Tempo_34", + "112": "Tempo_347", + "113": "Tempo_35", + "114": "Tempo_36", + "115": "Tempo_37", + "116": "Tempo_38", + "117": "Tempo_390", + "118": "Tempo_40", + "119": "Tempo_42", + "120": "Tempo_44", + "121": "Tempo_46", + "122": "Tempo_48", + "123": "Tempo_50", + "124": "Tempo_52", + "125": "Tempo_54", + "126": "Tempo_56", + "127": "Tempo_58", + "128": "Tempo_60", + "129": "Tempo_62", + "130": "Tempo_64", + "131": "Tempo_67", + "132": "Tempo_70", + "133": "Tempo_73", + "134": "Tempo_76", + "135": "Tempo_79", + "136": "Tempo_82", + "137": "Tempo_85", + "138": "Tempo_88", + "139": "Tempo_92", + "140": "Tempo_96", + "141": "Note_Pitch_6", + "142": "Note_Pitch_7", + "143": "Note_Pitch_8", + "144": "Note_Pitch_9", + "145": "Note_Pitch_10", + "146": "Note_Pitch_11", + "147": "Note_Pitch_12", + "148": "Note_Pitch_13", + "149": "Note_Pitch_14", + "150": "Note_Pitch_15", + "151": "Note_Pitch_16", + "152": "Note_Pitch_17", + "153": "Note_Pitch_18", + "154": "Note_Pitch_19", + "155": "Note_Pitch_20", + "156": "Note_Pitch_21", + "157": "Note_Pitch_22", + "158": "Note_Pitch_23", + "159": "Note_Pitch_24", + "160": "Note_Pitch_25", + "161": "Note_Pitch_26", + "162": "Note_Pitch_27", + "163": "Note_Pitch_28", + "164": "Note_Pitch_29", + "165": "Note_Pitch_30", + "166": "Note_Pitch_31", + "167": "Note_Pitch_32", + "168": "Note_Pitch_33", + "169": "Note_Pitch_34", + "170": "Note_Pitch_35", + "171": "Note_Pitch_36", + "172": "Note_Pitch_37", + "173": "Note_Pitch_38", + "174": "Note_Pitch_39", + "175": "Note_Pitch_40", + "176": "Note_Pitch_41", + "177": "Note_Pitch_42", + "178": "Note_Pitch_43", + "179": "Note_Pitch_44", + "180": "Note_Pitch_45", + "181": "Note_Pitch_46", + "182": "Note_Pitch_47", + "183": "Note_Pitch_48", + "184": "Note_Pitch_49", + "185": "Note_Pitch_50", + "186": "Note_Pitch_51", + "187": "Note_Pitch_52", + "188": "Note_Pitch_53", + "189": "Note_Pitch_54", + "190": "Note_Pitch_55", + "191": "Note_Pitch_56", + "192": "Note_Pitch_57", + "193": "Note_Pitch_58", + "194": "Note_Pitch_59", + "195": "Note_Pitch_60", + "196": "Note_Pitch_61", + "197": "Note_Pitch_62", + "198": "Note_Pitch_63", + "199": "Note_Pitch_64", + "200": "Note_Pitch_65", + "201": "Note_Pitch_66", + "202": "Note_Pitch_67", + "203": "Note_Pitch_68", + "204": "Note_Pitch_69", + "205": "Note_Pitch_70", + "206": "Note_Pitch_71", + "207": "Note_Pitch_72", + "208": "Note_Pitch_73", + "209": "Note_Pitch_74", + "210": "Note_Pitch_75", + "211": "Note_Pitch_76", + "212": "Note_Pitch_77", + "213": "Note_Pitch_78", + "214": "Note_Pitch_79", + "215": "Note_Pitch_80", + "216": "Note_Pitch_81", + "217": "Note_Pitch_82", + "218": "Note_Pitch_83", + "219": "Note_Pitch_84", + "220": "Note_Pitch_85", + "221": "Note_Pitch_86", + "222": "Note_Pitch_87", + "223": "Note_Pitch_88", + "224": "Note_Pitch_89", + "225": "Note_Pitch_90", + "226": "Note_Pitch_91", + "227": "Note_Pitch_92", + "228": "Note_Pitch_93", + "229": "Note_Pitch_94", + "230": "Note_Pitch_95", + "231": "Note_Pitch_96", + "232": "Note_Pitch_97", + "233": "Note_Pitch_98", + "234": "Note_Pitch_99", + "235": "Note_Pitch_100", + "236": "Note_Pitch_101", + "237": "Note_Pitch_102", + "238": "Note_Pitch_103", + "239": "Note_Pitch_104", + "240": "Note_Pitch_105", + "241": "Note_Pitch_106", + "242": "Note_Pitch_107", + "243": "Note_Pitch_108", + "244": "Note_Pitch_109", + "245": "Note_Pitch_110", + "246": 
"Note_Pitch_111", + "247": "Note_Pitch_112", + "248": "Note_Pitch_113", + "249": "Note_Pitch_114", + "250": "Note_Pitch_115", + "251": "Note_Pitch_116", + "252": "Note_Pitch_117", + "253": "Note_Pitch_118", + "254": "Note_Pitch_119", + "255": "Note_Pitch_120", + "256": "Note_Pitch_121", + "257": "Note_Pitch_122", + "258": "Note_Pitch_123", + "259": "Note_Pitch_124", + "260": "Note_Pitch_125", + "261": "Note_Pitch_126", + "262": "Instrument_0", + "263": "Instrument_4", + "264": "Instrument_6", + "265": "Instrument_7", + "266": "Instrument_8", + "267": "Instrument_9", + "268": "Instrument_10", + "269": "Instrument_11", + "270": "Instrument_12", + "271": "Instrument_13", + "272": "Instrument_14", + "273": "Instrument_15", + "274": "Instrument_16", + "275": "Instrument_19", + "276": "Instrument_21", + "277": "Instrument_22", + "278": "Instrument_23", + "279": "Instrument_24", + "280": "Instrument_25", + "281": "Instrument_26", + "282": "Instrument_32", + "283": "Instrument_33", + "284": "Instrument_36", + "285": "Instrument_38", + "286": "Instrument_40", + "287": "Instrument_41", + "288": "Instrument_42", + "289": "Instrument_43", + "290": "Instrument_46", + "291": "Instrument_47", + "292": "Instrument_49", + "293": "Instrument_50", + "294": "Instrument_52", + "295": "Instrument_55", + "296": "Instrument_56", + "297": "Instrument_57", + "298": "Instrument_58", + "299": "Instrument_60", + "300": "Instrument_61", + "301": "Instrument_62", + "302": "Instrument_64", + "303": "Instrument_65", + "304": "Instrument_66", + "305": "Instrument_67", + "306": "Instrument_68", + "307": "Instrument_69", + "308": "Instrument_70", + "309": "Instrument_71", + "310": "Instrument_72", + "311": "Instrument_73", + "312": "Instrument_74", + "313": "Instrument_75", + "314": "Instrument_79", + "315": "Instrument_80", + "316": "Instrument_88", + "317": "Instrument_104", + "318": "Instrument_105", + "319": "Instrument_106", + "320": "Instrument_107", + "321": "Instrument_108", + "322": "Instrument_109", + "323": "Instrument_111", + "324": "Instrument_114", + "325": "Instrument_117", + "326": "Instrument_118", + "327": "Chord_A_+", + "328": "Chord_A#_+", + "329": "Chord_B_+", + "330": "Chord_C_+", + "331": "Chord_C#_+", + "332": "Chord_D_+", + "333": "Chord_D#_+", + "334": "Chord_E_+", + "335": "Chord_F_+", + "336": "Chord_F#_+", + "337": "Chord_G_+", + "338": "Chord_G#_+", + "339": "Chord_A_/o7", + "340": "Chord_A#_/o7", + "341": "Chord_B_/o7", + "342": "Chord_C_/o7", + "343": "Chord_C#_/o7", + "344": "Chord_D_/o7", + "345": "Chord_D#_/o7", + "346": "Chord_E_/o7", + "347": "Chord_F_/o7", + "348": "Chord_F#_/o7", + "349": "Chord_G_/o7", + "350": "Chord_G#_/o7", + "351": "Chord_A_7", + "352": "Chord_A#_7", + "353": "Chord_B_7", + "354": "Chord_C_7", + "355": "Chord_C#_7", + "356": "Chord_D_7", + "357": "Chord_D#_7", + "358": "Chord_E_7", + "359": "Chord_F_7", + "360": "Chord_F#_7", + "361": "Chord_G_7", + "362": "Chord_G#_7", + "363": "Chord_A_M", + "364": "Chord_A#_M", + "365": "Chord_B_M", + "366": "Chord_C_M", + "367": "Chord_C#_M", + "368": "Chord_D_M", + "369": "Chord_D#_M", + "370": "Chord_E_M", + "371": "Chord_F_M", + "372": "Chord_F#_M", + "373": "Chord_G_M", + "374": "Chord_G#_M", + "375": "Chord_A_M7", + "376": "Chord_A#_M7", + "377": "Chord_B_M7", + "378": "Chord_C_M7", + "379": "Chord_C#_M7", + "380": "Chord_D_M7", + "381": "Chord_D#_M7", + "382": "Chord_E_M7", + "383": "Chord_F_M7", + "384": "Chord_F#_M7", + "385": "Chord_G_M7", + "386": "Chord_G#_M7", + "387": "Chord_A_m", + "388": "Chord_A#_m", + "389": 
"Chord_B_m", + "390": "Chord_C_m", + "391": "Chord_C#_m", + "392": "Chord_D_m", + "393": "Chord_D#_m", + "394": "Chord_E_m", + "395": "Chord_F_m", + "396": "Chord_F#_m", + "397": "Chord_G_m", + "398": "Chord_G#_m", + "399": "Chord_A_m7", + "400": "Chord_A#_m7", + "401": "Chord_B_m7", + "402": "Chord_C_m7", + "403": "Chord_C#_m7", + "404": "Chord_D_m7", + "405": "Chord_D#_m7", + "406": "Chord_E_m7", + "407": "Chord_F_m7", + "408": "Chord_F#_m7", + "409": "Chord_G_m7", + "410": "Chord_G#_m7", + "411": "Chord_A_o", + "412": "Chord_A#_o", + "413": "Chord_B_o", + "414": "Chord_C_o", + "415": "Chord_C#_o", + "416": "Chord_D_o", + "417": "Chord_D#_o", + "418": "Chord_E_o", + "419": "Chord_F_o", + "420": "Chord_F#_o", + "421": "Chord_G_o", + "422": "Chord_G#_o", + "423": "Chord_A_o7", + "424": "Chord_A#_o7", + "425": "Chord_B_o7", + "426": "Chord_C_o7", + "427": "Chord_C#_o7", + "428": "Chord_D_o7", + "429": "Chord_D#_o7", + "430": "Chord_E_o7", + "431": "Chord_F_o7", + "432": "Chord_F#_o7", + "433": "Chord_G_o7", + "434": "Chord_G#_o7", + "435": "Chord_A_sus2", + "436": "Chord_A#_sus2", + "437": "Chord_B_sus2", + "438": "Chord_C_sus2", + "439": "Chord_C#_sus2", + "440": "Chord_D_sus2", + "441": "Chord_D#_sus2", + "442": "Chord_E_sus2", + "443": "Chord_F_sus2", + "444": "Chord_F#_sus2", + "445": "Chord_G_sus2", + "446": "Chord_G#_sus2", + "447": "Chord_A_sus4", + "448": "Chord_A#_sus4", + "449": "Chord_B_sus4", + "450": "Chord_C_sus4", + "451": "Chord_C#_sus4", + "452": "Chord_D_sus4", + "453": "Chord_D#_sus4", + "454": "Chord_E_sus4", + "455": "Chord_F_sus4", + "456": "Chord_F#_sus4", + "457": "Chord_G_sus4", + "458": "Chord_G#_sus4", + "459": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json b/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json new file mode 100644 index 0000000..6b94992 --- /dev/null +++ b/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json @@ -0,0 +1,354 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": 
"Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_50", + "15": "Tempo_52", + "16": "Tempo_54", + "17": "Tempo_56", + "18": "Tempo_58", + "19": "Tempo_60", + "20": "Tempo_62", + "21": "Tempo_64", + "22": "Tempo_67", + "23": "Tempo_70", + "24": "Tempo_73", + "25": "Tempo_76", + "26": "Tempo_79", + "27": "Tempo_82", + "28": "Tempo_85", + "29": "Tempo_88", + "30": "Tempo_92", + "31": "Tempo_96", + "32": "Tempo_100", + "33": "Tempo_104", + "34": "Tempo_108", + "35": "Tempo_112", + "36": "Tempo_116", + "37": "Tempo_121", + "38": "Tempo_126", + "39": "Tempo_131", + "40": "Tempo_136", + "41": "Tempo_141", + "42": "Tempo_147", + "43": "Tempo_153", + "44": "Tempo_159", + "45": "Tempo_165", + "46": "Tempo_172", + "47": "Tempo_179", + "48": "Tempo_186", + "49": "Tempo_193", + "50": "Tempo_201", + "51": "Tempo_209", + "52": "Tempo_217", + "53": "Tempo_226", + "54": "Tempo_235" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_16", + "2": "Note_Pitch_17", + "3": "Note_Pitch_18", + "4": "Note_Pitch_19", + "5": "Note_Pitch_20", + "6": "Note_Pitch_21", + "7": "Note_Pitch_22", + "8": "Note_Pitch_23", + "9": "Note_Pitch_24", + "10": "Note_Pitch_25", + "11": "Note_Pitch_26", + "12": "Note_Pitch_27", + "13": "Note_Pitch_28", + "14": "Note_Pitch_29", + "15": "Note_Pitch_30", + "16": "Note_Pitch_31", + "17": "Note_Pitch_32", + "18": "Note_Pitch_33", + "19": "Note_Pitch_34", + "20": 
"Note_Pitch_35", + "21": "Note_Pitch_36", + "22": "Note_Pitch_37", + "23": "Note_Pitch_38", + "24": "Note_Pitch_39", + "25": "Note_Pitch_40", + "26": "Note_Pitch_41", + "27": "Note_Pitch_42", + "28": "Note_Pitch_43", + "29": "Note_Pitch_44", + "30": "Note_Pitch_45", + "31": "Note_Pitch_46", + "32": "Note_Pitch_47", + "33": "Note_Pitch_48", + "34": "Note_Pitch_49", + "35": "Note_Pitch_50", + "36": "Note_Pitch_51", + "37": "Note_Pitch_52", + "38": "Note_Pitch_53", + "39": "Note_Pitch_54", + "40": "Note_Pitch_55", + "41": "Note_Pitch_56", + "42": "Note_Pitch_57", + "43": "Note_Pitch_58", + "44": "Note_Pitch_59", + "45": "Note_Pitch_60", + "46": "Note_Pitch_61", + "47": "Note_Pitch_62", + "48": "Note_Pitch_63", + "49": "Note_Pitch_64", + "50": "Note_Pitch_65", + "51": "Note_Pitch_66", + "52": "Note_Pitch_67", + "53": "Note_Pitch_68", + "54": "Note_Pitch_69", + "55": "Note_Pitch_70", + "56": "Note_Pitch_71", + "57": "Note_Pitch_72", + "58": "Note_Pitch_73", + "59": "Note_Pitch_74", + "60": "Note_Pitch_75", + "61": "Note_Pitch_76", + "62": "Note_Pitch_77", + "63": "Note_Pitch_78", + "64": "Note_Pitch_79", + "65": "Note_Pitch_80", + "66": "Note_Pitch_81", + "67": "Note_Pitch_82", + "68": "Note_Pitch_83", + "69": "Note_Pitch_84", + "70": "Note_Pitch_85", + "71": "Note_Pitch_86", + "72": "Note_Pitch_87", + "73": "Note_Pitch_88", + "74": "Note_Pitch_89", + "75": "Note_Pitch_90", + "76": "Note_Pitch_91", + "77": "Note_Pitch_92", + "78": "Note_Pitch_93", + "79": "Note_Pitch_94", + "80": "Note_Pitch_95", + "81": "Note_Pitch_96", + "82": "Note_Pitch_97", + "83": "Note_Pitch_98", + "84": "Note_Pitch_99", + "85": "Note_Pitch_100", + "86": "Note_Pitch_101", + "87": "Note_Pitch_102", + "88": "Note_Pitch_103", + "89": "Note_Pitch_104", + "90": "Note_Pitch_105", + "91": "Note_Pitch_106", + "92": "Note_Pitch_107", + "93": "Note_Pitch_108", + "94": "Note_Pitch_109", + "95": "Note_Pitch_110", + "96": "Note_Pitch_111", + "97": "Note_Pitch_112" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_48", + "3": "Note_Velocity_56", + "4": "Note_Velocity_64", + "5": "Note_Velocity_72", + "6": "Note_Velocity_80", + "7": "Note_Velocity_88" + } +} \ No newline at end of file diff --git a/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json b/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json new file mode 100644 index 0000000..89af462 --- /dev/null +++ b/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json @@ -0,0 +1,328 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_4/4", + "4": "Beat_0", + "5": "Beat_1", + "6": "Beat_2", + "7": "Beat_3", + "8": "Beat_4", + "9": "Beat_5", + "10": "Beat_6", + "11": "Beat_7", + "12": "Beat_8", + "13": "Beat_9", + "14": "Beat_10", + "15": "Beat_11", + "16": "Beat_12", + "17": "Beat_13", + "18": "Beat_14", + "19": "Beat_15", + "20": "Note_Duration_1", + "21": "Note_Duration_10", + "22": "Note_Duration_12", + "23": "Note_Duration_16", + "24": "Note_Duration_2", + "25": "Note_Duration_20", + "26": "Note_Duration_24", + "27": "Note_Duration_28", + "28": "Note_Duration_3", + "29": "Note_Duration_32", + "30": "Note_Duration_4", + "31": "Note_Duration_5", + 
"32": "Note_Duration_6", + "33": "Note_Duration_8", + "34": "Note_Velocity_40", + "35": "Note_Velocity_48", + "36": "Note_Velocity_56", + "37": "Note_Velocity_64", + "38": "Note_Velocity_72", + "39": "Note_Velocity_80", + "40": "Note_Velocity_88", + "41": "Tempo_100", + "42": "Tempo_104", + "43": "Tempo_108", + "44": "Tempo_112", + "45": "Tempo_116", + "46": "Tempo_121", + "47": "Tempo_126", + "48": "Tempo_131", + "49": "Tempo_136", + "50": "Tempo_141", + "51": "Tempo_147", + "52": "Tempo_153", + "53": "Tempo_159", + "54": "Tempo_165", + "55": "Tempo_172", + "56": "Tempo_179", + "57": "Tempo_186", + "58": "Tempo_193", + "59": "Tempo_201", + "60": "Tempo_209", + "61": "Tempo_217", + "62": "Tempo_226", + "63": "Tempo_235", + "64": "Tempo_30", + "65": "Tempo_31", + "66": "Tempo_32", + "67": "Tempo_33", + "68": "Tempo_34", + "69": "Tempo_35", + "70": "Tempo_36", + "71": "Tempo_37", + "72": "Tempo_38", + "73": "Tempo_40", + "74": "Tempo_42", + "75": "Tempo_44", + "76": "Tempo_46", + "77": "Tempo_50", + "78": "Tempo_52", + "79": "Tempo_54", + "80": "Tempo_56", + "81": "Tempo_58", + "82": "Tempo_60", + "83": "Tempo_62", + "84": "Tempo_64", + "85": "Tempo_67", + "86": "Tempo_70", + "87": "Tempo_73", + "88": "Tempo_76", + "89": "Tempo_79", + "90": "Tempo_82", + "91": "Tempo_85", + "92": "Tempo_88", + "93": "Tempo_92", + "94": "Tempo_96", + "95": "Note_Pitch_16", + "96": "Note_Pitch_17", + "97": "Note_Pitch_18", + "98": "Note_Pitch_19", + "99": "Note_Pitch_20", + "100": "Note_Pitch_21", + "101": "Note_Pitch_22", + "102": "Note_Pitch_23", + "103": "Note_Pitch_24", + "104": "Note_Pitch_25", + "105": "Note_Pitch_26", + "106": "Note_Pitch_27", + "107": "Note_Pitch_28", + "108": "Note_Pitch_29", + "109": "Note_Pitch_30", + "110": "Note_Pitch_31", + "111": "Note_Pitch_32", + "112": "Note_Pitch_33", + "113": "Note_Pitch_34", + "114": "Note_Pitch_35", + "115": "Note_Pitch_36", + "116": "Note_Pitch_37", + "117": "Note_Pitch_38", + "118": "Note_Pitch_39", + "119": "Note_Pitch_40", + "120": "Note_Pitch_41", + "121": "Note_Pitch_42", + "122": "Note_Pitch_43", + "123": "Note_Pitch_44", + "124": "Note_Pitch_45", + "125": "Note_Pitch_46", + "126": "Note_Pitch_47", + "127": "Note_Pitch_48", + "128": "Note_Pitch_49", + "129": "Note_Pitch_50", + "130": "Note_Pitch_51", + "131": "Note_Pitch_52", + "132": "Note_Pitch_53", + "133": "Note_Pitch_54", + "134": "Note_Pitch_55", + "135": "Note_Pitch_56", + "136": "Note_Pitch_57", + "137": "Note_Pitch_58", + "138": "Note_Pitch_59", + "139": "Note_Pitch_60", + "140": "Note_Pitch_61", + "141": "Note_Pitch_62", + "142": "Note_Pitch_63", + "143": "Note_Pitch_64", + "144": "Note_Pitch_65", + "145": "Note_Pitch_66", + "146": "Note_Pitch_67", + "147": "Note_Pitch_68", + "148": "Note_Pitch_69", + "149": "Note_Pitch_70", + "150": "Note_Pitch_71", + "151": "Note_Pitch_72", + "152": "Note_Pitch_73", + "153": "Note_Pitch_74", + "154": "Note_Pitch_75", + "155": "Note_Pitch_76", + "156": "Note_Pitch_77", + "157": "Note_Pitch_78", + "158": "Note_Pitch_79", + "159": "Note_Pitch_80", + "160": "Note_Pitch_81", + "161": "Note_Pitch_82", + "162": "Note_Pitch_83", + "163": "Note_Pitch_84", + "164": "Note_Pitch_85", + "165": "Note_Pitch_86", + "166": "Note_Pitch_87", + "167": "Note_Pitch_88", + "168": "Note_Pitch_89", + "169": "Note_Pitch_90", + "170": "Note_Pitch_91", + "171": "Note_Pitch_92", + "172": "Note_Pitch_93", + "173": "Note_Pitch_94", + "174": "Note_Pitch_95", + "175": "Note_Pitch_96", + "176": "Note_Pitch_97", + "177": "Note_Pitch_98", + "178": "Note_Pitch_99", + "179": 
"Note_Pitch_100", + "180": "Note_Pitch_101", + "181": "Note_Pitch_102", + "182": "Note_Pitch_103", + "183": "Note_Pitch_104", + "184": "Note_Pitch_105", + "185": "Note_Pitch_106", + "186": "Note_Pitch_107", + "187": "Note_Pitch_108", + "188": "Note_Pitch_109", + "189": "Note_Pitch_110", + "190": "Note_Pitch_111", + "191": "Note_Pitch_112", + "192": "Instrument_0", + "193": "Chord_A_+", + "194": "Chord_A#_+", + "195": "Chord_B_+", + "196": "Chord_C_+", + "197": "Chord_C#_+", + "198": "Chord_D_+", + "199": "Chord_D#_+", + "200": "Chord_E_+", + "201": "Chord_F_+", + "202": "Chord_F#_+", + "203": "Chord_G_+", + "204": "Chord_G#_+", + "205": "Chord_A_/o7", + "206": "Chord_A#_/o7", + "207": "Chord_B_/o7", + "208": "Chord_C_/o7", + "209": "Chord_C#_/o7", + "210": "Chord_D_/o7", + "211": "Chord_D#_/o7", + "212": "Chord_E_/o7", + "213": "Chord_F_/o7", + "214": "Chord_F#_/o7", + "215": "Chord_G_/o7", + "216": "Chord_G#_/o7", + "217": "Chord_A_7", + "218": "Chord_A#_7", + "219": "Chord_B_7", + "220": "Chord_C_7", + "221": "Chord_C#_7", + "222": "Chord_D_7", + "223": "Chord_D#_7", + "224": "Chord_E_7", + "225": "Chord_F_7", + "226": "Chord_F#_7", + "227": "Chord_G_7", + "228": "Chord_G#_7", + "229": "Chord_A_M", + "230": "Chord_A#_M", + "231": "Chord_B_M", + "232": "Chord_C_M", + "233": "Chord_C#_M", + "234": "Chord_D_M", + "235": "Chord_D#_M", + "236": "Chord_E_M", + "237": "Chord_F_M", + "238": "Chord_F#_M", + "239": "Chord_G_M", + "240": "Chord_G#_M", + "241": "Chord_A_M7", + "242": "Chord_A#_M7", + "243": "Chord_B_M7", + "244": "Chord_C_M7", + "245": "Chord_C#_M7", + "246": "Chord_D_M7", + "247": "Chord_D#_M7", + "248": "Chord_E_M7", + "249": "Chord_F_M7", + "250": "Chord_F#_M7", + "251": "Chord_G_M7", + "252": "Chord_G#_M7", + "253": "Chord_A_m", + "254": "Chord_A#_m", + "255": "Chord_B_m", + "256": "Chord_C_m", + "257": "Chord_C#_m", + "258": "Chord_D_m", + "259": "Chord_D#_m", + "260": "Chord_E_m", + "261": "Chord_F_m", + "262": "Chord_F#_m", + "263": "Chord_G_m", + "264": "Chord_G#_m", + "265": "Chord_A_m7", + "266": "Chord_A#_m7", + "267": "Chord_B_m7", + "268": "Chord_C_m7", + "269": "Chord_C#_m7", + "270": "Chord_D_m7", + "271": "Chord_D#_m7", + "272": "Chord_E_m7", + "273": "Chord_F_m7", + "274": "Chord_F#_m7", + "275": "Chord_G_m7", + "276": "Chord_G#_m7", + "277": "Chord_A_o", + "278": "Chord_A#_o", + "279": "Chord_B_o", + "280": "Chord_C_o", + "281": "Chord_C#_o", + "282": "Chord_D_o", + "283": "Chord_D#_o", + "284": "Chord_E_o", + "285": "Chord_F_o", + "286": "Chord_F#_o", + "287": "Chord_G_o", + "288": "Chord_G#_o", + "289": "Chord_A_o7", + "290": "Chord_A#_o7", + "291": "Chord_B_o7", + "292": "Chord_C_o7", + "293": "Chord_C#_o7", + "294": "Chord_D_o7", + "295": "Chord_D#_o7", + "296": "Chord_E_o7", + "297": "Chord_F_o7", + "298": "Chord_F#_o7", + "299": "Chord_G_o7", + "300": "Chord_G#_o7", + "301": "Chord_A_sus2", + "302": "Chord_A#_sus2", + "303": "Chord_B_sus2", + "304": "Chord_C_sus2", + "305": "Chord_C#_sus2", + "306": "Chord_D_sus2", + "307": "Chord_D#_sus2", + "308": "Chord_E_sus2", + "309": "Chord_F_sus2", + "310": "Chord_F#_sus2", + "311": "Chord_G_sus2", + "312": "Chord_G#_sus2", + "313": "Chord_A_sus4", + "314": "Chord_A#_sus4", + "315": "Chord_B_sus4", + "316": "Chord_C_sus4", + "317": "Chord_C#_sus4", + "318": "Chord_D_sus4", + "319": "Chord_D#_sus4", + "320": "Chord_E_sus4", + "321": "Chord_F_sus4", + "322": "Chord_F#_sus4", + "323": "Chord_G_sus4", + "324": "Chord_G#_sus4", + "325": "Chord_N_N" +} \ No newline at end of file diff --git 
a/vocab/vocab_Pop909/vocab_Pop909_nb8.json b/vocab/vocab_Pop909/vocab_Pop909_nb8.json new file mode 100644 index 0000000..f080385 --- /dev/null +++ b/vocab/vocab_Pop909/vocab_Pop909_nb8.json @@ -0,0 +1,350 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_6/8", + "7": "NNN_time_signature_1/8", + "8": "NNN_time_signature_4/4", + "9": "NNN_time_signature_3/4", + "10": "NNN_time_signature_2/4", + "11": "NNN_time_signature_1/4", + "12": "NNN_time_signature_2/2" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": 
"Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_34", + "4": "Tempo_36", + "5": "Tempo_37", + "6": "Tempo_38", + "7": "Tempo_40", + "8": "Tempo_42", + "9": "Tempo_44", + "10": "Tempo_46", + "11": "Tempo_48", + "12": "Tempo_50", + "13": "Tempo_52", + "14": "Tempo_54", + "15": "Tempo_56", + "16": "Tempo_58", + "17": "Tempo_60", + "18": "Tempo_62", + "19": "Tempo_64", + "20": "Tempo_67", + "21": "Tempo_70", + "22": "Tempo_73", + "23": "Tempo_76", + "24": "Tempo_79", + "25": "Tempo_82", + "26": "Tempo_85", + "27": "Tempo_88", + "28": "Tempo_92", + "29": "Tempo_96", + "30": "Tempo_100", + "31": "Tempo_104", + "32": "Tempo_108", + "33": "Tempo_112", + "34": "Tempo_116", + "35": "Tempo_121", + "36": "Tempo_126", + "37": "Tempo_131", + "38": "Tempo_136", + "39": "Tempo_141", + "40": "Tempo_147", + "41": "Tempo_159", + "42": "Tempo_172", + "43": "Tempo_179", + "44": "Tempo_193" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_17", + "2": "Note_Pitch_18", + "3": "Note_Pitch_19", + "4": "Note_Pitch_20", + "5": "Note_Pitch_21", + "6": "Note_Pitch_22", + "7": "Note_Pitch_23", + "8": "Note_Pitch_24", + "9": "Note_Pitch_25", + "10": "Note_Pitch_26", + "11": "Note_Pitch_27", + "12": "Note_Pitch_28", + "13": "Note_Pitch_29", + "14": "Note_Pitch_30", + "15": "Note_Pitch_31", + "16": "Note_Pitch_32", + "17": "Note_Pitch_33", + "18": "Note_Pitch_34", + "19": "Note_Pitch_35", + "20": "Note_Pitch_36", + "21": "Note_Pitch_37", + "22": "Note_Pitch_38", + "23": "Note_Pitch_39", + "24": "Note_Pitch_40", + "25": "Note_Pitch_41", + "26": "Note_Pitch_42", + "27": "Note_Pitch_43", + "28": "Note_Pitch_44", + "29": "Note_Pitch_45", + "30": "Note_Pitch_46", + "31": "Note_Pitch_47", + "32": "Note_Pitch_48", + "33": "Note_Pitch_49", + "34": "Note_Pitch_50", + "35": "Note_Pitch_51", + "36": "Note_Pitch_52", + "37": "Note_Pitch_53", + "38": "Note_Pitch_54", + "39": "Note_Pitch_55", + "40": "Note_Pitch_56", + "41": "Note_Pitch_57", + "42": "Note_Pitch_58", + "43": "Note_Pitch_59", + "44": "Note_Pitch_60", + "45": "Note_Pitch_61", + "46": "Note_Pitch_62", + "47": "Note_Pitch_63", + "48": "Note_Pitch_64", + "49": "Note_Pitch_65", + "50": "Note_Pitch_66", + "51": "Note_Pitch_67", + "52": "Note_Pitch_68", + "53": "Note_Pitch_69", + "54": "Note_Pitch_70", + "55": "Note_Pitch_71", + "56": "Note_Pitch_72", + "57": "Note_Pitch_73", + "58": "Note_Pitch_74", + "59": "Note_Pitch_75", + "60": "Note_Pitch_76", + "61": "Note_Pitch_77", + "62": "Note_Pitch_78", + "63": "Note_Pitch_79", + "64": "Note_Pitch_80", + "65": "Note_Pitch_81", + "66": "Note_Pitch_82", + "67": "Note_Pitch_83", + "68": "Note_Pitch_84", + "69": "Note_Pitch_85", + "70": "Note_Pitch_86", + "71": "Note_Pitch_87", + "72": "Note_Pitch_88", + "73": "Note_Pitch_89", + "74": "Note_Pitch_90", + "75": "Note_Pitch_91", + "76": "Note_Pitch_92", + "77": "Note_Pitch_93", + "78": "Note_Pitch_94", + "79": "Note_Pitch_95", + "80": "Note_Pitch_96", + "81": "Note_Pitch_97", + "82": "Note_Pitch_98", + "83": "Note_Pitch_99", + "84": "Note_Pitch_100", + "85": "Note_Pitch_101", + "86": "Note_Pitch_102", + "87": "Note_Pitch_103", + "88": "Note_Pitch_104", + "89": 
"Note_Pitch_105", + "90": "Note_Pitch_106", + "91": "Note_Pitch_107", + "92": "Note_Pitch_108", + "93": "Note_Pitch_109", + "94": "Note_Pitch_110", + "95": "Note_Pitch_111", + "96": "Note_Pitch_112", + "97": "Note_Pitch_113", + "98": "Note_Pitch_114", + "99": "Note_Pitch_115" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_Pop909/vocab_Pop909_remi8.json b/vocab/vocab_Pop909/vocab_Pop909_remi8.json new file mode 100644 index 0000000..812601e --- /dev/null +++ b/vocab/vocab_Pop909/vocab_Pop909_remi8.json @@ -0,0 +1,312 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/4", + "4": "Bar_time_signature_1/8", + "5": "Bar_time_signature_2/2", + "6": "Bar_time_signature_2/4", + "7": "Bar_time_signature_3/4", + "8": "Bar_time_signature_4/4", + "9": "Bar_time_signature_6/8", + "10": "Beat_0", + "11": "Beat_1", + "12": "Beat_2", + "13": "Beat_3", + "14": "Beat_4", + "15": "Beat_5", + "16": "Beat_6", + "17": "Beat_7", + "18": "Beat_8", + "19": "Beat_9", + "20": "Beat_10", + "21": "Beat_11", + "22": "Beat_12", + "23": "Beat_13", + "24": "Beat_14", + "25": "Beat_15", + "26": "Note_Duration_1", + "27": "Note_Duration_10", + "28": "Note_Duration_12", + "29": "Note_Duration_16", + "30": "Note_Duration_2", + "31": "Note_Duration_20", + "32": "Note_Duration_24", + "33": "Note_Duration_28", + "34": "Note_Duration_3", + "35": "Note_Duration_32", + "36": "Note_Duration_4", + "37": "Note_Duration_5", + "38": "Note_Duration_6", + "39": "Note_Duration_8", + "40": "Note_Velocity_100", + "41": "Note_Velocity_120", + "42": "Note_Velocity_40", + "43": "Note_Velocity_60", + "44": "Note_Velocity_80", + "45": "Tempo_100", + "46": "Tempo_104", + "47": "Tempo_108", + "48": "Tempo_112", + "49": "Tempo_116", + "50": "Tempo_121", + "51": "Tempo_126", + "52": "Tempo_131", + "53": "Tempo_136", + "54": "Tempo_141", + "55": "Tempo_147", + "56": "Tempo_159", + "57": "Tempo_172", + "58": "Tempo_179", + "59": "Tempo_193", + "60": "Tempo_30", + "61": "Tempo_31", + "62": "Tempo_34", + "63": "Tempo_36", + "64": "Tempo_37", + "65": "Tempo_38", + "66": "Tempo_40", + "67": "Tempo_42", + "68": "Tempo_44", + "69": "Tempo_46", + "70": "Tempo_48", + "71": "Tempo_50", + "72": "Tempo_52", + "73": "Tempo_54", + "74": "Tempo_56", + "75": "Tempo_58", + "76": "Tempo_60", + "77": "Tempo_62", + "78": "Tempo_64", + "79": "Tempo_67", + "80": "Tempo_70", + "81": "Tempo_73", + "82": "Tempo_76", + "83": "Tempo_79", + "84": "Tempo_82", + "85": "Tempo_85", + "86": "Tempo_88", + "87": "Tempo_92", + "88": "Tempo_96", + "89": "Note_Pitch_17", + "90": "Note_Pitch_18", + "91": "Note_Pitch_19", + "92": "Note_Pitch_20", + "93": "Note_Pitch_21", + "94": "Note_Pitch_22", + "95": "Note_Pitch_23", + "96": "Note_Pitch_24", + "97": "Note_Pitch_25", + "98": "Note_Pitch_26", + "99": "Note_Pitch_27", + "100": "Note_Pitch_28", + "101": "Note_Pitch_29", + "102": "Note_Pitch_30", + "103": "Note_Pitch_31", + "104": "Note_Pitch_32", + "105": "Note_Pitch_33", + 
"106": "Note_Pitch_34", + "107": "Note_Pitch_35", + "108": "Note_Pitch_36", + "109": "Note_Pitch_37", + "110": "Note_Pitch_38", + "111": "Note_Pitch_39", + "112": "Note_Pitch_40", + "113": "Note_Pitch_41", + "114": "Note_Pitch_42", + "115": "Note_Pitch_43", + "116": "Note_Pitch_44", + "117": "Note_Pitch_45", + "118": "Note_Pitch_46", + "119": "Note_Pitch_47", + "120": "Note_Pitch_48", + "121": "Note_Pitch_49", + "122": "Note_Pitch_50", + "123": "Note_Pitch_51", + "124": "Note_Pitch_52", + "125": "Note_Pitch_53", + "126": "Note_Pitch_54", + "127": "Note_Pitch_55", + "128": "Note_Pitch_56", + "129": "Note_Pitch_57", + "130": "Note_Pitch_58", + "131": "Note_Pitch_59", + "132": "Note_Pitch_60", + "133": "Note_Pitch_61", + "134": "Note_Pitch_62", + "135": "Note_Pitch_63", + "136": "Note_Pitch_64", + "137": "Note_Pitch_65", + "138": "Note_Pitch_66", + "139": "Note_Pitch_67", + "140": "Note_Pitch_68", + "141": "Note_Pitch_69", + "142": "Note_Pitch_70", + "143": "Note_Pitch_71", + "144": "Note_Pitch_72", + "145": "Note_Pitch_73", + "146": "Note_Pitch_74", + "147": "Note_Pitch_75", + "148": "Note_Pitch_76", + "149": "Note_Pitch_77", + "150": "Note_Pitch_78", + "151": "Note_Pitch_79", + "152": "Note_Pitch_80", + "153": "Note_Pitch_81", + "154": "Note_Pitch_82", + "155": "Note_Pitch_83", + "156": "Note_Pitch_84", + "157": "Note_Pitch_85", + "158": "Note_Pitch_86", + "159": "Note_Pitch_87", + "160": "Note_Pitch_88", + "161": "Note_Pitch_89", + "162": "Note_Pitch_90", + "163": "Note_Pitch_91", + "164": "Note_Pitch_92", + "165": "Note_Pitch_93", + "166": "Note_Pitch_94", + "167": "Note_Pitch_95", + "168": "Note_Pitch_96", + "169": "Note_Pitch_97", + "170": "Note_Pitch_98", + "171": "Note_Pitch_99", + "172": "Note_Pitch_100", + "173": "Note_Pitch_101", + "174": "Note_Pitch_102", + "175": "Note_Pitch_103", + "176": "Note_Pitch_104", + "177": "Note_Pitch_105", + "178": "Note_Pitch_106", + "179": "Note_Pitch_107", + "180": "Note_Pitch_108", + "181": "Note_Pitch_109", + "182": "Note_Pitch_110", + "183": "Note_Pitch_111", + "184": "Note_Pitch_112", + "185": "Note_Pitch_113", + "186": "Note_Pitch_114", + "187": "Note_Pitch_115", + "188": "Instrument_0", + "189": "Chord_A_+", + "190": "Chord_A#_+", + "191": "Chord_B_+", + "192": "Chord_C_+", + "193": "Chord_C#_+", + "194": "Chord_D_+", + "195": "Chord_D#_+", + "196": "Chord_E_+", + "197": "Chord_F_+", + "198": "Chord_F#_+", + "199": "Chord_G_+", + "200": "Chord_G#_+", + "201": "Chord_A_/o7", + "202": "Chord_A#_/o7", + "203": "Chord_B_/o7", + "204": "Chord_C_/o7", + "205": "Chord_C#_/o7", + "206": "Chord_D#_/o7", + "207": "Chord_E_/o7", + "208": "Chord_F_/o7", + "209": "Chord_F#_/o7", + "210": "Chord_G_/o7", + "211": "Chord_G#_/o7", + "212": "Chord_A_7", + "213": "Chord_A#_7", + "214": "Chord_B_7", + "215": "Chord_C_7", + "216": "Chord_C#_7", + "217": "Chord_D_7", + "218": "Chord_D#_7", + "219": "Chord_E_7", + "220": "Chord_F_7", + "221": "Chord_F#_7", + "222": "Chord_G_7", + "223": "Chord_G#_7", + "224": "Chord_A_M", + "225": "Chord_A#_M", + "226": "Chord_B_M", + "227": "Chord_C_M", + "228": "Chord_C#_M", + "229": "Chord_D_M", + "230": "Chord_D#_M", + "231": "Chord_E_M", + "232": "Chord_F_M", + "233": "Chord_F#_M", + "234": "Chord_G_M", + "235": "Chord_G#_M", + "236": "Chord_A_M7", + "237": "Chord_A#_M7", + "238": "Chord_B_M7", + "239": "Chord_C_M7", + "240": "Chord_C#_M7", + "241": "Chord_D_M7", + "242": "Chord_D#_M7", + "243": "Chord_E_M7", + "244": "Chord_F_M7", + "245": "Chord_F#_M7", + "246": "Chord_G_M7", + "247": "Chord_G#_M7", + "248": "Chord_A_m", + 
"249": "Chord_A#_m", + "250": "Chord_B_m", + "251": "Chord_C_m", + "252": "Chord_C#_m", + "253": "Chord_D_m", + "254": "Chord_D#_m", + "255": "Chord_E_m", + "256": "Chord_F_m", + "257": "Chord_F#_m", + "258": "Chord_G_m", + "259": "Chord_G#_m", + "260": "Chord_A_m7", + "261": "Chord_A#_m7", + "262": "Chord_B_m7", + "263": "Chord_C_m7", + "264": "Chord_C#_m7", + "265": "Chord_D_m7", + "266": "Chord_D#_m7", + "267": "Chord_E_m7", + "268": "Chord_F_m7", + "269": "Chord_F#_m7", + "270": "Chord_G_m7", + "271": "Chord_G#_m7", + "272": "Chord_A_o", + "273": "Chord_A#_o", + "274": "Chord_B_o", + "275": "Chord_C_o", + "276": "Chord_C#_o", + "277": "Chord_D_o", + "278": "Chord_D#_o", + "279": "Chord_E_o", + "280": "Chord_F_o", + "281": "Chord_F#_o", + "282": "Chord_G_o", + "283": "Chord_G#_o", + "284": "Chord_E_o7", + "285": "Chord_A_sus2", + "286": "Chord_A#_sus2", + "287": "Chord_B_sus2", + "288": "Chord_C_sus2", + "289": "Chord_C#_sus2", + "290": "Chord_D_sus2", + "291": "Chord_D#_sus2", + "292": "Chord_E_sus2", + "293": "Chord_F_sus2", + "294": "Chord_F#_sus2", + "295": "Chord_G_sus2", + "296": "Chord_G#_sus2", + "297": "Chord_A_sus4", + "298": "Chord_A#_sus4", + "299": "Chord_B_sus4", + "300": "Chord_C_sus4", + "301": "Chord_C#_sus4", + "302": "Chord_D_sus4", + "303": "Chord_D#_sus4", + "304": "Chord_E_sus4", + "305": "Chord_F_sus4", + "306": "Chord_F#_sus4", + "307": "Chord_G_sus4", + "308": "Chord_G#_sus4", + "309": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json b/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": 
"Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": 
"Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + 
"114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": 
"Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_cp5.json b/vocab/vocab_SOD/vocab_SOD_cp5.json new file mode 100644 index 0000000..a81d79f --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_cp5.json @@ -0,0 +1,340 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Metrical", + "3": "Note" + }, + "beat": { + "0": 0, + "1": "Bar", + "2": "Bar_time_signature_1/1", + "3": "Bar_time_signature_1/2", + "4": "Bar_time_signature_1/4", + "5": "Bar_time_signature_1/8", + "6": "Bar_time_signature_11/8", + "7": "Bar_time_signature_12/8", + "8": "Bar_time_signature_2/2", + "9": "Bar_time_signature_2/4", + "10": "Bar_time_signature_2/8", + "11": "Bar_time_signature_3/2", + "12": "Bar_time_signature_3/4", + "13": "Bar_time_signature_3/8", + "14": "Bar_time_signature_4/2", + "15": "Bar_time_signature_4/4", + "16": "Bar_time_signature_4/8", + "17": "Bar_time_signature_5/4", + "18": "Bar_time_signature_5/8", + "19": "Bar_time_signature_6/4", + "20": "Bar_time_signature_6/8", + "21": "Bar_time_signature_7/4", + "22": "Bar_time_signature_7/8", + "23": "Bar_time_signature_8/4", + "24": "Bar_time_signature_8/8", + "25": "Bar_time_signature_9/8", + "26": "Beat_0", + "27": "Beat_1", + "28": "Beat_2", + "29": "Beat_3", + "30": "Beat_4", + "31": "Beat_5", + "32": "Beat_6", + "33": "Beat_7", + "34": "Beat_8", + "35": "Beat_9", + "36": "Beat_10", + "37": "Beat_11", + "38": "Beat_12", + "39": "Beat_13", + "40": "Beat_14", + "41": "Beat_15", + "42": "Beat_16", + "43": "Beat_17", + "44": "Beat_18", + "45": "Beat_19", + "46": "Beat_20", + "47": "Beat_21", + "48": "Beat_22", + "49": "Beat_23", + "50": "Beat_24", + "51": "Beat_25", + "52": "Beat_26", + "53": "Beat_27", + "54": "Beat_28", + "55": "Beat_29", + "56": "Beat_30", + "57": "Beat_31", + "58": "Beat_32", + "59": "Beat_33", + "60": "Beat_34", + "61": "Beat_35", + "62": "Beat_36", + "63": "Beat_37", + "64": "Beat_38", + "65": "Beat_39", + "66": "Beat_40", + "67": "Beat_41", + "68": "Beat_42", + "69": "Beat_43", + "70": "Beat_44", + "71": "Beat_45", + "72": "Beat_46", + "73": "Beat_47", + "74": "Beat_48", + "75": "Beat_49", + "76": "Beat_50", + "77": "Beat_51", + "78": "Beat_52", + "79": "Beat_53", + "80": "Beat_54", + "81": "Beat_55", + "82": "Beat_56", + "83": "Beat_57", + "84": "Beat_58", + "85": "Beat_59", + "86": "Beat_60", + "87": "Beat_61", + "88": "Beat_62", + "89": "Beat_63", + "90": "Beat_64", + "91": "Beat_65", + "92": "Beat_66", + "93": "Beat_67", + "94": "Beat_68", + "95": "Beat_69", + "96": "Beat_70", + "97": "Beat_71", + "98": "Beat_72", + "99": "Beat_73", + "100": "Beat_74", + "101": "Beat_75", + "102": "Beat_76", + "103": "Beat_77", + "104": "Beat_78", + "105": "Beat_79", + "106": "Beat_80", + "107": "Beat_81", + "108": "Beat_82", + "109": "Beat_83", + "110": "Beat_84", + "111": "Beat_85", + "112": "Beat_86", + "113": "Beat_87", + "114": "Beat_88", + "115": "Beat_89", + "116": "Beat_90", + "117": "Beat_91", + "118": "Beat_92", + "119": "Beat_93", + "120": "Beat_94", + "121": "Beat_95" + }, + "instrument": { + "0": 0, + "1": 
"Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": "Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + 
"82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_nb5.json b/vocab/vocab_SOD/vocab_SOD_nb5.json new file mode 100644 index 0000000..aeb5d1e --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_nb5.json @@ -0,0 +1,341 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31", + "33": "Beat_32", + "34": "Beat_33", + "35": "Beat_34", + "36": "Beat_35", + "37": "Beat_36", + "38": "Beat_37", + "39": "Beat_38", + "40": "Beat_39", + "41": "Beat_40", + "42": "Beat_41", + "43": "Beat_42", + "44": "Beat_43", + "45": 
"Beat_44", + "46": "Beat_45", + "47": "Beat_46", + "48": "Beat_47", + "49": "Beat_48", + "50": "Beat_49", + "51": "Beat_50", + "52": "Beat_51", + "53": "Beat_52", + "54": "Beat_53", + "55": "Beat_54", + "56": "Beat_55", + "57": "Beat_56", + "58": "Beat_57", + "59": "Beat_58", + "60": "Beat_59", + "61": "Beat_60", + "62": "Beat_61", + "63": "Beat_62", + "64": "Beat_63", + "65": "Beat_64", + "66": "Beat_65", + "67": "Beat_66", + "68": "Beat_67", + "69": "Beat_68", + "70": "Beat_69", + "71": "Beat_70", + "72": "Beat_71", + "73": "Beat_72", + "74": "Beat_73", + "75": "Beat_74", + "76": "Beat_75", + "77": "Beat_76", + "78": "Beat_77", + "79": "Beat_78", + "80": "Beat_79", + "81": "Beat_80", + "82": "Beat_81", + "83": "Beat_82", + "84": "Beat_83", + "85": "Beat_84", + "86": "Beat_85", + "87": "Beat_86", + "88": "Beat_87", + "89": "Beat_88", + "90": "Beat_89", + "91": "Beat_90", + "92": "Beat_91", + "93": "Beat_92", + "94": "Beat_93", + "95": "Beat_94", + "96": "Beat_95" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": "Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": 
"Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_nb8.json b/vocab/vocab_SOD/vocab_SOD_nb8.json new file mode 100644 index 0000000..e3a24fb --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_nb8.json @@ -0,0 +1,572 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": 
"NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31", + "33": "Beat_32", + "34": "Beat_33", + "35": "Beat_34", + "36": "Beat_35", + "37": "Beat_36", + "38": "Beat_37", + "39": "Beat_38", + "40": "Beat_39", + "41": "Beat_40", + "42": "Beat_41", + "43": "Beat_42", + "44": "Beat_43", + "45": "Beat_44", + "46": "Beat_45", + "47": "Beat_46", + "48": "Beat_47", + "49": "Beat_48", + "50": "Beat_49", + "51": "Beat_50", + "52": "Beat_51", + "53": "Beat_52", + "54": "Beat_53", + "55": "Beat_54", + "56": "Beat_55", + "57": "Beat_56", + "58": "Beat_57", + "59": "Beat_58", + "60": "Beat_59", + "61": "Beat_60", + "62": "Beat_61", + "63": "Beat_62", + "64": "Beat_63", + "65": "Beat_64", + "66": "Beat_65", + "67": "Beat_66", + "68": "Beat_67", + "69": "Beat_68", + "70": "Beat_69", + "71": "Beat_70", + "72": "Beat_71", + "73": "Beat_72", + "74": "Beat_73", + "75": "Beat_74", + "76": "Beat_75", + "77": "Beat_76", + "78": "Beat_77", + "79": "Beat_78", + "80": "Beat_79", + "81": "Beat_80", + "82": "Beat_81", + "83": "Beat_82", + "84": "Beat_83", + "85": "Beat_84", + "86": "Beat_85", + "87": "Beat_86", + "88": "Beat_87", + "89": "Beat_88", + "90": "Beat_89", + "91": "Beat_90", + "92": "Beat_91", + "93": "Beat_92", + "94": "Beat_93", + "95": "Beat_94", + "96": "Beat_95" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": 
"Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_53", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_77", + "29": "Tempo_79", + "30": "Tempo_82", + "31": "Tempo_85", + "32": "Tempo_88", + "33": "Tempo_92", + "34": "Tempo_94", + "35": "Tempo_96", + "36": "Tempo_100", + "37": "Tempo_103", + "38": "Tempo_104", + "39": "Tempo_108", + "40": "Tempo_112", + "41": "Tempo_113", + "42": "Tempo_116", + "43": "Tempo_121", + "44": "Tempo_124", + "45": "Tempo_126", + "46": "Tempo_131", + "47": "Tempo_136", + "48": "Tempo_141", + "49": "Tempo_147", + "50": "Tempo_150", + "51": "Tempo_153", + "52": "Tempo_159", + "53": "Tempo_165", + "54": "Tempo_172", + "55": "Tempo_179", + "56": "Tempo_182", + "57": "Tempo_186", + "58": "Tempo_193", + "59": "Tempo_200", + "60": "Tempo_201", + "61": "Tempo_209", + "62": "Tempo_217", + "63": "Tempo_220", + "64": "Tempo_226", + "65": "Tempo_235", + "66": "Tempo_242", + "67": "Tempo_244", + "68": "Tempo_254", + "69": "Tempo_264", + "70": "Tempo_266", + "71": "Tempo_275", + "72": "Tempo_286", + "73": "Tempo_293", + "74": "Tempo_297", + "75": "Tempo_309", + "76": "Tempo_321", + "77": "Tempo_322", + "78": "Tempo_334", + "79": "Tempo_347", + "80": "Tempo_354", + "81": "Tempo_361", + "82": "Tempo_375", + "83": "Tempo_389", + "84": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": 
"Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": 
"Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_remi5.json b/vocab/vocab_SOD/vocab_SOD_remi5.json new file mode 100644 index 0000000..c6373bd --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_remi5.json @@ -0,0 +1,324 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/1", + "4": "Bar_time_signature_1/2", + "5": "Bar_time_signature_1/4", + "6": "Bar_time_signature_1/8", + "7": "Bar_time_signature_11/8", + "8": "Bar_time_signature_12/8", + "9": "Bar_time_signature_2/2", + "10": "Bar_time_signature_2/4", + "11": "Bar_time_signature_2/8", + "12": "Bar_time_signature_3/2", + "13": "Bar_time_signature_3/4", + "14": "Bar_time_signature_3/8", + "15": "Bar_time_signature_4/2", + "16": "Bar_time_signature_4/4", + "17": "Bar_time_signature_4/8", + "18": "Bar_time_signature_5/4", + "19": "Bar_time_signature_5/8", + "20": "Bar_time_signature_6/4", + "21": "Bar_time_signature_6/8", + "22": "Bar_time_signature_7/4", + "23": "Bar_time_signature_7/8", + "24": "Bar_time_signature_8/4", + "25": "Bar_time_signature_8/8", + "26": "Bar_time_signature_9/8", + "27": "Beat_0", + "28": "Beat_1", + "29": "Beat_2", + "30": "Beat_3", + "31": "Beat_4", + "32": "Beat_5", + "33": "Beat_6", + "34": "Beat_7", + "35": "Beat_8", + "36": "Beat_9", + "37": "Beat_10", + "38": "Beat_11", + "39": "Beat_12", + "40": "Beat_13", + "41": "Beat_14", + "42": "Beat_15", + "43": "Beat_16", + "44": "Beat_17", + "45": "Beat_18", + "46": "Beat_19", + "47": "Beat_20", + "48": "Beat_21", + "49": "Beat_22", + "50": "Beat_23", + "51": "Beat_24", + "52": "Beat_25", + "53": "Beat_26", + "54": "Beat_27", + "55": "Beat_28", + "56": "Beat_29", + "57": "Beat_30", + "58": "Beat_31", + "59": "Beat_32", + "60": "Beat_33", + "61": "Beat_34", + "62": "Beat_35", + "63": "Beat_36", + "64": "Beat_37", + "65": "Beat_38", + "66": "Beat_39", + "67": "Beat_40", + "68": "Beat_41", + "69": "Beat_42", + "70": "Beat_43", + "71": "Beat_44", + "72": "Beat_45", + "73": "Beat_46", + "74": "Beat_47", + "75": "Beat_48", + "76": "Beat_49", + "77": "Beat_50", + "78": "Beat_51", + "79": "Beat_52", + "80": "Beat_53", 
+ "81": "Beat_54", + "82": "Beat_55", + "83": "Beat_56", + "84": "Beat_57", + "85": "Beat_58", + "86": "Beat_59", + "87": "Beat_60", + "88": "Beat_61", + "89": "Beat_62", + "90": "Beat_63", + "91": "Beat_64", + "92": "Beat_65", + "93": "Beat_66", + "94": "Beat_67", + "95": "Beat_68", + "96": "Beat_69", + "97": "Beat_70", + "98": "Beat_71", + "99": "Beat_72", + "100": "Beat_73", + "101": "Beat_74", + "102": "Beat_75", + "103": "Beat_76", + "104": "Beat_77", + "105": "Beat_78", + "106": "Beat_79", + "107": "Beat_80", + "108": "Beat_81", + "109": "Beat_82", + "110": "Beat_83", + "111": "Beat_84", + "112": "Beat_85", + "113": "Beat_86", + "114": "Beat_87", + "115": "Beat_88", + "116": "Beat_89", + "117": "Beat_90", + "118": "Beat_91", + "119": "Beat_92", + "120": "Beat_93", + "121": "Beat_94", + "122": "Beat_95", + "123": "Note_Duration_1", + "124": "Note_Duration_12", + "125": "Note_Duration_15", + "126": "Note_Duration_18", + "127": "Note_Duration_2", + "128": "Note_Duration_24", + "129": "Note_Duration_3", + "130": "Note_Duration_30", + "131": "Note_Duration_36", + "132": "Note_Duration_4", + "133": "Note_Duration_42", + "134": "Note_Duration_48", + "135": "Note_Duration_54", + "136": "Note_Duration_6", + "137": "Note_Duration_60", + "138": "Note_Duration_72", + "139": "Note_Duration_84", + "140": "Note_Duration_9", + "141": "Note_Duration_96", + "142": "Note_Pitch_6", + "143": "Note_Pitch_7", + "144": "Note_Pitch_8", + "145": "Note_Pitch_9", + "146": "Note_Pitch_10", + "147": "Note_Pitch_11", + "148": "Note_Pitch_12", + "149": "Note_Pitch_13", + "150": "Note_Pitch_14", + "151": "Note_Pitch_15", + "152": "Note_Pitch_16", + "153": "Note_Pitch_17", + "154": "Note_Pitch_18", + "155": "Note_Pitch_19", + "156": "Note_Pitch_20", + "157": "Note_Pitch_21", + "158": "Note_Pitch_22", + "159": "Note_Pitch_23", + "160": "Note_Pitch_24", + "161": "Note_Pitch_25", + "162": "Note_Pitch_26", + "163": "Note_Pitch_27", + "164": "Note_Pitch_28", + "165": "Note_Pitch_29", + "166": "Note_Pitch_30", + "167": "Note_Pitch_31", + "168": "Note_Pitch_32", + "169": "Note_Pitch_33", + "170": "Note_Pitch_34", + "171": "Note_Pitch_35", + "172": "Note_Pitch_36", + "173": "Note_Pitch_37", + "174": "Note_Pitch_38", + "175": "Note_Pitch_39", + "176": "Note_Pitch_40", + "177": "Note_Pitch_41", + "178": "Note_Pitch_42", + "179": "Note_Pitch_43", + "180": "Note_Pitch_44", + "181": "Note_Pitch_45", + "182": "Note_Pitch_46", + "183": "Note_Pitch_47", + "184": "Note_Pitch_48", + "185": "Note_Pitch_49", + "186": "Note_Pitch_50", + "187": "Note_Pitch_51", + "188": "Note_Pitch_52", + "189": "Note_Pitch_53", + "190": "Note_Pitch_54", + "191": "Note_Pitch_55", + "192": "Note_Pitch_56", + "193": "Note_Pitch_57", + "194": "Note_Pitch_58", + "195": "Note_Pitch_59", + "196": "Note_Pitch_60", + "197": "Note_Pitch_61", + "198": "Note_Pitch_62", + "199": "Note_Pitch_63", + "200": "Note_Pitch_64", + "201": "Note_Pitch_65", + "202": "Note_Pitch_66", + "203": "Note_Pitch_67", + "204": "Note_Pitch_68", + "205": "Note_Pitch_69", + "206": "Note_Pitch_70", + "207": "Note_Pitch_71", + "208": "Note_Pitch_72", + "209": "Note_Pitch_73", + "210": "Note_Pitch_74", + "211": "Note_Pitch_75", + "212": "Note_Pitch_76", + "213": "Note_Pitch_77", + "214": "Note_Pitch_78", + "215": "Note_Pitch_79", + "216": "Note_Pitch_80", + "217": "Note_Pitch_81", + "218": "Note_Pitch_82", + "219": "Note_Pitch_83", + "220": "Note_Pitch_84", + "221": "Note_Pitch_85", + "222": "Note_Pitch_86", + "223": "Note_Pitch_87", + "224": "Note_Pitch_88", + "225": "Note_Pitch_89", + "226": 
"Note_Pitch_90", + "227": "Note_Pitch_91", + "228": "Note_Pitch_92", + "229": "Note_Pitch_93", + "230": "Note_Pitch_94", + "231": "Note_Pitch_95", + "232": "Note_Pitch_96", + "233": "Note_Pitch_97", + "234": "Note_Pitch_98", + "235": "Note_Pitch_99", + "236": "Note_Pitch_100", + "237": "Note_Pitch_101", + "238": "Note_Pitch_102", + "239": "Note_Pitch_103", + "240": "Note_Pitch_104", + "241": "Note_Pitch_105", + "242": "Note_Pitch_106", + "243": "Note_Pitch_107", + "244": "Note_Pitch_108", + "245": "Note_Pitch_109", + "246": "Note_Pitch_110", + "247": "Note_Pitch_111", + "248": "Note_Pitch_112", + "249": "Note_Pitch_113", + "250": "Note_Pitch_114", + "251": "Note_Pitch_115", + "252": "Note_Pitch_116", + "253": "Note_Pitch_117", + "254": "Note_Pitch_118", + "255": "Note_Pitch_119", + "256": "Note_Pitch_120", + "257": "Note_Pitch_121", + "258": "Note_Pitch_122", + "259": "Note_Pitch_123", + "260": "Note_Pitch_124", + "261": "Instrument_0", + "262": "Instrument_4", + "263": "Instrument_6", + "264": "Instrument_7", + "265": "Instrument_8", + "266": "Instrument_9", + "267": "Instrument_11", + "268": "Instrument_12", + "269": "Instrument_13", + "270": "Instrument_14", + "271": "Instrument_15", + "272": "Instrument_16", + "273": "Instrument_19", + "274": "Instrument_21", + "275": "Instrument_22", + "276": "Instrument_23", + "277": "Instrument_24", + "278": "Instrument_25", + "279": "Instrument_26", + "280": "Instrument_32", + "281": "Instrument_33", + "282": "Instrument_36", + "283": "Instrument_38", + "284": "Instrument_40", + "285": "Instrument_41", + "286": "Instrument_42", + "287": "Instrument_43", + "288": "Instrument_46", + "289": "Instrument_47", + "290": "Instrument_49", + "291": "Instrument_50", + "292": "Instrument_52", + "293": "Instrument_55", + "294": "Instrument_56", + "295": "Instrument_57", + "296": "Instrument_58", + "297": "Instrument_60", + "298": "Instrument_61", + "299": "Instrument_62", + "300": "Instrument_64", + "301": "Instrument_65", + "302": "Instrument_66", + "303": "Instrument_67", + "304": "Instrument_68", + "305": "Instrument_69", + "306": "Instrument_70", + "307": "Instrument_71", + "308": "Instrument_72", + "309": "Instrument_73", + "310": "Instrument_74", + "311": "Instrument_75", + "312": "Instrument_79", + "313": "Instrument_80", + "314": "Instrument_88", + "315": "Instrument_105", + "316": "Instrument_108", + "317": "Instrument_109", + "318": "Instrument_111", + "319": "Instrument_114", + "320": "Instrument_117", + "321": "Instrument_118" +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_remi8.json b/vocab/vocab_SOD/vocab_SOD_remi8.json new file mode 100644 index 0000000..e5e252a --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_remi8.json @@ -0,0 +1,546 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/1", + "4": "Bar_time_signature_1/2", + "5": "Bar_time_signature_1/4", + "6": "Bar_time_signature_1/8", + "7": "Bar_time_signature_11/8", + "8": "Bar_time_signature_12/8", + "9": "Bar_time_signature_2/2", + "10": "Bar_time_signature_2/4", + "11": "Bar_time_signature_2/8", + "12": "Bar_time_signature_3/2", + "13": "Bar_time_signature_3/4", + "14": "Bar_time_signature_3/8", + "15": "Bar_time_signature_4/2", + "16": "Bar_time_signature_4/4", + "17": "Bar_time_signature_4/8", + "18": "Bar_time_signature_5/4", + "19": "Bar_time_signature_5/8", + "20": "Bar_time_signature_6/4", + "21": "Bar_time_signature_6/8", + "22": "Bar_time_signature_7/4", + "23": "Bar_time_signature_7/8", + "24": "Bar_time_signature_8/4", 
+ "25": "Bar_time_signature_8/8", + "26": "Bar_time_signature_9/8", + "27": "Beat_0", + "28": "Beat_1", + "29": "Beat_2", + "30": "Beat_3", + "31": "Beat_4", + "32": "Beat_5", + "33": "Beat_6", + "34": "Beat_7", + "35": "Beat_8", + "36": "Beat_9", + "37": "Beat_10", + "38": "Beat_11", + "39": "Beat_12", + "40": "Beat_13", + "41": "Beat_14", + "42": "Beat_15", + "43": "Beat_16", + "44": "Beat_17", + "45": "Beat_18", + "46": "Beat_19", + "47": "Beat_20", + "48": "Beat_21", + "49": "Beat_22", + "50": "Beat_23", + "51": "Beat_24", + "52": "Beat_25", + "53": "Beat_26", + "54": "Beat_27", + "55": "Beat_28", + "56": "Beat_29", + "57": "Beat_30", + "58": "Beat_31", + "59": "Beat_32", + "60": "Beat_33", + "61": "Beat_34", + "62": "Beat_35", + "63": "Beat_36", + "64": "Beat_37", + "65": "Beat_38", + "66": "Beat_39", + "67": "Beat_40", + "68": "Beat_41", + "69": "Beat_42", + "70": "Beat_43", + "71": "Beat_44", + "72": "Beat_45", + "73": "Beat_46", + "74": "Beat_47", + "75": "Beat_48", + "76": "Beat_49", + "77": "Beat_50", + "78": "Beat_51", + "79": "Beat_52", + "80": "Beat_53", + "81": "Beat_54", + "82": "Beat_55", + "83": "Beat_56", + "84": "Beat_57", + "85": "Beat_58", + "86": "Beat_59", + "87": "Beat_60", + "88": "Beat_61", + "89": "Beat_62", + "90": "Beat_63", + "91": "Beat_64", + "92": "Beat_65", + "93": "Beat_66", + "94": "Beat_67", + "95": "Beat_68", + "96": "Beat_69", + "97": "Beat_70", + "98": "Beat_71", + "99": "Beat_72", + "100": "Beat_73", + "101": "Beat_74", + "102": "Beat_75", + "103": "Beat_76", + "104": "Beat_77", + "105": "Beat_78", + "106": "Beat_79", + "107": "Beat_80", + "108": "Beat_81", + "109": "Beat_82", + "110": "Beat_83", + "111": "Beat_84", + "112": "Beat_85", + "113": "Beat_86", + "114": "Beat_87", + "115": "Beat_88", + "116": "Beat_89", + "117": "Beat_90", + "118": "Beat_91", + "119": "Beat_92", + "120": "Beat_93", + "121": "Beat_94", + "122": "Beat_95", + "123": "Note_Duration_1", + "124": "Note_Duration_12", + "125": "Note_Duration_15", + "126": "Note_Duration_18", + "127": "Note_Duration_2", + "128": "Note_Duration_24", + "129": "Note_Duration_3", + "130": "Note_Duration_30", + "131": "Note_Duration_36", + "132": "Note_Duration_4", + "133": "Note_Duration_42", + "134": "Note_Duration_48", + "135": "Note_Duration_54", + "136": "Note_Duration_6", + "137": "Note_Duration_60", + "138": "Note_Duration_72", + "139": "Note_Duration_84", + "140": "Note_Duration_9", + "141": "Note_Duration_96", + "142": "Note_Velocity_100", + "143": "Note_Velocity_120", + "144": "Note_Velocity_40", + "145": "Note_Velocity_60", + "146": "Note_Velocity_80", + "147": "Tempo_100", + "148": "Tempo_103", + "149": "Tempo_104", + "150": "Tempo_108", + "151": "Tempo_112", + "152": "Tempo_113", + "153": "Tempo_116", + "154": "Tempo_121", + "155": "Tempo_124", + "156": "Tempo_126", + "157": "Tempo_131", + "158": "Tempo_136", + "159": "Tempo_141", + "160": "Tempo_147", + "161": "Tempo_150", + "162": "Tempo_153", + "163": "Tempo_159", + "164": "Tempo_165", + "165": "Tempo_172", + "166": "Tempo_179", + "167": "Tempo_182", + "168": "Tempo_186", + "169": "Tempo_193", + "170": "Tempo_200", + "171": "Tempo_201", + "172": "Tempo_209", + "173": "Tempo_217", + "174": "Tempo_220", + "175": "Tempo_226", + "176": "Tempo_235", + "177": "Tempo_242", + "178": "Tempo_244", + "179": "Tempo_254", + "180": "Tempo_264", + "181": "Tempo_266", + "182": "Tempo_275", + "183": "Tempo_286", + "184": "Tempo_293", + "185": "Tempo_297", + "186": "Tempo_30", + "187": "Tempo_309", + "188": "Tempo_31", + "189": "Tempo_32", + "190": 
"Tempo_321", + "191": "Tempo_322", + "192": "Tempo_33", + "193": "Tempo_334", + "194": "Tempo_34", + "195": "Tempo_347", + "196": "Tempo_35", + "197": "Tempo_354", + "198": "Tempo_36", + "199": "Tempo_361", + "200": "Tempo_37", + "201": "Tempo_375", + "202": "Tempo_38", + "203": "Tempo_389", + "204": "Tempo_390", + "205": "Tempo_40", + "206": "Tempo_42", + "207": "Tempo_44", + "208": "Tempo_46", + "209": "Tempo_48", + "210": "Tempo_50", + "211": "Tempo_52", + "212": "Tempo_53", + "213": "Tempo_54", + "214": "Tempo_56", + "215": "Tempo_58", + "216": "Tempo_60", + "217": "Tempo_62", + "218": "Tempo_64", + "219": "Tempo_67", + "220": "Tempo_70", + "221": "Tempo_73", + "222": "Tempo_76", + "223": "Tempo_77", + "224": "Tempo_79", + "225": "Tempo_82", + "226": "Tempo_85", + "227": "Tempo_88", + "228": "Tempo_92", + "229": "Tempo_94", + "230": "Tempo_96", + "231": "Note_Pitch_6", + "232": "Note_Pitch_7", + "233": "Note_Pitch_8", + "234": "Note_Pitch_9", + "235": "Note_Pitch_10", + "236": "Note_Pitch_11", + "237": "Note_Pitch_12", + "238": "Note_Pitch_13", + "239": "Note_Pitch_14", + "240": "Note_Pitch_15", + "241": "Note_Pitch_16", + "242": "Note_Pitch_17", + "243": "Note_Pitch_18", + "244": "Note_Pitch_19", + "245": "Note_Pitch_20", + "246": "Note_Pitch_21", + "247": "Note_Pitch_22", + "248": "Note_Pitch_23", + "249": "Note_Pitch_24", + "250": "Note_Pitch_25", + "251": "Note_Pitch_26", + "252": "Note_Pitch_27", + "253": "Note_Pitch_28", + "254": "Note_Pitch_29", + "255": "Note_Pitch_30", + "256": "Note_Pitch_31", + "257": "Note_Pitch_32", + "258": "Note_Pitch_33", + "259": "Note_Pitch_34", + "260": "Note_Pitch_35", + "261": "Note_Pitch_36", + "262": "Note_Pitch_37", + "263": "Note_Pitch_38", + "264": "Note_Pitch_39", + "265": "Note_Pitch_40", + "266": "Note_Pitch_41", + "267": "Note_Pitch_42", + "268": "Note_Pitch_43", + "269": "Note_Pitch_44", + "270": "Note_Pitch_45", + "271": "Note_Pitch_46", + "272": "Note_Pitch_47", + "273": "Note_Pitch_48", + "274": "Note_Pitch_49", + "275": "Note_Pitch_50", + "276": "Note_Pitch_51", + "277": "Note_Pitch_52", + "278": "Note_Pitch_53", + "279": "Note_Pitch_54", + "280": "Note_Pitch_55", + "281": "Note_Pitch_56", + "282": "Note_Pitch_57", + "283": "Note_Pitch_58", + "284": "Note_Pitch_59", + "285": "Note_Pitch_60", + "286": "Note_Pitch_61", + "287": "Note_Pitch_62", + "288": "Note_Pitch_63", + "289": "Note_Pitch_64", + "290": "Note_Pitch_65", + "291": "Note_Pitch_66", + "292": "Note_Pitch_67", + "293": "Note_Pitch_68", + "294": "Note_Pitch_69", + "295": "Note_Pitch_70", + "296": "Note_Pitch_71", + "297": "Note_Pitch_72", + "298": "Note_Pitch_73", + "299": "Note_Pitch_74", + "300": "Note_Pitch_75", + "301": "Note_Pitch_76", + "302": "Note_Pitch_77", + "303": "Note_Pitch_78", + "304": "Note_Pitch_79", + "305": "Note_Pitch_80", + "306": "Note_Pitch_81", + "307": "Note_Pitch_82", + "308": "Note_Pitch_83", + "309": "Note_Pitch_84", + "310": "Note_Pitch_85", + "311": "Note_Pitch_86", + "312": "Note_Pitch_87", + "313": "Note_Pitch_88", + "314": "Note_Pitch_89", + "315": "Note_Pitch_90", + "316": "Note_Pitch_91", + "317": "Note_Pitch_92", + "318": "Note_Pitch_93", + "319": "Note_Pitch_94", + "320": "Note_Pitch_95", + "321": "Note_Pitch_96", + "322": "Note_Pitch_97", + "323": "Note_Pitch_98", + "324": "Note_Pitch_99", + "325": "Note_Pitch_100", + "326": "Note_Pitch_101", + "327": "Note_Pitch_102", + "328": "Note_Pitch_103", + "329": "Note_Pitch_104", + "330": "Note_Pitch_105", + "331": "Note_Pitch_106", + "332": "Note_Pitch_107", + "333": "Note_Pitch_108", + "334": 
"Note_Pitch_109", + "335": "Note_Pitch_110", + "336": "Note_Pitch_111", + "337": "Note_Pitch_112", + "338": "Note_Pitch_113", + "339": "Note_Pitch_114", + "340": "Note_Pitch_115", + "341": "Note_Pitch_116", + "342": "Note_Pitch_117", + "343": "Note_Pitch_118", + "344": "Note_Pitch_119", + "345": "Note_Pitch_120", + "346": "Note_Pitch_121", + "347": "Note_Pitch_122", + "348": "Note_Pitch_123", + "349": "Note_Pitch_124", + "350": "Instrument_0", + "351": "Instrument_4", + "352": "Instrument_6", + "353": "Instrument_7", + "354": "Instrument_8", + "355": "Instrument_9", + "356": "Instrument_11", + "357": "Instrument_12", + "358": "Instrument_13", + "359": "Instrument_14", + "360": "Instrument_15", + "361": "Instrument_16", + "362": "Instrument_19", + "363": "Instrument_21", + "364": "Instrument_22", + "365": "Instrument_23", + "366": "Instrument_24", + "367": "Instrument_25", + "368": "Instrument_26", + "369": "Instrument_32", + "370": "Instrument_33", + "371": "Instrument_36", + "372": "Instrument_38", + "373": "Instrument_40", + "374": "Instrument_41", + "375": "Instrument_42", + "376": "Instrument_43", + "377": "Instrument_46", + "378": "Instrument_47", + "379": "Instrument_49", + "380": "Instrument_50", + "381": "Instrument_52", + "382": "Instrument_55", + "383": "Instrument_56", + "384": "Instrument_57", + "385": "Instrument_58", + "386": "Instrument_60", + "387": "Instrument_61", + "388": "Instrument_62", + "389": "Instrument_64", + "390": "Instrument_65", + "391": "Instrument_66", + "392": "Instrument_67", + "393": "Instrument_68", + "394": "Instrument_69", + "395": "Instrument_70", + "396": "Instrument_71", + "397": "Instrument_72", + "398": "Instrument_73", + "399": "Instrument_74", + "400": "Instrument_75", + "401": "Instrument_79", + "402": "Instrument_80", + "403": "Instrument_88", + "404": "Instrument_105", + "405": "Instrument_108", + "406": "Instrument_109", + "407": "Instrument_111", + "408": "Instrument_114", + "409": "Instrument_117", + "410": "Instrument_118", + "411": "Chord_A_+", + "412": "Chord_A#_+", + "413": "Chord_B_+", + "414": "Chord_C_+", + "415": "Chord_C#_+", + "416": "Chord_D_+", + "417": "Chord_D#_+", + "418": "Chord_E_+", + "419": "Chord_F_+", + "420": "Chord_F#_+", + "421": "Chord_G_+", + "422": "Chord_G#_+", + "423": "Chord_A_/o7", + "424": "Chord_A#_/o7", + "425": "Chord_B_/o7", + "426": "Chord_C_/o7", + "427": "Chord_C#_/o7", + "428": "Chord_D_/o7", + "429": "Chord_D#_/o7", + "430": "Chord_E_/o7", + "431": "Chord_F_/o7", + "432": "Chord_F#_/o7", + "433": "Chord_G_/o7", + "434": "Chord_G#_/o7", + "435": "Chord_A_7", + "436": "Chord_A#_7", + "437": "Chord_B_7", + "438": "Chord_C_7", + "439": "Chord_C#_7", + "440": "Chord_D_7", + "441": "Chord_D#_7", + "442": "Chord_E_7", + "443": "Chord_F_7", + "444": "Chord_F#_7", + "445": "Chord_G_7", + "446": "Chord_G#_7", + "447": "Chord_A_M", + "448": "Chord_A#_M", + "449": "Chord_B_M", + "450": "Chord_C_M", + "451": "Chord_C#_M", + "452": "Chord_D_M", + "453": "Chord_D#_M", + "454": "Chord_E_M", + "455": "Chord_F_M", + "456": "Chord_F#_M", + "457": "Chord_G_M", + "458": "Chord_G#_M", + "459": "Chord_A_M7", + "460": "Chord_A#_M7", + "461": "Chord_B_M7", + "462": "Chord_C_M7", + "463": "Chord_C#_M7", + "464": "Chord_D_M7", + "465": "Chord_D#_M7", + "466": "Chord_E_M7", + "467": "Chord_F_M7", + "468": "Chord_F#_M7", + "469": "Chord_G_M7", + "470": "Chord_G#_M7", + "471": "Chord_A_m", + "472": "Chord_A#_m", + "473": "Chord_B_m", + "474": "Chord_C_m", + "475": "Chord_C#_m", + "476": "Chord_D_m", + "477": "Chord_D#_m", + 
"478": "Chord_E_m", + "479": "Chord_F_m", + "480": "Chord_F#_m", + "481": "Chord_G_m", + "482": "Chord_G#_m", + "483": "Chord_A_m7", + "484": "Chord_A#_m7", + "485": "Chord_B_m7", + "486": "Chord_C_m7", + "487": "Chord_C#_m7", + "488": "Chord_D_m7", + "489": "Chord_D#_m7", + "490": "Chord_E_m7", + "491": "Chord_F_m7", + "492": "Chord_F#_m7", + "493": "Chord_G_m7", + "494": "Chord_G#_m7", + "495": "Chord_A_o", + "496": "Chord_A#_o", + "497": "Chord_B_o", + "498": "Chord_C_o", + "499": "Chord_C#_o", + "500": "Chord_D_o", + "501": "Chord_D#_o", + "502": "Chord_E_o", + "503": "Chord_F_o", + "504": "Chord_F#_o", + "505": "Chord_G_o", + "506": "Chord_G#_o", + "507": "Chord_A_o7", + "508": "Chord_A#_o7", + "509": "Chord_B_o7", + "510": "Chord_C_o7", + "511": "Chord_C#_o7", + "512": "Chord_D_o7", + "513": "Chord_D#_o7", + "514": "Chord_E_o7", + "515": "Chord_F_o7", + "516": "Chord_F#_o7", + "517": "Chord_G_o7", + "518": "Chord_G#_o7", + "519": "Chord_A_sus2", + "520": "Chord_A#_sus2", + "521": "Chord_B_sus2", + "522": "Chord_C_sus2", + "523": "Chord_C#_sus2", + "524": "Chord_D_sus2", + "525": "Chord_D#_sus2", + "526": "Chord_E_sus2", + "527": "Chord_F_sus2", + "528": "Chord_F#_sus2", + "529": "Chord_G_sus2", + "530": "Chord_G#_sus2", + "531": "Chord_A_sus4", + "532": "Chord_A#_sus4", + "533": "Chord_B_sus4", + "534": "Chord_C_sus4", + "535": "Chord_C#_sus4", + "536": "Chord_D_sus4", + "537": "Chord_D#_sus4", + "538": "Chord_E_sus4", + "539": "Chord_F_sus4", + "540": "Chord_F#_sus4", + "541": "Chord_G_sus4", + "542": "Chord_G#_sus4", + "543": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json b/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": 
"Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": 
"Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + 
"44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": 
"NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": 
"Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": 
"Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": 
"Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json new file mode 100644 index 0000000..c3a2fac --- /dev/null +++ b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json @@ -0,0 +1,378 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + 
"55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_32", + "2": "Tempo_35", + "3": "Tempo_38", + "4": "Tempo_40", + "5": "Tempo_44", + "6": "Tempo_46", + "7": "Tempo_50", + "8": "Tempo_54", + "9": "Tempo_56", + "10": "Tempo_60", + "11": "Tempo_62", + "12": "Tempo_64", + "13": "Tempo_67", + "14": "Tempo_70", + "15": "Tempo_73", + "16": "Tempo_76", + "17": "Tempo_79", + "18": "Tempo_82", + "19": "Tempo_85", + "20": "Tempo_88", + "21": "Tempo_92", + "22": "Tempo_96", + "23": "Tempo_100", + "24": "Tempo_104", + "25": "Tempo_108", + "26": "Tempo_112", + "27": "Tempo_116", + "28": "Tempo_121", + "29": "Tempo_126", + "30": "Tempo_131", + "31": "Tempo_136", + "32": "Tempo_141", + "33": "Tempo_147", + "34": "Tempo_153", + "35": "Tempo_159", + "36": "Tempo_165", + "37": "Tempo_172", + "38": "Tempo_179", + "39": "Tempo_186", + "40": "Tempo_193", + "41": "Tempo_201", + "42": "Tempo_209", + "43": "Tempo_217", + "44": "Tempo_226" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_12", + "3": "Instrument_16", + "4": "Instrument_25", + "5": "Instrument_32", + "6": "Instrument_40", + "7": "Instrument_46", + "8": "Instrument_48", + "9": "Instrument_56", + "10": "Instrument_58", + "11": "Instrument_66", + "12": "Instrument_73", + "13": "Instrument_80", + "14": "Instrument_88", + "15": "Instrument_104", + "16": "Instrument_107", + "17": "Instrument_114" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": 
"Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json b/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json new file mode 100644 index 0000000..085da32 --- /dev/null +++ b/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json @@ -0,0 +1,311 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": 
"SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_121" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": 
"Note_Pitch_11", + "2": "Note_Pitch_12", + "3": "Note_Pitch_13", + "4": "Note_Pitch_14", + "5": "Note_Pitch_15", + "6": "Note_Pitch_16", + "7": "Note_Pitch_17", + "8": "Note_Pitch_18", + "9": "Note_Pitch_19", + "10": "Note_Pitch_20", + "11": "Note_Pitch_21", + "12": "Note_Pitch_22", + "13": "Note_Pitch_23", + "14": "Note_Pitch_24", + "15": "Note_Pitch_25", + "16": "Note_Pitch_26", + "17": "Note_Pitch_27", + "18": "Note_Pitch_28", + "19": "Note_Pitch_29", + "20": "Note_Pitch_30", + "21": "Note_Pitch_31", + "22": "Note_Pitch_32", + "23": "Note_Pitch_33", + "24": "Note_Pitch_34", + "25": "Note_Pitch_35", + "26": "Note_Pitch_36", + "27": "Note_Pitch_37", + "28": "Note_Pitch_38", + "29": "Note_Pitch_39", + "30": "Note_Pitch_40", + "31": "Note_Pitch_41", + "32": "Note_Pitch_42", + "33": "Note_Pitch_43", + "34": "Note_Pitch_44", + "35": "Note_Pitch_45", + "36": "Note_Pitch_46", + "37": "Note_Pitch_47", + "38": "Note_Pitch_48", + "39": "Note_Pitch_49", + "40": "Note_Pitch_50", + "41": "Note_Pitch_51", + "42": "Note_Pitch_52", + "43": "Note_Pitch_53", + "44": "Note_Pitch_54", + "45": "Note_Pitch_55", + "46": "Note_Pitch_56", + "47": "Note_Pitch_57", + "48": "Note_Pitch_58", + "49": "Note_Pitch_59", + "50": "Note_Pitch_60", + "51": "Note_Pitch_61", + "52": "Note_Pitch_62", + "53": "Note_Pitch_63", + "54": "Note_Pitch_64", + "55": "Note_Pitch_65", + "56": "Note_Pitch_66", + "57": "Note_Pitch_67", + "58": "Note_Pitch_68", + "59": "Note_Pitch_69", + "60": "Note_Pitch_70", + "61": "Note_Pitch_71", + "62": "Note_Pitch_72", + "63": "Note_Pitch_73", + "64": "Note_Pitch_74", + "65": "Note_Pitch_75", + "66": "Note_Pitch_76", + "67": "Note_Pitch_77", + "68": "Note_Pitch_78", + "69": "Note_Pitch_79", + "70": "Note_Pitch_80", + "71": "Note_Pitch_81", + "72": "Note_Pitch_82", + "73": "Note_Pitch_83", + "74": "Note_Pitch_84", + "75": "Note_Pitch_85", + "76": "Note_Pitch_86", + "77": "Note_Pitch_87", + "78": "Note_Pitch_88", + "79": "Note_Pitch_89", + "80": "Note_Pitch_90", + "81": "Note_Pitch_91", + "82": "Note_Pitch_92", + "83": "Note_Pitch_93", + "84": "Note_Pitch_94", + "85": "Note_Pitch_95", + "86": "Note_Pitch_96", + "87": "Note_Pitch_97", + "88": "Note_Pitch_98", + "89": "Note_Pitch_99", + "90": "Note_Pitch_100", + "91": "Note_Pitch_101", + "92": "Note_Pitch_102", + "93": "Note_Pitch_103", + "94": "Note_Pitch_104", + "95": "Note_Pitch_105", + "96": "Note_Pitch_106", + "97": "Note_Pitch_107", + "98": "Note_Pitch_108", + "99": "Note_Pitch_109", + "100": "Note_Pitch_110", + "101": "Note_Pitch_111", + "102": "Note_Pitch_112", + "103": "Note_Pitch_113", + "104": "Note_Pitch_114", + "105": "Note_Pitch_115", + "106": "Note_Pitch_116", + "107": "Note_Pitch_117", + "108": "Note_Pitch_118", + "109": "Note_Pitch_119" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json b/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ 
b/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + 
"89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + 
"42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": 
"Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json b/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + 
"0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + 
"131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + 
"16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json b/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + 
"0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + 
"93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": 
"Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": 
"Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json b/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": 
"Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": 
"Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": 
"Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": 
"Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json b/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json new file mode 100644 index 0000000..2a1adbb --- /dev/null +++ b/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json @@ -0,0 +1,493 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", 
+ "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_54", + "18": "Tempo_56", + "19": "Tempo_58", + "20": "Tempo_60", + "21": "Tempo_62", + "22": "Tempo_64", + "23": "Tempo_67", + "24": "Tempo_70", + "25": "Tempo_73", + "26": "Tempo_76", + "27": "Tempo_79", + "28": "Tempo_82", + "29": "Tempo_85", + "30": "Tempo_88", + "31": "Tempo_92", + "32": "Tempo_96", + "33": "Tempo_100", + "34": "Tempo_104", + "35": "Tempo_108", + "36": "Tempo_112", + "37": "Tempo_116", + "38": "Tempo_121", + "39": "Tempo_126", + "40": "Tempo_131", + "41": "Tempo_136", + "42": "Tempo_141", + "43": "Tempo_147", + "44": "Tempo_153", + "45": "Tempo_159", + "46": "Tempo_165", + 
"47": "Tempo_172", + "48": "Tempo_179", + "49": "Tempo_186", + "50": "Tempo_193", + "51": "Tempo_201", + "52": "Tempo_209", + "53": "Tempo_217", + "54": "Tempo_226", + "55": "Tempo_235", + "56": "Tempo_244", + "57": "Tempo_254", + "58": "Tempo_264", + "59": "Tempo_275", + "60": "Tempo_286", + "61": "Tempo_297", + "62": "Tempo_309", + "63": "Tempo_321", + "64": "Tempo_334", + "65": "Tempo_347", + "66": "Tempo_361", + "67": "Tempo_375", + "68": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + 
"58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file